diff --git a/debian/control b/debian/control index bb38046c738ec..ba57836a894e6 100644 --- a/debian/control +++ b/debian/control @@ -985,6 +985,18 @@ Description: Amazon S3 archival storage engine for MariaDB third-party public or private cloud that implements S3 API), but still have them accessible in MariaDB in read-only mode. +Package: mariadb-plugin-tidesdb +Architecture: any +Depends: mariadb-server (= ${server:Version}), + ${misc:Depends}, + ${shlibs:Depends} +Description: TidesDB storage engine for MariaDB server + TidesDB is an LSM-tree storage engine with ACID transactions, MVCC, + configurable compression (zstd/lz4/snappy), per-row and table TTL, + full-text, spatial and vector indexes, an optional B+-tree mode, and + an optional S3-compatible object store backend. + This package contains the TidesDB plugin for MariaDB server. + Package: mariadb-plugin-rocksdb Architecture: amd64 arm64 mips64el ppc64el riscv64 Depends: mariadb-server (= ${server:Version}), diff --git a/debian/mariadb-plugin-tidesdb.install b/debian/mariadb-plugin-tidesdb.install new file mode 100644 index 0000000000000..b7c13c10d854b --- /dev/null +++ b/debian/mariadb-plugin-tidesdb.install @@ -0,0 +1,2 @@ +etc/mysql/mariadb.conf.d/tidesdb.cnf +usr/lib/mysql/plugin/ha_tidesdb.so diff --git a/mysql-test/suite/tidesdb/include/cleanup_tidesdb.inc b/mysql-test/suite/tidesdb/include/cleanup_tidesdb.inc new file mode 100644 index 0000000000000..52dc8aeef4689 --- /dev/null +++ b/mysql-test/suite/tidesdb/include/cleanup_tidesdb.inc @@ -0,0 +1,3 @@ +disable_query_log; +ALTER DATABASE test DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_uca1400_ai_ci; +enable_query_log; diff --git a/mysql-test/suite/tidesdb/include/have_tidesdb.inc b/mysql-test/suite/tidesdb/include/have_tidesdb.inc new file mode 100644 index 0000000000000..353ae597297a9 --- /dev/null +++ b/mysql-test/suite/tidesdb/include/have_tidesdb.inc @@ -0,0 +1,7 @@ +disable_query_log; +--error 0,1286 +eval SET @@default_storage_engine = TidesDB; +ALTER DATABASE test 
DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci; +call mtr.add_suppression("Plugin 'TIDESDB' is of maturity level gamma"); +call mtr.add_suppression("TIDESDB: hton_commit: tidesdb_txn_commit returned"); +enable_query_log; diff --git a/mysql-test/suite/tidesdb/include/have_tidesdb_vector.inc b/mysql-test/suite/tidesdb/include/have_tidesdb_vector.inc new file mode 100644 index 0000000000000..a29ee8a038fc8 --- /dev/null +++ b/mysql-test/suite/tidesdb/include/have_tidesdb_vector.inc @@ -0,0 +1,15 @@ +# Skip the test unless the server supports the VECTOR data type (MariaDB 11.7+). +# TidesDB itself implements vector indexes, but the SQL-layer VECTOR type does +# not exist on older servers, so CREATE TABLE ... VECTOR fails there. +--disable_query_log +--disable_warnings +--error 0,ER_UNKNOWN_DATA_TYPE,ER_PARSE_ERROR,ER_NOT_SUPPORTED_YET +CREATE TEMPORARY TABLE tdb_vector_probe (v VECTOR(4) NOT NULL); +let $tdb_no_vector= $mysql_errno; +DROP TEMPORARY TABLE IF EXISTS tdb_vector_probe; +--enable_warnings +--enable_query_log +if ($tdb_no_vector) +{ + --skip TidesDB: VECTOR data type not supported (requires MariaDB 11.7+) +} diff --git a/mysql-test/suite/tidesdb/r/tidesdb_alter_large_table.result b/mysql-test/suite/tidesdb/r/tidesdb_alter_large_table.result new file mode 100644 index 0000000000000..b0ede22e93ccb --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_alter_large_table.result @@ -0,0 +1,54 @@ +# +# Large-table ALTER under REPEATABLE_READ. +# +# Copy-phase ALTER scans every row of the source table into the +# rebuilt table while a single REPEATABLE_READ transaction is open +# (autocommit=0 forces this), so the engine must keep the read-set +# bookkeeping bounded as the scan grows. Unbounded growth here +# used to crash the server inside tidesdb_txn_add_to_read_set. +# The test asserts that the scan completes, the rebuild commits, +# and the row count is preserved. 
+# +CREATE TABLE t_alter_big ( +a INT AUTO_INCREMENT PRIMARY KEY, +b INT +) ENGINE=TidesDB; +INSERT INTO t_alter_big (a, b) VALUES (DEFAULT, 10), (DEFAULT, 20), (DEFAULT, 30); +# Double the rows repeatedly to get ~100K rows +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; +SELECT COUNT(*) FROM t_alter_big; +COUNT(*) +98304 +# autocommit=0 makes the surrounding session use REPEATABLE_READ, +# which is the isolation that loaded the read-set during ALTER. +SET autocommit=0; +# Sanity-check ALTER's error reporting on contradictory key DDL. +ALTER TABLE t_alter_big ADD PRIMARY KEY (a); +ERROR 42000: Multiple primary key defined +ALTER TABLE t_alter_big DROP PRIMARY KEY; +ERROR 42000: Incorrect table definition; there can be only one auto column and it must be defined as a key +# Copy-based ALTER over ~100K rows under REPEATABLE_READ. Must +# complete cleanly without exhausting memory or crashing the +# server in the read-set machinery. +ALTER TABLE t_alter_big DROP PRIMARY KEY, CHANGE a a INT; +SELECT COUNT(*) FROM t_alter_big; +COUNT(*) +98304 +SET autocommit=1; +DROP TABLE t_alter_big; +# +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_analyze.result b/mysql-test/suite/tidesdb/r/tidesdb_analyze.result new file mode 100644 index 0000000000000..efb6a0e3c25f3 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_analyze.result @@ -0,0 +1,47 @@ +# +# ANALYZE TABLE for TidesDB -- verifies CF stats output +# +CREATE TABLE t1 ( +id INT PRIMARY KEY, +val VARCHAR(40), +KEY idx_val (val) +) ENGINE=TidesDB; +INSERT INTO t1 VALUES (1, 'alpha'), (2, 'bravo'), (3, 'charlie'), +(4, 'delta'), (5, 'echo'), (6, 'foxtrot'); +# ANALYZE TABLE should return status OK and emit CF stats as notes. +# Mask volatile numeric values (memtable size, avg sizes, etc.) +ANALYZE TABLE t1; +Table Op Msg_type Msg_text +test.t1 analyze status Engine-independent statistics collected +test.t1 analyze Note [TIDESDB] CF 'test__t1' total_keys=N data_size=N bytes memtable=N bytes levels=1 read_amp=N cache_hit=N% +test.t1 analyze Note [TIDESDB] avg_key=N bytes avg_value=N bytes +test.t1 analyze Note [TIDESDB] level 1 sstables=N size=N bytes keys=N +test.t1 analyze Note [TIDESDB] idx CF 'test__t1__idx_idx_val' keys=N data_size=N bytes levels=1 +test.t1 analyze Note [TIDESDB] idx 'idx_val' sampled=6 distinct=6 rec_per_key=1 +test.t1 analyze status OK +# ANALYZE a table without secondary indexes +CREATE TABLE t2 ( +id INT PRIMARY KEY, +data VARCHAR(200) +) ENGINE=TidesDB; +INSERT INTO t2 VALUES (1, REPEAT('x', 100)), (2, REPEAT('y', 100)); +ANALYZE TABLE t2; +Table Op Msg_type Msg_text +test.t2 analyze status Engine-independent statistics collected +test.t2 analyze Note [TIDESDB] CF 'test__t2' total_keys=N data_size=N bytes memtable=N bytes levels=1 read_amp=N cache_hit=N% +test.t2 analyze Note [TIDESDB] avg_key=N bytes avg_value=N bytes +test.t2 analyze Note [TIDESDB] level 1 sstables=N size=N bytes keys=N +test.t2 analyze status OK +# ANALYZE an empty table +CREATE TABLE t3 ( +id INT PRIMARY KEY +) ENGINE=TidesDB; +ANALYZE TABLE t3; +Table Op Msg_type Msg_text +test.t3 analyze status 
Engine-independent statistics collected +test.t3 analyze Note [TIDESDB] CF 'test__t3' total_keys=N data_size=N bytes memtable=N bytes levels=1 read_amp=N cache_hit=N% +test.t3 analyze Note [TIDESDB] avg_key=N bytes avg_value=N bytes +test.t3 analyze Note [TIDESDB] level 1 sstables=N size=N bytes keys=N +test.t3 analyze status OK +# Cleanup +DROP TABLE t1, t2, t3; diff --git a/mysql-test/suite/tidesdb/r/tidesdb_auto_increment.result b/mysql-test/suite/tidesdb/r/tidesdb_auto_increment.result new file mode 100644 index 0000000000000..df9e3826d5f94 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_auto_increment.result @@ -0,0 +1,112 @@ +# +# TEST 1: Basic auto-increment +# +CREATE TABLE t_ai (id INT AUTO_INCREMENT PRIMARY KEY, v VARCHAR(50)) ENGINE=TidesDB; +INSERT INTO t_ai (v) VALUES ('a'), ('b'), ('c'); +SELECT * FROM t_ai ORDER BY id; +id v +1 a +2 b +3 c +# +# TEST 2: Explicit value larger than counter +# +INSERT INTO t_ai VALUES (100, 'explicit'); +INSERT INTO t_ai (v) VALUES ('after_explicit'); +SELECT * FROM t_ai ORDER BY id; +id v +1 a +2 b +3 c +100 explicit +101 after_explicit +# +# TEST 3: Gap after rollback +# +BEGIN; +INSERT INTO t_ai (v) VALUES ('will_rollback'); +SELECT MAX(id) FROM t_ai; +MAX(id) +102 +ROLLBACK; +INSERT INTO t_ai (v) VALUES ('after_rollback'); +SELECT id, v FROM t_ai WHERE v IN ('after_rollback', 'after_explicit') ORDER BY id; +id v +101 after_explicit +103 after_rollback +# +# TEST 4: LAST_INSERT_ID +# +INSERT INTO t_ai (v) VALUES ('last_id_test'); +SELECT LAST_INSERT_ID() > 0 AS has_last_id; +has_last_id +1 +# +# TEST 5: Auto-increment with REPLACE INTO +# +CREATE TABLE t_ai_replace ( +id INT AUTO_INCREMENT PRIMARY KEY, +name VARCHAR(50) UNIQUE +) ENGINE=TidesDB; +INSERT INTO t_ai_replace (name) VALUES ('x'), ('y'), ('z'); +REPLACE INTO t_ai_replace (name) VALUES ('y'); +SELECT * FROM t_ai_replace ORDER BY name; +id name +1 x +4 y +3 z +# +# TEST 5b: an auto-increment PK must not bypass the UNIQUE secondary check +# +INSERT 
INTO t_ai_replace (name) VALUES ('z'); +ERROR 23000: Duplicate entry 'z' for key 'name' +INSERT INTO t_ai_replace (name) VALUES ('x') +ON DUPLICATE KEY UPDATE name = 'x2'; +SELECT * FROM t_ai_replace ORDER BY name; +id name +1 x2 +4 y +3 z +# no value may appear twice in the UNIQUE column +SELECT name, COUNT(*) AS c FROM t_ai_replace GROUP BY name HAVING c > 1; +name c +# +# TEST 6: BIGINT auto-increment +# +CREATE TABLE t_ai_big (id BIGINT AUTO_INCREMENT PRIMARY KEY, v INT) ENGINE=TidesDB; +INSERT INTO t_ai_big (v) VALUES (1), (2), (3); +INSERT INTO t_ai_big VALUES (9999999999, 4); +INSERT INTO t_ai_big (v) VALUES (5); +SELECT * FROM t_ai_big ORDER BY id; +id v +1 1 +2 2 +3 3 +9999999999 4 +10000000000 5 +# +# TEST 7: Auto-increment after TRUNCATE resets counter +# +TRUNCATE TABLE t_ai; +INSERT INTO t_ai (v) VALUES ('fresh_start'); +SELECT * FROM t_ai; +id v +1 fresh_start +# +# TEST 8: ALTER TABLE ... AUTO_INCREMENT=N takes effect +# +CREATE TABLE t_ai_alter (id INT AUTO_INCREMENT PRIMARY KEY, v VARCHAR(10)) ENGINE=TidesDB; +INSERT INTO t_ai_alter (v) VALUES ('a'), ('b'); +ALTER TABLE t_ai_alter AUTO_INCREMENT=1000; +INSERT INTO t_ai_alter (v) VALUES ('jumped'); +SELECT * FROM t_ai_alter ORDER BY id; +id v +1 a +2 b +1000 jumped +DROP TABLE t_ai_alter; +# +# Cleanup +# +DROP TABLE t_ai, t_ai_replace, t_ai_big; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_backup.result b/mysql-test/suite/tidesdb/r/tidesdb_backup.result new file mode 100644 index 0000000000000..01970d5438b55 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_backup.result @@ -0,0 +1,73 @@ +CALL mtr.add_suppression("\\[TIDESDB\\] Backup to .* failed"); +# +# ============================================ +# TEST 1: Online backup creates a valid copy +# ============================================ +# +CREATE TABLE t_backup ( +id INT PRIMARY KEY, +val VARCHAR(100) +) ENGINE=TIDESDB; +INSERT INTO t_backup VALUES (1, 'alpha'), (2, 'beta'), (3, 'gamma'); +SELECT * FROM t_backup ORDER BY id; +id val +1 alpha +2 beta +3 gamma +# Triggering online backup +# Backup should have created the directory +Backup directory exists: YES +# Check that SHOW VARIABLES reflects the backup path +SELECT @@GLOBAL.tidesdb_backup_dir IS NOT NULL AS backup_dir_set; +backup_dir_set +1 +# Insert more data after backup (should NOT appear in backup) +INSERT INTO t_backup VALUES (4, 'delta'), (5, 'epsilon'); +SELECT COUNT(*) AS rows_after FROM t_backup; +rows_after +5 +DROP TABLE t_backup; +# +# ============================================ +# TEST 2: Backup to existing non-empty dir fails +# ============================================ +# +# Re-running backup to same directory should fail (not empty) +SET GLOBAL tidesdb_backup_dir = 'MYSQLTEST_VARDIR/tmp/tidesdb_backup_test'; +ERROR HY000: [TIDESDB] Backup to 'MYSQLTEST_VARDIR/tmp/tidesdb_backup_test' failed (err=-6) +# +# ============================================ +# TEST 3: Clear backup_dir variable +# ============================================ +# +SET GLOBAL tidesdb_backup_dir = ''; +SELECT @@GLOBAL.tidesdb_backup_dir IS NULL AS backup_dir_cleared; +backup_dir_cleared +1 +# +# ============================================ +# TEST 4: Concurrent reads/writes during backup +# ============================================ +# +CREATE TABLE t_concurrent ( +id INT PRIMARY KEY, +data 
VARCHAR(200) +) ENGINE=TIDESDB; +# Inserted 100 rows +SELECT COUNT(*) AS before_backup FROM t_concurrent; +before_backup +100 +# Backup completed while table was loaded +SELECT COUNT(*) AS after_backup FROM t_concurrent; +after_backup +100 +INSERT INTO t_concurrent VALUES (101, 'post-backup'); +SELECT COUNT(*) AS with_post_backup FROM t_concurrent; +with_post_backup +101 +DROP TABLE t_concurrent; +# +# === Cleanup === +# +SET GLOBAL tidesdb_backup_dir = ''; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_bulk_commit_durability.result b/mysql-test/suite/tidesdb/r/tidesdb_bulk_commit_durability.result new file mode 100644 index 0000000000000..00025f3ec6273 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_bulk_commit_durability.result @@ -0,0 +1,36 @@ +DROP TABLE IF EXISTS bulk_src; +DROP TABLE IF EXISTS bulk_dst; +CREATE TABLE bulk_src ( +id INT PRIMARY KEY, +payload VARCHAR(200) +) ENGINE=TIDESDB; +CREATE TABLE bulk_dst ( +id INT PRIMARY KEY, +payload VARCHAR(200) +) ENGINE=TIDESDB; +SELECT COUNT(*) AS src_rows FROM bulk_src; +src_rows +1000 +# +# Run 50 bulk INSERT ... SELECT statements (50,000 rows total). +# Each statement crosses the bulk-commit threshold, exercising +# the maybe_bulk_commit() path that previously swallowed errors. +# +# +# Assertion: every row from every batch must be present. If +# maybe_bulk_commit() ever swallows an inner commit failure again, +# this verdict line will read "LOST rows" instead of "OK". +# +SELECT IF(COUNT(*) = 50000, +'OK', +CONCAT('LOST ', 50000 - COUNT(*), ' rows of 50000')) +AS verdict +FROM bulk_dst; +verdict +OK +SELECT COUNT(*) AS dst_rows, MIN(id) AS min_id, MAX(id) AS max_id FROM bulk_dst; +dst_rows min_id max_id +50000 1 50000 +DROP TABLE bulk_src; +DROP TABLE bulk_dst; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_checkpoint.result b/mysql-test/suite/tidesdb/r/tidesdb_checkpoint.result new file mode 100644 index 0000000000000..39e4aae93d515 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_checkpoint.result @@ -0,0 +1,24 @@ +# +# TEST 1: Create checkpoint +# +CREATE TABLE t_ckpt (id INT PRIMARY KEY, val VARCHAR(100)) ENGINE=TidesDB; +INSERT INTO t_ckpt VALUES (1, 'before_checkpoint'), (2, 'data_two'), (3, 'data_three'); +# +# TEST 3: Data survives after checkpoint +# +INSERT INTO t_ckpt VALUES (4, 'after_checkpoint'); +SELECT * FROM t_ckpt ORDER BY id; +id val +1 before_checkpoint +2 data_two +3 data_three +4 after_checkpoint +# +# TEST 4: Clear checkpoint dir variable +# +SET GLOBAL tidesdb_checkpoint_dir = ''; +# +# Cleanup +# +DROP TABLE t_ckpt; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_concurrent_conflict.result b/mysql-test/suite/tidesdb/r/tidesdb_concurrent_conflict.result new file mode 100644 index 0000000000000..ec009e0da42ca --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_concurrent_conflict.result @@ -0,0 +1,58 @@ +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +# +# Issue #77: Concurrent conflict detection +# +CREATE TABLE t ( +i INT NOT NULL PRIMARY KEY, +x INT +) ENGINE=TidesDB; +INSERT INTO t VALUES (1,10),(2,20),(3,30),(4,40),(5,50); +connect con1, localhost, root,,; +connect con2, localhost, root,,; +# ---- TEST 1: Two UPDATEs on same row ---- +connection con1; +START TRANSACTION; +UPDATE t SET x = 999 WHERE i = 1; +connection con2; +START TRANSACTION; +UPDATE t SET x = 888 WHERE i = 1; +COMMIT; +connection con1; +COMMIT; +Got one of the listed errors +connection default; +# con2 wins: x should be 888 +SELECT * FROM t WHERE i = 1; +i x +1 888 +# ---- TEST 2: UPDATE vs DELETE on same row ---- +connection con1; +START TRANSACTION; +UPDATE t SET x = 777 WHERE i = 2; +connection con2; +START TRANSACTION; +DELETE FROM t WHERE i = 2; +COMMIT; +connection con1; +COMMIT; +Got one 
of the listed errors +connection default; +# con2 wins: row 2 should be gone +SELECT * FROM t WHERE i = 2; +i x +# Remaining rows intact +SELECT * FROM t ORDER BY i; +i x +1 888 +3 30 +4 40 +5 50 +# Cleanup +connection con1; +disconnect con1; +connection con2; +disconnect con2; +connection default; +DROP TABLE t; +# +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_concurrent_errors.result b/mysql-test/suite/tidesdb/r/tidesdb_concurrent_errors.result new file mode 100644 index 0000000000000..2d1c25c524978 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_concurrent_errors.result @@ -0,0 +1,105 @@ +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_LOCKED"); +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_MEMORY_LIMIT"); +call mtr.add_suppression("\\[TIDESDB\\].*unexpected TidesDB error"); +# +# === Setup: sysbench-like schema === +# +CREATE TABLE t1 ( +id INT NOT NULL AUTO_INCREMENT, +k INT NOT NULL DEFAULT 0, +c CHAR(120) NOT NULL DEFAULT '', +pad CHAR(60) NOT NULL DEFAULT '', +PRIMARY KEY (id), +KEY k_1 (k) +) ENGINE=TIDESDB SYNC_MODE='NONE'; +Warnings: +Warning 1105 [TIDESDB] Table SYNC_MODE=NONE governs SSTable file sync only. Under tidesdb_unified_memtable=ON the shared WAL is fsynced according to tidesdb_unified_memtable_sync_mode=FULL, so the table option does not change WAL durability for this table +# +# === Populate: 2000 rows === +# +SELECT COUNT(*) AS row_count FROM t1; +row_count +2000 +# +# ============================================ +# TEST 1: Concurrent oltp_read_write pattern +# 4 connections doing BEGIN...COMMIT with +# interleaved reads + writes on overlapping rows. 
+# Before fix: error 1030 (HA_ERR_GENERIC) +# After fix: error 1213 (deadlock, retryable) +# ============================================ +# +connect c1, localhost, root,,; +connect c2, localhost, root,,; +connect c3, localhost, root,,; +connect c4, localhost, root,,; +# +# === Verify: no error 1030 (HA_ERR_GENERIC) was produced === +# +connection c1; +# c1 error_1030 count: +SELECT @err_1030 AS err_1030_c1; +err_1030_c1 +0 +connection c2; +# c2 error_1030 count: +SELECT @err_1030 AS err_1030_c2; +err_1030_c2 +0 +connection c3; +# c3 error_1030 count: +SELECT @err_1030 AS err_1030_c3; +err_1030_c3 +0 +connection c4; +# c4 error_1030 count: +SELECT @err_1030 AS err_1030_c4; +err_1030_c4 +0 +connection default; +# +# === Verify data integrity (PK count == index count) === +# +Data integrity: OK +# +# ============================================ +# TEST 2: Conflict storm -- all connections hit SAME 3 rows +# Maximizes conflict rate. Before fix these would be +# error 1030; after fix they are error 1213 (retryable). +# ============================================ +# +# +# === Verify: no error 1030 in conflict storm === +# +connection c1; +# c1 error_1030 count: +SELECT @err_1030 AS err_1030_c1; +err_1030_c1 +0 +connection c2; +# c2 error_1030 count: +SELECT @err_1030 AS err_1030_c2; +err_1030_c2 +0 +connection c3; +# c3 error_1030 count: +SELECT @err_1030 AS err_1030_c3; +err_1030_c3 +0 +connection c4; +# c4 error_1030 count: +SELECT @err_1030 AS err_1030_c4; +err_1030_c4 +0 +connection default; +Conflict storm: OK +# +# === Cleanup === +# +disconnect c1; +disconnect c2; +disconnect c3; +disconnect c4; +DROP TABLE t1; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_consistent_snapshot.result b/mysql-test/suite/tidesdb/r/tidesdb_consistent_snapshot.result new file mode 100644 index 0000000000000..4a21c26d96cd6 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_consistent_snapshot.result @@ -0,0 +1,68 @@ +# +# Issue #64: WITH CONSISTENT SNAPSHOT doesn't work +# +CREATE TABLE t_snap64 ( +a INT, +b INT +) ENGINE=TidesDB; +# Seed some data so global_seq > 0 +INSERT INTO t_snap64 VALUES (100, 100); +DELETE FROM t_snap64 WHERE a = 100; +# ---- TEST 1: START TRANSACTION WITH CONSISTENT SNAPSHOT ---- +connect con2, localhost, root,,; +connection default; +SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; +START TRANSACTION WITH CONSISTENT SNAPSHOT; +# Insert from connection 2 AFTER snapshot +connection con2; +INSERT INTO t_snap64 (a, b) VALUES (1, 10); +SELECT * FROM t_snap64 ORDER BY a; +a b +1 10 +# Connection 1 should NOT see the row (snapshot was before insert) +connection default; +SELECT * FROM t_snap64 ORDER BY a; +a b +COMMIT; +# After COMMIT, a new snapshot should see the row +SELECT * FROM t_snap64 ORDER BY a; +a b +1 10 +# ---- TEST 2: Multiple inserts after snapshot ---- +SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; +START TRANSACTION WITH CONSISTENT SNAPSHOT; +connection con2; +INSERT INTO t_snap64 (a, b) VALUES (2, 20); +INSERT INTO t_snap64 (a, b) VALUES (3, 30); +connection default; +# Should still only see row (1,10) from before the snapshot +SELECT * FROM t_snap64 ORDER BY a; +a b +1 10 +COMMIT; +# After COMMIT, should see all 3 rows +SELECT * FROM t_snap64 ORDER BY a; +a b +1 10 +2 20 +3 30 +# ---- TEST 3: Without CONSISTENT SNAPSHOT, new data IS visible ---- +BEGIN; +connection con2; +INSERT INTO t_snap64 (a, b) VALUES (4, 40); +connection default; +# Without CONSISTENT SNAPSHOT, should see all 4 rows +SELECT * FROM t_snap64 ORDER BY a; +a b +1 10 +2 20 +3 30 +4 40 +COMMIT; +# Cleanup +connection con2; +disconnect con2; +connection default; +DROP TABLE 
t_snap64; +# +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_crud.result b/mysql-test/suite/tidesdb/r/tidesdb_crud.result new file mode 100644 index 0000000000000..67830251f9dda --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_crud.result @@ -0,0 +1,379 @@ +# +# === Setup: install the TIDESDB engine plugin === +# +# +# ============================================ +# TEST 1: CREATE TABLE / SHOW CREATE TABLE +# ============================================ +# +CREATE TABLE t1 ( +id INT, +name VARCHAR(100), +score DECIMAL(10,2), +bio TEXT, +born DATE +) ENGINE=TIDESDB; +SHOW CREATE TABLE t1; +Table Create Table +t1 CREATE TABLE `t1` ( + `id` int(11) DEFAULT NULL, + `name` varchar(100) DEFAULT NULL, + `score` decimal(10,2) DEFAULT NULL, + `bio` text DEFAULT NULL, + `born` date DEFAULT NULL +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci +# +# ============================================ +# TEST 2: INSERT - single row +# ============================================ +# +INSERT INTO t1 VALUES (1, 'Alice', 95.50, 'First student', '2000-01-15'); +SELECT * FROM t1; +id name score bio born +1 Alice 95.50 First student 2000-01-15 +SELECT COUNT(*) AS cnt FROM t1; +cnt +1 +# +# ============================================ +# TEST 3: INSERT - multiple rows at once +# ============================================ +# +INSERT INTO t1 VALUES +(2, 'Bob', 88.00, 'Second student', '1999-06-20'), +(3, 'Charlie', 72.25, 'Third student', '2001-11-03'), +(4, 'Diana', 91.10, 'Fourth student', '1998-03-30'), +(5, 'Eve', 67.80, 'Fifth student', '2002-08-12'); +SELECT * FROM t1; +id name score bio born +1 Alice 95.50 First student 2000-01-15 +2 Bob 88.00 Second student 1999-06-20 +3 Charlie 72.25 Third student 2001-11-03 +4 Diana 91.10 Fourth student 1998-03-30 +5 Eve 67.80 Fifth student 2002-08-12 +SELECT COUNT(*) AS cnt FROM t1; +cnt +5 +# +# ============================================ +# TEST 4: SELECT with WHERE (full scan + filter) +# 
============================================ +# +SELECT * FROM t1 WHERE id = 3; +id name score bio born +3 Charlie 72.25 Third student 2001-11-03 +SELECT * FROM t1 WHERE score > 90; +id name score bio born +1 Alice 95.50 First student 2000-01-15 +4 Diana 91.10 Fourth student 1998-03-30 +SELECT * FROM t1 WHERE name LIKE '%li%'; +id name score bio born +1 Alice 95.50 First student 2000-01-15 +3 Charlie 72.25 Third student 2001-11-03 +SELECT id, name FROM t1 WHERE id >= 2 AND id <= 4; +id name +2 Bob +3 Charlie +4 Diana +# +# ============================================ +# TEST 5: SELECT with ORDER BY +# (exercises position() and rnd_pos()) +# ============================================ +# +SELECT * FROM t1 ORDER BY score ASC; +id name score bio born +5 Eve 67.80 Fifth student 2002-08-12 +3 Charlie 72.25 Third student 2001-11-03 +2 Bob 88.00 Second student 1999-06-20 +4 Diana 91.10 Fourth student 1998-03-30 +1 Alice 95.50 First student 2000-01-15 +SELECT * FROM t1 ORDER BY name DESC; +id name score bio born +5 Eve 67.80 Fifth student 2002-08-12 +4 Diana 91.10 Fourth student 1998-03-30 +3 Charlie 72.25 Third student 2001-11-03 +2 Bob 88.00 Second student 1999-06-20 +1 Alice 95.50 First student 2000-01-15 +# +# ============================================ +# TEST 6: SELECT aggregate functions +# ============================================ +# +SELECT MIN(score) AS min_s, MAX(score) AS max_s, AVG(score) AS avg_s FROM t1; +min_s max_s avg_s +67.80 95.50 82.930000 +SELECT SUM(id) AS sum_id FROM t1; +sum_id +15 +# +# ============================================ +# TEST 7: UPDATE - single row via WHERE +# ============================================ +# +UPDATE t1 SET score = 99.99 WHERE id = 1; +SELECT * FROM t1 WHERE id = 1; +id name score bio born +1 Alice 99.99 First student 2000-01-15 +# +# ============================================ +# TEST 8: UPDATE - multiple rows +# ============================================ +# +UPDATE t1 SET bio = 'Updated bio' WHERE id IN (2, 
4); +SELECT id, bio FROM t1 WHERE id IN (2, 4); +id bio +2 Updated bio +4 Updated bio +# +# ============================================ +# TEST 9: UPDATE - all rows (no WHERE) +# ============================================ +# +UPDATE t1 SET name = CONCAT(name, '!'); +SELECT id, name FROM t1; +id name +1 Alice! +2 Bob! +3 Charlie! +4 Diana! +5 Eve! +# +# ============================================ +# TEST 10: DELETE - single row +# ============================================ +# +DELETE FROM t1 WHERE id = 3; +SELECT COUNT(*) AS cnt FROM t1; +cnt +4 +SELECT * FROM t1; +id name score bio born +1 Alice! 99.99 First student 2000-01-15 +2 Bob! 88.00 Updated bio 1999-06-20 +4 Diana! 91.10 Updated bio 1998-03-30 +5 Eve! 67.80 Fifth student 2002-08-12 +# +# ============================================ +# TEST 11: DELETE - multiple rows via WHERE +# ============================================ +# +DELETE FROM t1 WHERE score < 90; +SELECT COUNT(*) AS cnt FROM t1; +cnt +2 +SELECT * FROM t1; +id name score bio born +1 Alice! 99.99 First student 2000-01-15 +4 Diana! 
91.10 Updated bio 1998-03-30 +# +# ============================================ +# TEST 12: SELECT from empty result set +# ============================================ +# +SELECT * FROM t1 WHERE id = 999; +id name score bio born +# +# ============================================ +# TEST 13: DELETE - all remaining rows via DELETE +# ============================================ +# +DELETE FROM t1; +SELECT COUNT(*) AS cnt FROM t1; +cnt +0 +SELECT * FROM t1; +id name score bio born +# +# ============================================ +# TEST 14: Re-insert after full delete +# ============================================ +# +INSERT INTO t1 VALUES (10, 'Zara', 100.00, 'Re-inserted', '2005-05-05'); +SELECT * FROM t1; +id name score bio born +10 Zara 100.00 Re-inserted 2005-05-05 +# +# ============================================ +# TEST 15: TRUNCATE TABLE (delete_all_rows) +# ============================================ +# +INSERT INTO t1 VALUES (11, 'Yuki', 55.00, 'Will be truncated', '2006-06-06'); +SELECT COUNT(*) AS cnt FROM t1; +cnt +2 +TRUNCATE TABLE t1; +SELECT COUNT(*) AS cnt FROM t1; +cnt +0 +# +# ============================================ +# TEST 16: NULL handling +# ============================================ +# +INSERT INTO t1 VALUES (20, NULL, NULL, NULL, NULL); +INSERT INTO t1 VALUES (21, 'NotNull', 50.00, 'has data', '2010-01-01'); +SELECT * FROM t1; +id name score bio born +20 NULL NULL NULL NULL +21 NotNull 50.00 has data 2010-01-01 +SELECT * FROM t1 WHERE name IS NULL; +id name score bio born +20 NULL NULL NULL NULL +SELECT * FROM t1 WHERE name IS NOT NULL; +id name score bio born +21 NotNull 50.00 has data 2010-01-01 +# +# ============================================ +# TEST 17: Multiple data types stress +# ============================================ +# +DROP TABLE t1; +CREATE TABLE t2 ( +tiny_col TINYINT, +small_col SMALLINT, +med_col MEDIUMINT, +int_col INT, +big_col BIGINT, +float_col FLOAT, +double_col DOUBLE, +dec_col DECIMAL(20,5), +char_col 
CHAR(50), +vchar_col VARCHAR(200), +text_col TEXT, +date_col DATE, +dt_col DATETIME, +ts_col TIMESTAMP NULL +) ENGINE=TIDESDB; +INSERT INTO t2 VALUES ( +127, 32767, 8388607, 2147483647, 9223372036854775807, +3.14, 2.718281828, 12345.67890, +'fixed', 'variable length', 'long text here', +'2025-12-31', '2025-12-31 23:59:59', '2025-06-15 12:00:00' +); +SELECT * FROM t2; +tiny_col small_col med_col int_col big_col float_col double_col dec_col char_col vchar_col text_col date_col dt_col ts_col +127 32767 8388607 2147483647 9223372036854775807 3.14 2.718281828 12345.67890 fixed variable length long text here 2025-12-31 2025-12-31 23:59:59 2025-06-15 12:00:00 +UPDATE t2 SET char_col = 'UPDATED', int_col = 42; +SELECT char_col, int_col FROM t2; +char_col int_col +UPDATED 42 +DELETE FROM t2; +SELECT COUNT(*) AS cnt FROM t2; +cnt +0 +DROP TABLE t2; +# +# ============================================ +# TEST 18: Multiple independent tables +# ============================================ +# +CREATE TABLE ta (a INT, val VARCHAR(20)) ENGINE=TIDESDB; +CREATE TABLE tb (b INT, val VARCHAR(20)) ENGINE=TIDESDB; +INSERT INTO ta VALUES (1, 'ta_one'), (2, 'ta_two'); +INSERT INTO tb VALUES (1, 'tb_one'), (3, 'tb_three'); +SELECT * FROM ta; +a val +1 ta_one +2 ta_two +SELECT * FROM tb; +b val +1 tb_one +3 tb_three +SELECT ta.a, ta.val, tb.b, tb.val FROM ta, tb WHERE ta.a = tb.b; +a val b val +1 ta_one 1 tb_one +DROP TABLE ta, tb; +# +# ============================================ +# TEST 19: Empty table scan (no rows ever inserted) +# ============================================ +# +CREATE TABLE t_empty (x INT) ENGINE=TIDESDB; +SELECT * FROM t_empty; +x +SELECT COUNT(*) AS cnt FROM t_empty; +cnt +0 +DROP TABLE t_empty; +# +# ============================================ +# TEST 20: REPLACE (DELETE + INSERT internally) +# ============================================ +# +CREATE TABLE t3 (id INT, val VARCHAR(50)) ENGINE=TIDESDB; +INSERT INTO t3 VALUES (1, 'original'); +SELECT * FROM t3; +id 
val +1 original +DROP TABLE t3; +# +# ============================================ +# TEST 21: INSERT ... SELECT +# ============================================ +# +CREATE TABLE t_src (id INT, val VARCHAR(50)) ENGINE=TIDESDB; +CREATE TABLE t_dst (id INT, val VARCHAR(50)) ENGINE=TIDESDB; +INSERT INTO t_src VALUES (1, 'aaa'), (2, 'bbb'), (3, 'ccc'); +INSERT INTO t_dst SELECT * FROM t_src; +SELECT * FROM t_dst; +id val +1 aaa +2 bbb +3 ccc +DROP TABLE t_src, t_dst; +# +# ============================================ +# TEST 22: UPDATE with expression +# ============================================ +# +CREATE TABLE t4 (id INT, counter INT) ENGINE=TIDESDB; +INSERT INTO t4 VALUES (1, 0), (2, 10), (3, 20); +UPDATE t4 SET counter = counter + 5; +SELECT * FROM t4; +id counter +1 5 +2 15 +3 25 +UPDATE t4 SET counter = counter * 2 WHERE id > 1; +SELECT * FROM t4; +id counter +1 5 +2 30 +3 50 +DROP TABLE t4; +# +# ============================================ +# TEST 23: Large-ish batch insert + delete +# ============================================ +# +CREATE TABLE t_batch (id INT, padding VARCHAR(100)) ENGINE=TIDESDB; +SELECT COUNT(*) AS cnt FROM t_batch; +cnt +100 +DELETE FROM t_batch WHERE id > 50; +SELECT COUNT(*) AS cnt FROM t_batch; +cnt +50 +DELETE FROM t_batch WHERE id <= 25; +SELECT COUNT(*) AS cnt FROM t_batch; +cnt +25 +TRUNCATE TABLE t_batch; +SELECT COUNT(*) AS cnt FROM t_batch; +cnt +0 +DROP TABLE t_batch; +# +# ============================================ +# TEST 24: DROP TABLE (delete_table) +# ============================================ +# +CREATE TABLE t_drop (a INT) ENGINE=TIDESDB; +INSERT INTO t_drop VALUES (1), (2), (3); +DROP TABLE t_drop; +SELECT * FROM t_drop; +ERROR 42S02: Table 'test.t_drop' doesn't exist +# +# +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_data_home_dir.result b/mysql-test/suite/tidesdb/r/tidesdb_data_home_dir.result new file mode 100644 index 0000000000000..0ffe0fcd99c43 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_data_home_dir.result @@ -0,0 +1,10 @@ +# +# Verify tidesdb_data_home_dir is visible and read-only +# +SHOW VARIABLES LIKE 'tidesdb_data_home_dir'; +Variable_name Value +tidesdb_data_home_dir +SET GLOBAL tidesdb_data_home_dir = '/tmp/test'; +ERROR HY000: Variable 'tidesdb_data_home_dir' is a read only variable +# +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_defaults_alignment.result b/mysql-test/suite/tidesdb/r/tidesdb_defaults_alignment.result new file mode 100644 index 0000000000000..75bf8979b6441 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_defaults_alignment.result @@ -0,0 +1,75 @@ +# library-aligned column-family defaults +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_min_levels'; +Variable_name Value +tidesdb_default_min_levels 1 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_dividing_level_offset'; +Variable_name Value +tidesdb_default_dividing_level_offset 1 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_level_size_ratio'; +Variable_name Value +tidesdb_default_level_size_ratio 10 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_klog_value_threshold'; +Variable_name Value +tidesdb_default_klog_value_threshold 512 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_bloom_filter'; +Variable_name Value +tidesdb_default_bloom_filter ON +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_bloom_fpr'; +Variable_name Value +tidesdb_default_bloom_fpr 100 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_block_indexes'; +Variable_name Value +tidesdb_default_block_indexes ON +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_index_sample_ratio'; +Variable_name Value +tidesdb_default_index_sample_ratio 1 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_block_index_prefix_len'; +Variable_name Value +tidesdb_default_block_index_prefix_len 16 +SHOW GLOBAL 
VARIABLES LIKE 'tidesdb_default_skip_list_max_level'; +Variable_name Value +tidesdb_default_skip_list_max_level 12 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_skip_list_probability'; +Variable_name Value +tidesdb_default_skip_list_probability 25 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_min_disk_space'; +Variable_name Value +tidesdb_default_min_disk_space 104857600 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_l1_file_count_trigger'; +Variable_name Value +tidesdb_default_l1_file_count_trigger 4 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_l0_queue_stall_threshold'; +Variable_name Value +tidesdb_default_l0_queue_stall_threshold 10 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_tombstone_density_trigger'; +Variable_name Value +tidesdb_default_tombstone_density_trigger 0 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_tombstone_density_min_entries'; +Variable_name Value +tidesdb_default_tombstone_density_min_entries 1024 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_compression'; +Variable_name Value +tidesdb_default_compression LZ4 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_use_btree'; +Variable_name Value +tidesdb_default_use_btree OFF +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_object_lazy_compaction'; +Variable_name Value +tidesdb_default_object_lazy_compaction OFF +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_object_prefetch_compaction'; +Variable_name Value +tidesdb_default_object_prefetch_compaction ON +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_sync_interval_us'; +Variable_name Value +tidesdb_default_sync_interval_us 128000 +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_write_buffer_size'; +Variable_name Value +tidesdb_default_write_buffer_size 67108864 +# deliberate deviations from the library default, see README +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_sync_mode'; +Variable_name Value +tidesdb_default_sync_mode FULL +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_isolation_level'; +Variable_name Value +tidesdb_default_isolation_level REPEATABLE_READ +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_drop_create.result b/mysql-test/suite/tidesdb/r/tidesdb_drop_create.result new file mode 100644 index 0000000000000..6ad9fbe392eef --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_drop_create.result @@ -0,0 +1,79 @@ +# +# Issue #57: Data survives DROP + CREATE +# +# ---- TEST 1: DROP TABLE must destroy data ---- +CREATE TABLE t_drop57 (i INT NOT NULL PRIMARY KEY, v VARCHAR(50)) ENGINE=TidesDB; +INSERT INTO t_drop57 VALUES (1, 'aaa'), (2, 'bbb'), (3, 'ccc'); +SELECT * FROM t_drop57 ORDER BY i; +i v +1 aaa +2 bbb +3 ccc +DROP TABLE t_drop57; +CREATE TABLE t_drop57 (i INT NOT NULL PRIMARY KEY, v VARCHAR(50)) ENGINE=TidesDB; +# Must be empty after DROP + CREATE +SELECT COUNT(*) FROM t_drop57; +COUNT(*) +0 +SELECT * FROM t_drop57 ORDER BY i; +i v +DROP TABLE t_drop57; +# ---- TEST 2: CREATE OR REPLACE must destroy data ---- +CREATE TABLE t_cor57 (i INT) ENGINE=TidesDB; +INSERT INTO t_cor57 VALUES (10), (20), (30); +SELECT * FROM t_cor57 ORDER BY i; +i +10 +20 +30 +CREATE OR REPLACE TABLE t_cor57 (i INT) ENGINE=TidesDB; +# Must be empty after CREATE OR REPLACE +SELECT COUNT(*) FROM t_cor57; +COUNT(*) +0 +SELECT * FROM t_cor57 ORDER BY i; +i +DROP TABLE t_cor57; +# ---- TEST 3: Secondary indexes must also be cleaned ---- +CREATE TABLE t_idx57 ( +id INT NOT NULL PRIMARY KEY, +val INT NOT NULL, +KEY idx_val (val) +) ENGINE=TidesDB; +INSERT INTO t_idx57 VALUES (1, 100), (2, 200), (3, 300); +SELECT * FROM t_idx57 ORDER BY id; +id val +1 100 +2 200 +3 300 +SELECT val FROM t_idx57 WHERE val = 200; +val +200 +DROP TABLE t_idx57; +CREATE TABLE t_idx57 ( +id INT NOT NULL PRIMARY KEY, +val INT NOT NULL, +KEY idx_val (val) +) ENGINE=TidesDB; +# Must be empty after DROP + CREATE (including index) +SELECT COUNT(*) FROM t_idx57; +COUNT(*) +0 +SELECT * FROM t_idx57 ORDER BY id; +id val +SELECT val FROM t_idx57 WHERE val = 200; +val +DROP TABLE t_idx57; +# ---- TEST 4: TRUNCATE TABLE still works ---- +CREATE TABLE t_trunc57 (i INT NOT 
NULL PRIMARY KEY) ENGINE=TidesDB; +INSERT INTO t_trunc57 VALUES (1), (2), (3); +SELECT COUNT(*) FROM t_trunc57; +COUNT(*) +3 +TRUNCATE TABLE t_trunc57; +SELECT COUNT(*) FROM t_trunc57; +COUNT(*) +0 +DROP TABLE t_trunc57; +# +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_encryption.result b/mysql-test/suite/tidesdb/r/tidesdb_encryption.result new file mode 100644 index 0000000000000..b684e7bc30e3d --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_encryption.result @@ -0,0 +1,143 @@ +# +# ============================================ +# TEST 1: Basic encrypted table - CRUD +# ============================================ +# +CREATE TABLE t_enc1 ( +id INT NOT NULL PRIMARY KEY, +val VARCHAR(100) +) ENGINE=TIDESDB `ENCRYPTED`=YES; +INSERT INTO t_enc1 VALUES (1, 'secret_one'); +INSERT INTO t_enc1 VALUES (2, 'secret_two'); +INSERT INTO t_enc1 VALUES (3, 'secret_three'); +SELECT * FROM t_enc1 ORDER BY id; +id val +1 secret_one +2 secret_two +3 secret_three +UPDATE t_enc1 SET val = 'updated_secret' WHERE id = 2; +SELECT * FROM t_enc1 WHERE id = 2; +id val +2 updated_secret +DELETE FROM t_enc1 WHERE id = 1; +SELECT * FROM t_enc1 ORDER BY id; +id val +2 updated_secret +3 secret_three +DROP TABLE t_enc1; +# +# ============================================ +# TEST 2: SHOW CREATE TABLE shows ENCRYPTED option +# ============================================ +# +CREATE TABLE t_enc2 ( +id INT NOT NULL PRIMARY KEY, +name VARCHAR(50), +amount INT +) ENGINE=TIDESDB `ENCRYPTED`=YES `ENCRYPTION_KEY_ID`=2; +SHOW CREATE TABLE t_enc2; +Table Create Table +t_enc2 CREATE TABLE `t_enc2` ( + `id` int(11) NOT NULL, + `name` varchar(50) DEFAULT NULL, + `amount` int(11) DEFAULT NULL, + PRIMARY KEY (`id`) +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `ENCRYPTED`=YES `ENCRYPTION_KEY_ID`=2 +INSERT INTO t_enc2 VALUES (1, 'alice', 100); +SELECT * FROM t_enc2; +id name amount +1 alice 100 +DROP TABLE t_enc2; +# +# ============================================ +# TEST 3: 
Non-encrypted table still works +# ============================================ +# +CREATE TABLE t_noenc ( +id INT NOT NULL PRIMARY KEY, +val VARCHAR(50) +) ENGINE=TIDESDB; +INSERT INTO t_noenc VALUES (1, 'plain_text'); +SELECT * FROM t_noenc; +id val +1 plain_text +DROP TABLE t_noenc; +# +# ============================================ +# TEST 4: Encrypted table with secondary index +# ============================================ +# +CREATE TABLE t_enc_idx ( +id INT NOT NULL PRIMARY KEY, +name VARCHAR(50), +age INT, +KEY idx_name (name) +) ENGINE=TIDESDB `ENCRYPTED`=YES; +INSERT INTO t_enc_idx VALUES (1, 'alice', 30); +INSERT INTO t_enc_idx VALUES (2, 'bob', 25); +INSERT INTO t_enc_idx VALUES (3, 'charlie', 35); +INSERT INTO t_enc_idx VALUES (4, 'alice', 28); +SELECT * FROM t_enc_idx WHERE name = 'alice' ORDER BY id; +id name age +1 alice 30 +4 alice 28 +SELECT * FROM t_enc_idx ORDER BY id; +id name age +1 alice 30 +2 bob 25 +3 charlie 35 +4 alice 28 +DROP TABLE t_enc_idx; +# +# ============================================ +# TEST 5: Encrypted table with AUTO_INCREMENT +# ============================================ +# +CREATE TABLE t_enc_auto ( +id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, +data VARCHAR(100) +) ENGINE=TIDESDB `ENCRYPTED`=YES; +INSERT INTO t_enc_auto (data) VALUES ('row_a'); +INSERT INTO t_enc_auto (data) VALUES ('row_b'); +INSERT INTO t_enc_auto (data) VALUES ('row_c'); +SELECT * FROM t_enc_auto ORDER BY id; +id data +1 row_a +2 row_b +3 row_c +DROP TABLE t_enc_auto; +# +# ============================================ +# TEST 6: Encrypted table with BLOB data +# ============================================ +# +CREATE TABLE t_enc_blob ( +id INT NOT NULL PRIMARY KEY, +payload BLOB +) ENGINE=TIDESDB `ENCRYPTED`=YES; +INSERT INTO t_enc_blob VALUES (1, REPEAT('A', 500)); +INSERT INTO t_enc_blob VALUES (2, REPEAT('B', 1000)); +SELECT id, LENGTH(payload) AS plen, LEFT(payload, 5) AS head FROM t_enc_blob ORDER BY id; +id plen head +1 500 AAAAA +2 1000 BBBBB 
+DROP TABLE t_enc_blob; +# +# ============================================ +# TEST 7: Encrypted table with NULL values +# ============================================ +# +CREATE TABLE t_enc_null ( +id INT NOT NULL PRIMARY KEY, +val VARCHAR(50) NULL +) ENGINE=TIDESDB `ENCRYPTED`=YES; +INSERT INTO t_enc_null VALUES (1, NULL); +INSERT INTO t_enc_null VALUES (2, 'not_null'); +INSERT INTO t_enc_null VALUES (3, NULL); +SELECT * FROM t_enc_null ORDER BY id; +id val +1 NULL +2 not_null +3 NULL +DROP TABLE t_enc_null; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_encryption_rotation.result b/mysql-test/suite/tidesdb/r/tidesdb_encryption_rotation.result new file mode 100644 index 0000000000000..2331cdb6ea551 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_encryption_rotation.result @@ -0,0 +1,47 @@ +# +# rows encrypted under key version 1 +# +CREATE TABLE enc (id INT PRIMARY KEY, payload VARCHAR(200)) ENGINE=TidesDB `ENCRYPTED`=YES; +INSERT INTO enc VALUES (1,'written under version one'),(2,'also version one'); +SELECT * FROM enc ORDER BY id; +id payload +1 written under version one +2 also version one +# +# rotate the key, then write rows under key version 2 +# +SET GLOBAL debug_key_management_version = 2; +INSERT INTO enc VALUES (3,'written under version two'),(4,'also version two'); +# all four rows decrypt, the first two under v1 and the rest under v2 +SELECT * FROM enc ORDER BY id; +id payload +1 written under version one +2 also version one +3 written under version two +4 also version two +# +# rotate again and confirm all three key vintages still read back +# +SET GLOBAL debug_key_management_version = 3; +INSERT INTO enc VALUES (5,'written under version three'); +SELECT * FROM enc ORDER BY id; +id payload +1 written under version one +2 also version one +3 written under version two +4 also version two +5 written under version three +# +# a fresh open of the table still reads every version +# +FLUSH TABLES; +SELECT * FROM enc ORDER BY id; +id payload 
+1 written under version one +2 also version one +3 written under version two +4 also version two +5 written under version three +DROP TABLE enc; +SET GLOBAL debug_key_management_version = DEFAULT; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_engine_convert.result b/mysql-test/suite/tidesdb/r/tidesdb_engine_convert.result new file mode 100644 index 0000000000000..9ff5d29576d07 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_engine_convert.result @@ -0,0 +1,126 @@ +# +# TEST 1: InnoDB -> TidesDB migration +# +CREATE TABLE t_innodb ( +id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, +name VARCHAR(100), +val DECIMAL(10,2), +created DATETIME DEFAULT CURRENT_TIMESTAMP, +KEY idx_name (name) +) ENGINE=InnoDB; +INSERT INTO t_innodb (name, val) VALUES ('alpha', 1.50), ('beta', 2.75), ('gamma', 3.00); +INSERT INTO t_innodb (name, val) VALUES ('delta', 4.25), ('epsilon', 5.50); +SELECT id, name, val FROM t_innodb ORDER BY id; +id name val +1 alpha 1.50 +2 beta 2.75 +3 gamma 3.00 +4 delta 4.25 +5 epsilon 5.50 +ALTER TABLE t_innodb ENGINE=TidesDB; +Warnings: +Note 1071 Specified key was too long; max key length is 255 bytes +SHOW CREATE TABLE t_innodb; +Table Create Table +t_innodb CREATE TABLE `t_innodb` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `name` varchar(100) DEFAULT NULL, + `val` decimal(10,2) DEFAULT NULL, + `created` datetime DEFAULT current_timestamp(), + PRIMARY KEY (`id`), + KEY `idx_name` (`name`(63)) +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci +SELECT id, name, val FROM t_innodb ORDER BY id; +id name val +1 alpha 1.50 +2 beta 2.75 +3 gamma 3.00 +4 delta 4.25 +5 epsilon 5.50 +SELECT name FROM t_innodb WHERE name = 'gamma'; +name +gamma +# +# TEST 2: TidesDB -> InnoDB migration +# +ALTER TABLE t_innodb ENGINE=InnoDB; +SELECT id, name, val FROM t_innodb ORDER BY id; +id name val +1 alpha 1.50 +2 beta 2.75 +3 gamma 3.00 +4 delta 4.25 +5 epsilon 5.50 +SELECT name FROM t_innodb WHERE name = 'delta'; +name +delta +# +# TEST 3: 
Round-trip InnoDB -> TidesDB -> InnoDB +# +CREATE TABLE t_round (id INT PRIMARY KEY, data TEXT) ENGINE=InnoDB; +INSERT INTO t_round VALUES (1, REPEAT('X', 5000)), (2, REPEAT('Y', 5000)); +ALTER TABLE t_round ENGINE=TidesDB; +SELECT id, LENGTH(data) FROM t_round ORDER BY id; +id LENGTH(data) +1 5000 +2 5000 +ALTER TABLE t_round ENGINE=InnoDB; +SELECT id, LENGTH(data) FROM t_round ORDER BY id; +id LENGTH(data) +1 5000 +2 5000 +# +# TEST 4: Migration with BLOB columns +# +CREATE TABLE t_blob_mig ( +id INT PRIMARY KEY, +img LONGBLOB, +descr TEXT +) ENGINE=InnoDB; +INSERT INTO t_blob_mig VALUES (1, REPEAT('A', 100000), 'first image'); +INSERT INTO t_blob_mig VALUES (2, REPEAT('B', 100000), 'second image'); +ALTER TABLE t_blob_mig ENGINE=TidesDB; +SELECT id, LENGTH(img), descr FROM t_blob_mig ORDER BY id; +id LENGTH(img) descr +1 100000 first image +2 100000 second image +# +# TEST 5: Migration preserves auto-increment +# +CREATE TABLE t_ai (id INT AUTO_INCREMENT PRIMARY KEY, v INT) ENGINE=InnoDB; +INSERT INTO t_ai (v) VALUES (10), (20), (30); +ALTER TABLE t_ai ENGINE=TidesDB; +INSERT INTO t_ai (v) VALUES (40); +SELECT * FROM t_ai ORDER BY id; +id v +1 10 +2 20 +3 30 +4 40 +# +# TEST 6: Migration with composite PK and multiple indexes +# +CREATE TABLE t_complex ( +a INT NOT NULL, +b INT NOT NULL, +c VARCHAR(50), +d INT, +PRIMARY KEY (a, b), +KEY idx_c (c), +KEY idx_d (d) +) ENGINE=InnoDB; +INSERT INTO t_complex VALUES (1,1,'foo',100), (1,2,'bar',200), (2,1,'baz',100); +ALTER TABLE t_complex ENGINE=TidesDB; +SELECT * FROM t_complex WHERE a = 1 ORDER BY b; +a b c d +1 1 foo 100 +1 2 bar 200 +SELECT c FROM t_complex WHERE d = 100 ORDER BY c; +c +baz +foo +# +# Cleanup +# +DROP TABLE t_innodb, t_round, t_blob_mig, t_ai, t_complex; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_engine_status.result b/mysql-test/suite/tidesdb/r/tidesdb_engine_status.result new file mode 100644 index 0000000000000..6b5cbae66227d --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_engine_status.result @@ -0,0 +1,48 @@ +# +# SHOW ENGINE TIDESDB STATUS should return output +# +CREATE TABLE t1 (id INT PRIMARY KEY, val INT) ENGINE=TidesDB; +INSERT INTO t1 VALUES (1,10),(2,20),(3,30); +SHOW ENGINE TIDESDB STATUS; +Type Name Status +TIDESDB ================== TidesDB Engine Status ================== +Data directory: TIDESDB_DATA_DIR +Unified memtable: ON +Column families: N +Global sequence: N + +--- Memory --- +Total system memory: N MB +Resolved memory limit: N MB +Memory pressure level: N +Total memtable bytes: N +Transaction memory bytes: N + +--- Storage --- +Total SSTables: N +Open SSTable handles: N +Total data size: N bytes +Immutable memtables: N + +--- Background --- +Flush pending: N +Flush queue size: N +Compaction queue size: N + +--- Block Cache --- +Enabled: YES +Entries: N +Size: N bytes +Hits: N +Misses: N +Hit rate: N.N% +Partitions: N + +--- Tombstones --- +Total tombstones: N +Tombstone ratio: N.N% +Worst SSTable density: N.N% at level N + +DROP TABLE t1; +# +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_fts_blend_chars.result b/mysql-test/suite/tidesdb/r/tidesdb_fts_blend_chars.result new file mode 100644 index 0000000000000..24b916636c303 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_fts_blend_chars.result @@ -0,0 +1,84 @@ +# +# TidesDB FTS blend_chars support for Romance language elision +# +SET GLOBAL tidesdb_fts_blend_chars = "'"; +CREATE TABLE docs ( +id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, +body TEXT, +FULLTEXT KEY ft_body (body) +) ENGINE=TidesDB; +INSERT INTO docs (body) VALUES +("L'aria fresca della montagna"), +("Dell'aria pura si respira bene"), +("Un'aria di festa pervadeva la piazza"), +("O'Malley went to the store"), +("The cat sat on the mat"); +# Sub-part search: aria matches Italian elision docs +SELECT id FROM docs WHERE MATCH(body) AGAINST('aria') ORDER BY id; +id +1 +2 +3 +# Blended form: l'aria ranks doc 1 highest +SELECT id FROM docs WHERE MATCH(body) AGAINST("l'aria") ORDER BY id; +id +1 +2 +3 +# Sub-part: malley finds O'Malley +SELECT id FROM docs WHERE MATCH(body) AGAINST('malley') ORDER BY id; +id +4 +# Blended form: o'malley +SELECT id FROM docs WHERE MATCH(body) AGAINST("o'malley") ORDER BY id; +id +4 +# Blended form: dell'aria +SELECT id FROM docs WHERE MATCH(body) AGAINST("dell'aria") ORDER BY id; +id +1 +2 +3 +# Non-blend word: cat (should still work) +SELECT id FROM docs WHERE MATCH(body) AGAINST('cat') ORDER BY id; +id +5 +# Stop word through blend: the (still filtered) +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('the'); +COUNT(*) +0 +# Boolean mode with blend chars +SELECT id FROM docs WHERE MATCH(body) AGAINST("+aria -malley" IN BOOLEAN MODE) ORDER BY id; +id +1 +2 +3 +# Update with blended content +UPDATE docs SET body = "L'orchestra dell'opera suona bene" WHERE id = 5; +SELECT id FROM docs WHERE MATCH(body) AGAINST('orchestra') ORDER BY id; +id +5 +SELECT id FROM docs WHERE MATCH(body) AGAINST("dell'opera") ORDER BY id; +id +2 +5 +# Insert more elision forms 
+INSERT INTO docs (body) VALUES +("Nell'acqua limpida del lago"), +("All'interno del castello medievale"); +SELECT id FROM docs WHERE MATCH(body) AGAINST('acqua') ORDER BY id; +id +6 +SELECT id FROM docs WHERE MATCH(body) AGAINST("nell'acqua") ORDER BY id; +id +6 +SELECT id FROM docs WHERE MATCH(body) AGAINST('interno') ORDER BY id; +id +7 +# Verify sysvar +SHOW GLOBAL VARIABLES LIKE 'tidesdb_fts_blend_chars'; +Variable_name Value +tidesdb_fts_blend_chars +# Reset blend chars +SET GLOBAL tidesdb_fts_blend_chars = NULL; +DROP TABLE docs; diff --git a/mysql-test/suite/tidesdb/r/tidesdb_fts_stopword_table.result b/mysql-test/suite/tidesdb/r/tidesdb_fts_stopword_table.result new file mode 100644 index 0000000000000..75d71df4abbb8 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_fts_stopword_table.result @@ -0,0 +1,19 @@ +# a TidesDB table holding one custom stop word per row +CREATE TABLE swords (value VARCHAR(50)) ENGINE=TidesDB; +INSERT INTO swords VALUES ('zebra'), ('quokka'); +# point the engine at the custom stop word table +SET GLOBAL tidesdb_ft_stopword_table = 'test/swords'; +# build a full-text document that contains a custom stop word +CREATE TABLE docs (id INT PRIMARY KEY, body TEXT, FULLTEXT (body)) ENGINE=TidesDB; +INSERT INTO docs VALUES (1, 'zebra crossing beside the apple tree'); +# zebra is now a stop word, so it is never indexed and matches nothing +SELECT id FROM docs WHERE MATCH(body) AGAINST('zebra' IN BOOLEAN MODE); +id +# a normal word still matches +SELECT id FROM docs WHERE MATCH(body) AGAINST('apple' IN BOOLEAN MODE); +id +1 +DROP TABLE docs; +DROP TABLE swords; +SET GLOBAL tidesdb_ft_stopword_table = DEFAULT; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_fts_stopwords.result b/mysql-test/suite/tidesdb/r/tidesdb_fts_stopwords.result new file mode 100644 index 0000000000000..89a1857891767 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_fts_stopwords.result @@ -0,0 +1,117 @@ +# +# TidesDB FTS stop word filtering +# +CREATE TABLE docs ( +id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, +body TEXT, +FULLTEXT KEY ft_body (body) +) ENGINE=TidesDB; +INSERT INTO docs (body) VALUES +('The quick brown fox jumps over the lazy dog'), +('A man is walking in the park with his dog'), +('How to build a house from scratch'), +('This is a test of the emergency broadcast system'), +('The cat sat on the mat by the door'); +# Stop words should return 0 rows +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('the'); +COUNT(*) +0 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('is'); +COUNT(*) +0 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('a'); +COUNT(*) +0 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('of'); +COUNT(*) +0 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('in'); +COUNT(*) +0 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('on'); +COUNT(*) +0 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('by'); +COUNT(*) +0 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('with'); +COUNT(*) +0 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('for'); +COUNT(*) +0 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('this'); +COUNT(*) +0 +# Real words should return matches +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('fox'); +COUNT(*) +1 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('dog'); +COUNT(*) +2 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('house'); +COUNT(*) +1 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('cat'); +COUNT(*) +1 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('emergency'); +COUNT(*) +1 +# Boolean mode with stop words +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('+dog' IN BOOLEAN 
MODE); +COUNT(*) +2 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('+the' IN BOOLEAN MODE); +COUNT(*) +0 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('+dog -cat' IN BOOLEAN MODE); +COUNT(*) +2 +# Multi-word query mixing stop words and real words +SELECT id FROM docs WHERE MATCH(body) AGAINST('quick brown') ORDER BY id; +id +1 +SELECT id FROM docs WHERE MATCH(body) AGAINST('build house') ORDER BY id; +id +3 +# Verify stop word sysvar exists and defaults +SHOW GLOBAL VARIABLES LIKE 'tidesdb_ft_stopword_table'; +Variable_name Value +tidesdb_ft_stopword_table +# Insert more rows after initial index creation +INSERT INTO docs (body) VALUES +('The world is a beautiful place to live in'), +('Building bridges for the future of our community'); +# Stop words still filtered for new rows +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('the'); +COUNT(*) +0 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('is'); +COUNT(*) +0 +# Real words from new rows work +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('beautiful'); +COUNT(*) +1 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('bridges'); +COUNT(*) +1 +# UPDATE should maintain stop word filtering +UPDATE docs SET body = 'The revised document about the important topic' WHERE id = 1; +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('the'); +COUNT(*) +0 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('revised'); +COUNT(*) +1 +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('important'); +COUNT(*) +1 +# DELETE and verify +DELETE FROM docs WHERE id = 2; +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('dog'); +COUNT(*) +0 +DROP TABLE docs; diff --git a/mysql-test/suite/tidesdb/r/tidesdb_fulltext.result b/mysql-test/suite/tidesdb/r/tidesdb_fulltext.result new file mode 100644 index 0000000000000..32d6b0fe3d600 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_fulltext.result @@ -0,0 +1,122 @@ +# +# Setup +# +CREATE TABLE articles ( +id INT NOT NULL PRIMARY KEY, +title 
VARCHAR(200), +body TEXT, +FULLTEXT ft_content (title, body) +) ENGINE=TidesDB; +INSERT INTO articles VALUES (1, 'MySQL Tutorial', 'DBMS stands for DataBase Management System'); +INSERT INTO articles VALUES (2, 'How To Use MySQL', 'After you went through a tutorial you can start'); +INSERT INTO articles VALUES (3, 'Optimizing MySQL', 'In this tutorial we show optimization techniques'); +INSERT INTO articles VALUES (4, 'TidesDB Guide', 'TidesDB is an LSM tree storage engine'); +INSERT INTO articles VALUES (5, 'Database Systems', 'A database management system manages data efficiently'); +# +# TEST 1: Natural language search +# +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('tutorial') +ORDER BY MATCH(title, body) AGAINST('tutorial') DESC; +id title +3 Optimizing MySQL +1 MySQL Tutorial +2 How To Use MySQL +# +# TEST 2: Multi-term natural language search +# +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('database management') +ORDER BY MATCH(title, body) AGAINST('database management') DESC; +id title +5 Database Systems +1 MySQL Tutorial +# +# TEST 3: No match returns empty +# +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('nonexistent'); +id title +# +# TEST 4: Boolean mode - required term +# +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('+mysql +tutorial' IN BOOLEAN MODE) +ORDER BY id; +id title +1 MySQL Tutorial +2 How To Use MySQL +3 Optimizing MySQL +# +# TEST 5: Boolean mode - excluded term +# +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('+mysql -tutorial' IN BOOLEAN MODE) +ORDER BY id; +id title +# +# TEST 6: Boolean mode - prefix wildcard +# +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('optim*' IN BOOLEAN MODE) +ORDER BY id; +id title +3 Optimizing MySQL +# +# TEST 7: UPDATE changes FTS results +# +UPDATE articles SET body = 'This tutorial covers advanced optimization and tuning' WHERE id = 4; +SELECT id, title FROM articles +WHERE 
MATCH(title, body) AGAINST('tutorial') +ORDER BY MATCH(title, body) AGAINST('tutorial') DESC; +id title +3 Optimizing MySQL +1 MySQL Tutorial +4 TidesDB Guide +2 How To Use MySQL +# +# TEST 8: DELETE removes from FTS results +# +DELETE FROM articles WHERE id = 3; +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('tutorial') +ORDER BY MATCH(title, body) AGAINST('tutorial') DESC; +id title +1 MySQL Tutorial +4 TidesDB Guide +2 How To Use MySQL +# +# TEST 9: Single-column FULLTEXT index +# +DROP TABLE articles; +CREATE TABLE articles ( +id INT NOT NULL PRIMARY KEY, +title VARCHAR(200), +FULLTEXT (title) +) ENGINE=TidesDB; +INSERT INTO articles VALUES (1, 'Introduction to MySQL'); +INSERT INTO articles VALUES (2, 'Advanced PostgreSQL'); +INSERT INTO articles VALUES (3, 'MySQL Performance Tuning'); +SELECT id, title FROM articles +WHERE MATCH(title) AGAINST('mysql') +ORDER BY MATCH(title) AGAINST('mysql') DESC; +id title +1 Introduction to MySQL +3 MySQL Performance Tuning +# +# TEST 10: Oversize query terms must not overflow the stack key buffer. +# fts_build_key truncates inserted keys to 512 bytes, but a user can pass +# a multi-byte search term whose byte length exceeds the on-disk cap. +# The query must complete without crashing and return no match. +# +SELECT id, title FROM articles +WHERE MATCH(title) AGAINST(REPEAT('a', 1024) IN BOOLEAN MODE); +id title +SELECT id, title FROM articles +WHERE MATCH(title) AGAINST(CONCAT(REPEAT('a', 1024), '*') IN BOOLEAN MODE); +id title +# +# Cleanup +# +DROP TABLE articles; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_fulltext_phrase.result b/mysql-test/suite/tidesdb/r/tidesdb_fulltext_phrase.result new file mode 100644 index 0000000000000..9736eb6ca11ae --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_fulltext_phrase.result @@ -0,0 +1,90 @@ +# +# Setup +# +CREATE TABLE docs ( +id INT NOT NULL PRIMARY KEY, +body TEXT, +FULLTEXT (body) +) ENGINE=TidesDB; +INSERT INTO docs VALUES (1, 'the quick brown fox jumps over the lazy dog'); +INSERT INTO docs VALUES (2, 'quick fox and lazy dog play together'); +INSERT INTO docs VALUES (3, 'the brown dog is not lazy at all'); +INSERT INTO docs VALUES (4, 'completely unrelated content here'); +INSERT INTO docs VALUES (5, 'the fox is quick and the dog is lazy'); +# +# TEST 1: Exact phrase match +# +SELECT id FROM docs +WHERE MATCH(body) AGAINST('"quick brown fox"' IN BOOLEAN MODE) ORDER BY id; +id +1 +# +# TEST 2: Phrase appears in multiple rows +# +SELECT id FROM docs +WHERE MATCH(body) AGAINST('"lazy dog"' IN BOOLEAN MODE) ORDER BY id; +id +1 +2 +# +# TEST 3: Phrase with wrong word order (no match) +# +SELECT id FROM docs +WHERE MATCH(body) AGAINST('"fox quick"' IN BOOLEAN MODE) ORDER BY id; +id +5 +# +# TEST 4: Phrase + required term +# +SELECT id FROM docs +WHERE MATCH(body) AGAINST('+"lazy dog" +fox' IN BOOLEAN MODE) ORDER BY id; +id +1 +2 +# +# TEST 5: Phrase + excluded term +# +SELECT id FROM docs +WHERE MATCH(body) AGAINST('+"lazy dog" -quick' IN BOOLEAN MODE) ORDER BY id; +id +# +# TEST 6: Wildcard with multiple matching lengths +# +DROP TABLE docs; +CREATE TABLE docs (id INT PRIMARY KEY, body TEXT, FULLTEXT(body)) ENGINE=TidesDB; +INSERT INTO docs VALUES (1, 'optimization techniques are important'); +INSERT INTO docs VALUES (2, 'optimizing queries is essential'); +INSERT INTO docs VALUES (3, 'the optimal solution exists'); +INSERT INTO docs VALUES (4, 'nothing related here'); +SELECT id FROM docs +WHERE MATCH(body) AGAINST('optim*' IN BOOLEAN MODE) ORDER BY id; +id +1 +2 +3 +# +# 
TEST 7: Wildcard with short prefix +# +SELECT id FROM docs +WHERE MATCH(body) AGAINST('opt*' IN BOOLEAN MODE) ORDER BY id; +id +1 +2 +3 +# +# TEST 8: Two-word phrase +# +DROP TABLE docs; +CREATE TABLE docs (id INT PRIMARY KEY, body TEXT, FULLTEXT(body)) ENGINE=TidesDB; +INSERT INTO docs VALUES (1, 'database management system'); +INSERT INTO docs VALUES (2, 'management of databases'); +INSERT INTO docs VALUES (3, 'the database has good management'); +SELECT id FROM docs +WHERE MATCH(body) AGAINST('"database management"' IN BOOLEAN MODE) ORDER BY id; +id +1 +# +# Cleanup +# +DROP TABLE docs; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_hidden_pk.result b/mysql-test/suite/tidesdb/r/tidesdb_hidden_pk.result new file mode 100644 index 0000000000000..01d3e73453ec8 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_hidden_pk.result @@ -0,0 +1,64 @@ +# +# TEST 1: Basic CRUD without PK +# +CREATE TABLE t_nopk (a INT, b VARCHAR(100)) ENGINE=TidesDB; +INSERT INTO t_nopk VALUES (1, 'one'), (2, 'two'), (3, 'three'); +INSERT INTO t_nopk VALUES (1, 'duplicate_a'); +SELECT * FROM t_nopk ORDER BY a, b; +a b +1 duplicate_a +1 one +2 two +3 three +# +# TEST 2: UPDATE and DELETE without PK +# +UPDATE t_nopk SET b = 'UPDATED' WHERE a = 2; +SELECT * FROM t_nopk WHERE a = 2; +a b +2 UPDATED +DELETE FROM t_nopk WHERE b = 'duplicate_a'; +SELECT * FROM t_nopk ORDER BY a; +a b +1 one +2 UPDATED +3 three +# +# TEST 3: Hidden PK with secondary index +# +CREATE TABLE t_nopk_idx (x INT, y INT, KEY(x)) ENGINE=TidesDB; +INSERT INTO t_nopk_idx VALUES (10, 100), (20, 200), (10, 300), (30, 400); +SELECT y FROM t_nopk_idx WHERE x = 10 ORDER BY y; +y +100 +300 +SELECT COUNT(*) FROM t_nopk_idx; +COUNT(*) +4 +# +# TEST 4: Hidden PK with BLOB +# +CREATE TABLE t_nopk_blob (data LONGBLOB, tag VARCHAR(20)) ENGINE=TidesDB; +INSERT INTO t_nopk_blob VALUES (REPEAT('X', 50000), 'big'); +INSERT INTO t_nopk_blob VALUES (REPEAT('Y', 100), 'small'); +SELECT tag, LENGTH(data) FROM t_nopk_blob ORDER BY 
tag; +tag LENGTH(data) +big 50000 +small 100 +UPDATE t_nopk_blob SET data = REPEAT('Z', 60000) WHERE tag = 'big'; +SELECT tag, LENGTH(data) FROM t_nopk_blob WHERE tag = 'big'; +tag LENGTH(data) +big 60000 +# +# TEST 5: TRUNCATE hidden PK table +# +TRUNCATE TABLE t_nopk; +INSERT INTO t_nopk VALUES (10, 'after_truncate'); +SELECT * FROM t_nopk; +a b +10 after_truncate +# +# Cleanup +# +DROP TABLE t_nopk, t_nopk_idx, t_nopk_blob; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_index_stats.result b/mysql-test/suite/tidesdb/r/tidesdb_index_stats.result new file mode 100644 index 0000000000000..a0126333fde11 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_index_stats.result @@ -0,0 +1,106 @@ +# +# ============================================ +# TEST 1: Index type reporting (issue #78) +# LSM tables should show LSM, not BTREE +# ============================================ +# +CREATE TABLE t_lsm ( +i INT NOT NULL PRIMARY KEY, +y INT, +KEY idx_y (y) +) ENGINE=TIDESDB USE_BTREE=0; +SHOW KEYS FROM t_lsm; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment Ignored +t_lsm 0 PRIMARY 1 i A 2 NULL NULL LSM NO +t_lsm 1 idx_y 1 y A 2 NULL NULL YES LSM NO +DROP TABLE t_lsm; +# +# ============================================ +# TEST 2: BTREE tables should show BTREE +# ============================================ +# +CREATE TABLE t_btree ( +i INT NOT NULL PRIMARY KEY, +y INT, +KEY idx_y (y) +) ENGINE=TIDESDB USE_BTREE=1; +SHOW KEYS FROM t_btree; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment Ignored +t_btree 0 PRIMARY 1 i A 2 NULL NULL BTREE NO +t_btree 1 idx_y 1 y A 2 NULL NULL YES BTREE NO +DROP TABLE t_btree; +# +# ============================================ +# TEST 3: Default (USE_BTREE=0) shows LSM +# ============================================ +# +CREATE TABLE t_default ( +i INT NOT NULL PRIMARY KEY, +y 
INT, +KEY idx_y (y) +) ENGINE=TIDESDB; +SHOW KEYS FROM t_default; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment Ignored +t_default 0 PRIMARY 1 i A 2 NULL NULL LSM NO +t_default 1 idx_y 1 y A 2 NULL NULL YES LSM NO +DROP TABLE t_default; +# +# ============================================ +# TEST 4: ANALYZE TABLE updates rec_per_key +# for non-unique secondary indexes (issue #74) +# ============================================ +# +CREATE TABLE t_stats ( +id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, +k INT NOT NULL, +val VARCHAR(50), +KEY k_idx (k) +) ENGINE=TIDESDB; +# Insert 200 rows with only 2 distinct values for k +SELECT COUNT(*) AS total_rows FROM t_stats; +total_rows +200 +# Before ANALYZE, optimizer may not estimate well +EXPLAIN SELECT * FROM t_stats WHERE k = 0; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t_stats ref k_idx k_idx 4 const 1 +ANALYZE TABLE t_stats; +Table Op Msg_type Msg_text +test.t_stats analyze status Engine-independent statistics collected +test.t_stats analyze Note [TIDESDB] CF 'test__t_stats' total_keys=N data_size=N bytes memtable=N bytes levels=1 read_amp=N cache_hit=N% +test.t_stats analyze Note [TIDESDB] avg_key=N bytes avg_value=N bytes +test.t_stats analyze Note [TIDESDB] level 1 sstables=N size=N bytes keys=N +test.t_stats analyze Note [TIDESDB] idx CF 'test__t_stats__idx_k_idx' keys=N data_size=N bytes levels=1 +test.t_stats analyze Note [TIDESDB] idx 'k_idx' sampled=N distinct=N rec_per_key=N +test.t_stats analyze status OK +# After ANALYZE, the optimizer should estimate ~100 rows for k=0 +EXPLAIN SELECT * FROM t_stats WHERE k = 0; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t_stats ref k_idx k_idx 4 const 2 +DROP TABLE t_stats; +# +# ============================================ +# TEST 5: ANALYZE with highly selective index +# ============================================ +# +CREATE 
TABLE t_stats2 ( +id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, +code INT NOT NULL, +KEY code_idx (code) +) ENGINE=TIDESDB; +ANALYZE TABLE t_stats2; +Table Op Msg_type Msg_text +test.t_stats2 analyze status Engine-independent statistics collected +test.t_stats2 analyze Note [TIDESDB] CF 'test__t_stats2' total_keys=N data_size=N bytes memtable=N bytes levels=1 read_amp=N cache_hit=N% +test.t_stats2 analyze Note [TIDESDB] avg_key=N bytes avg_value=N bytes +test.t_stats2 analyze Note [TIDESDB] level 1 sstables=N size=N bytes keys=N +test.t_stats2 analyze Note [TIDESDB] idx CF 'test__t_stats2__idx_code_idx' keys=N data_size=N bytes levels=1 +test.t_stats2 analyze Note [TIDESDB] idx 'code_idx' sampled=N distinct=N rec_per_key=N +test.t_stats2 analyze status OK +# With 100 distinct values in 100 rows, rec_per_key should be ~1 +EXPLAIN SELECT * FROM t_stats2 WHERE code = 50; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t_stats2 ref code_idx code_idx 4 const 1 Using index +DROP TABLE t_stats2; +# +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_info_schema.result b/mysql-test/suite/tidesdb/r/tidesdb_info_schema.result new file mode 100644 index 0000000000000..fe7a87baf5443 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_info_schema.result @@ -0,0 +1,32 @@ +# ---- setup ---- +CREATE TABLE t_info_schema ( +id INT PRIMARY KEY, +val VARCHAR(200) +) ENGINE=TidesDB; +INSERT INTO t_info_schema VALUES (1, REPEAT('a', 100)); +INSERT INTO t_info_schema VALUES (2, REPEAT('b', 100)); +INSERT INTO t_info_schema VALUES (3, REPEAT('c', 100)); +# ---- data_length must be non-zero ---- +FAIL: DATA_LENGTH is 0 +# ---- table_rows must reflect inserted rows ---- +FAIL: TABLE_ROWS < 3 +# ---- add secondary index and check index_length ---- +ALTER TABLE t_info_schema ADD INDEX idx_val (val); +SELECT COUNT(*) FROM t_info_schema; +COUNT(*) +3 +FAIL: INDEX_LENGTH is 0 +# ---- verify after bulk insert ---- +SELECT COUNT(*) FROM t_info_schema; +COUNT(*) +200 +FAIL: DATA_LENGTH is 0 after bulk insert +# ---- create_time must be non-null ---- +OK: CREATE_TIME is set +# ---- update_time must be non-null after DML ---- +OK: UPDATE_TIME is set +# ---- update_time advances after more DML ---- +INSERT INTO t_info_schema VALUES (9999, 'timestamp_test'); +OK: UPDATE_TIME advanced after INSERT +# ---- cleanup ---- +DROP TABLE t_info_schema; diff --git a/mysql-test/suite/tidesdb/r/tidesdb_insert_conflict.result b/mysql-test/suite/tidesdb/r/tidesdb_insert_conflict.result new file mode 100644 index 0000000000000..207de7da91998 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_insert_conflict.result @@ -0,0 +1,36 @@ +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +# +# Issue #83: INSERT vs INSERT conflict detection +# +CREATE TABLE t ( +a INT NOT NULL PRIMARY KEY, +b INT +) ENGINE=TidesDB; +connect con1, localhost, root,,; +connect con2, localhost, root,,; +# ---- TEST: Two INSERTs with same PK ---- +connection con1; +START TRANSACTION; +INSERT INTO t VALUES (1, 10); 
+connection con2; +START TRANSACTION; +INSERT INTO t VALUES (1, 500); +COMMIT; +connection con1; +# con1 should get conflict error -- con2 committed first +COMMIT; +Got one of the listed errors +connection default; +# con2 wins: b should be 500 +SELECT * FROM t; +a b +1 500 +# Cleanup +connection con1; +disconnect con1; +connection con2; +disconnect con2; +connection default; +DROP TABLE t; +# +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_isolation.result b/mysql-test/suite/tidesdb/r/tidesdb_isolation.result new file mode 100644 index 0000000000000..0445e61238674 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_isolation.result @@ -0,0 +1,115 @@ +# +# ============================================ +# TEST 1: READ COMMITTED - sees committed data +# ============================================ +# +CREATE TABLE t_iso ( +id INT NOT NULL PRIMARY KEY, +val INT +) ENGINE=TIDESDB; +INSERT INTO t_iso VALUES (1, 10); +connect con1, localhost, root,,; +connection con1; +SET TRANSACTION ISOLATION LEVEL READ COMMITTED; +BEGIN; +SELECT * FROM t_iso ORDER BY id; +id val +1 10 +connection default; +INSERT INTO t_iso VALUES (2, 20); +# con1 at READ COMMITTED should see newly committed row +connection con1; +SELECT * FROM t_iso ORDER BY id; +id val +1 10 +2 20 +COMMIT; +disconnect con1; +connection default; +# +# ============================================ +# TEST 2: REPEATABLE READ - snapshot isolation +# ============================================ +# +connect con2, localhost, root,,; +connection con2; +SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; +BEGIN; +SELECT * FROM t_iso ORDER BY id; +id val +1 10 +2 20 +connection default; +INSERT INTO t_iso VALUES (3, 30); +# con2 at REPEATABLE READ should NOT see row 3 +connection con2; +SELECT * FROM t_iso ORDER BY id; +id val +1 10 +2 20 +COMMIT; +# After COMMIT, new transaction should see row 3 +SELECT * FROM t_iso ORDER BY id; +id val +1 10 +2 20 +3 30 +disconnect con2; +connection default; +# +# 
============================================ +# TEST 3: Basic DML at each isolation level +# (verifies the mapping doesn't crash) +# ============================================ +# +SET SESSION TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; +INSERT INTO t_iso VALUES (4, 40); +SELECT * FROM t_iso WHERE id = 4; +id val +4 40 +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +UPDATE t_iso SET val = 41 WHERE id = 4; +SELECT * FROM t_iso WHERE id = 4; +id val +4 41 +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +UPDATE t_iso SET val = 42 WHERE id = 4; +SELECT * FROM t_iso WHERE id = 4; +id val +4 42 +SET SESSION TRANSACTION ISOLATION LEVEL SERIALIZABLE; +DELETE FROM t_iso WHERE id = 4; +SELECT * FROM t_iso ORDER BY id; +id val +1 10 +2 20 +3 30 +# Reset to default +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +DROP TABLE t_iso; +# +# ============================================ +# TEST 4: SNAPSHOT isolation via table option +# (table uses ISOLATION_LEVEL=SNAPSHOT, session +# at REPEATABLE READ should activate SNAPSHOT) +# ============================================ +# +CREATE TABLE t_snap ( +id INT NOT NULL PRIMARY KEY, +val INT +) ENGINE=TIDESDB ISOLATION_LEVEL='SNAPSHOT'; +INSERT INTO t_snap VALUES (1, 100); +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +BEGIN; +SELECT * FROM t_snap ORDER BY id; +id val +1 100 +INSERT INTO t_snap VALUES (2, 200); +SELECT * FROM t_snap ORDER BY id; +id val +1 100 +2 200 +COMMIT; +DROP TABLE t_snap; +# +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_isolation_table_option.result b/mysql-test/suite/tidesdb/r/tidesdb_isolation_table_option.result new file mode 100644 index 0000000000000..5442ac6a3ed3f --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_isolation_table_option.result @@ -0,0 +1,43 @@ +CREATE TABLE t_snap (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +INSERT INTO t_snap VALUES (1, 10); +CREATE TABLE t_rc (id INT PRIMARY KEY, v INT) +ENGINE=TidesDB `ISOLATION_LEVEL`=READ_COMMITTED; +INSERT INTO t_rc VALUES (1, 10); +connect con1, localhost, root,,test; +connection con1; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +# +# default table -- the transaction holds a stable snapshot +# +BEGIN; +SELECT id, v FROM t_snap ORDER BY id; +id v +1 10 +connection default; +INSERT INTO t_snap VALUES (2, 20); +connection con1; +# the snapshot is stable, so the row committed afterwards is unseen +SELECT id, v FROM t_snap ORDER BY id; +id v +1 10 +COMMIT; +# +# ISOLATION_LEVEL=READ_COMMITTED -- the transaction sees fresh commits +# +BEGIN; +SELECT id, v FROM t_rc ORDER BY id; +id v +1 10 +connection default; +INSERT INTO t_rc VALUES (2, 20); +connection con1; +# read committed sees the row committed after the transaction began +SELECT id, v FROM t_rc ORDER BY id; +id v +1 10 +2 20 +COMMIT; +connection default; +disconnect con1; +DROP TABLE t_snap, t_rc; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_json.result b/mysql-test/suite/tidesdb/r/tidesdb_json.result new file mode 100644 index 0000000000000..a163e62673943 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_json.result @@ -0,0 +1,50 @@ +# +# ============================================ +# TEST: JSON querying + generated column indexing +# ============================================ +# +CREATE TABLE t_json ( +id INT NOT NULL PRIMARY KEY, +data LONGTEXT, +name VARCHAR(50) AS (JSON_VALUE(data, '$.name')) PERSISTENT, +age INT AS (JSON_VALUE(data, '$.age')) PERSISTENT, +KEY idx_name (name), +KEY idx_age (age) +) ENGINE=TIDESDB; +INSERT INTO t_json (id, data) VALUES +(1, '{"name":"Alice","age":30,"tags":["admin","dev"]}'), +(2, '{"name":"Bob","age":25,"tags":["dev"]}'), +(3, '{"name":"Carol","age":40,"tags":["finance"]}'); +# Basic JSON extraction +SELECT id, JSON_VALUE(data, '$.name') AS jname, JSON_VALUE(data, '$.age') AS jage +FROM t_json ORDER BY id; +id jname jage +1 Alice 30 +2 Bob 25 +3 Carol 40 +# Generated columns reflect JSON paths +SELECT id, name, age FROM t_json ORDER BY id; +id name age +1 Alice 30 +2 Bob 25 +3 Carol 40 +# Filter using generated columns (indexable JSON paths) +SELECT id, name, age FROM t_json WHERE name='Alice' ORDER BY id; +id name age +1 Alice 30 +SELECT id, name, age FROM t_json WHERE age >= 30 ORDER BY id; +id name age +1 Alice 30 +3 Carol 40 +# Filter using JSON function (non-indexed expression) +SELECT id FROM t_json WHERE JSON_CONTAINS(data, '"admin"', '$.tags') ORDER BY id; +id +1 +# Update JSON and verify generated columns update +UPDATE t_json SET data = JSON_SET(data, '$.age', 31) WHERE id = 1; +SELECT id, name, age FROM t_json WHERE id = 1; +id name age +1 Alice 31 +DROP TABLE t_json; +# +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_large_blob.result b/mysql-test/suite/tidesdb/r/tidesdb_large_blob.result new file mode 100644 index 0000000000000..456e399717ba3 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_large_blob.result @@ -0,0 +1,63 @@ +# +# TEST 1: Large TEXT insert and retrieval +# +CREATE TABLE t_blob (id INT PRIMARY KEY, data LONGTEXT) ENGINE=TidesDB; +INSERT INTO t_blob VALUES (1, REPEAT('A', 1000)); +INSERT INTO t_blob VALUES (2, REPEAT('B', 65536)); +INSERT INTO t_blob VALUES (3, REPEAT('C', 262144)); +SELECT id, LENGTH(data) FROM t_blob ORDER BY id; +id LENGTH(data) +1 1000 +2 65536 +3 262144 +# +# TEST 2: Large BLOB with secondary index +# +CREATE TABLE t_blob_idx ( +id INT PRIMARY KEY, +cat INT, +payload LONGBLOB, +KEY(cat) +) ENGINE=TidesDB; +INSERT INTO t_blob_idx VALUES (1, 10, REPEAT('X', 100000)); +INSERT INTO t_blob_idx VALUES (2, 20, REPEAT('Y', 100000)); +INSERT INTO t_blob_idx VALUES (3, 10, REPEAT('Z', 100000)); +SELECT id, LENGTH(payload) FROM t_blob_idx WHERE cat = 10 ORDER BY id; +id LENGTH(payload) +1 100000 +3 100000 +# +# TEST 3: UPDATE large BLOB +# +UPDATE t_blob SET data = REPEAT('D', 500000) WHERE id = 2; +SELECT id, LENGTH(data) FROM t_blob WHERE id = 2; +id LENGTH(data) +2 500000 +# +# TEST 4: DELETE and re-insert large BLOB +# +DELETE FROM t_blob WHERE id = 3; +INSERT INTO t_blob VALUES (3, REPEAT('E', 131072)); +SELECT id, LENGTH(data) FROM t_blob ORDER BY id; +id LENGTH(data) +1 1000 +2 500000 +3 131072 +# +# TEST 5: Multiple BLOB columns +# +CREATE TABLE t_multi_blob ( +id INT PRIMARY KEY, +a LONGBLOB, +b LONGTEXT, +c MEDIUMBLOB +) ENGINE=TidesDB; +INSERT INTO t_multi_blob VALUES (1, REPEAT('A', 80000), REPEAT('B', 80000), REPEAT('C', 40000)); +SELECT id, LENGTH(a), LENGTH(b), LENGTH(c) FROM t_multi_blob; +id LENGTH(a) LENGTH(b) LENGTH(c) +1 80000 80000 40000 +# +# Cleanup +# +DROP TABLE t_blob, t_blob_idx, t_multi_blob; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_load_data.result b/mysql-test/suite/tidesdb/r/tidesdb_load_data.result new file mode 100644 index 0000000000000..54c500fe37adc --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_load_data.result @@ -0,0 +1,65 @@ +# +# TEST 1: Multi-row INSERT (triggers bulk insert path) +# +CREATE TABLE t_bulk (id INT PRIMARY KEY, name VARCHAR(100), val INT) ENGINE=TidesDB; +INSERT INTO t_bulk VALUES +(1, 'alpha', 100), (2, 'beta', 200), (3, 'gamma', 300), +(4, 'delta', 400), (5, 'epsilon', 500); +SELECT * FROM t_bulk ORDER BY id; +id name val +1 alpha 100 +2 beta 200 +3 gamma 300 +4 delta 400 +5 epsilon 500 +# +# TEST 2: INSERT ... SELECT bulk load +# +CREATE TABLE t_source (id INT PRIMARY KEY, v VARCHAR(50)) ENGINE=TidesDB; +INSERT INTO t_source VALUES (1,'a'), (2,'b'), (3,'c'), (4,'d'), (5,'e'), +(6,'f'), (7,'g'), (8,'h'), (9,'i'), (10,'j'); +CREATE TABLE t_dest (id INT PRIMARY KEY, v VARCHAR(50)) ENGINE=TidesDB; +INSERT INTO t_dest SELECT * FROM t_source; +SELECT COUNT(*) FROM t_dest; +COUNT(*) +10 +# +# TEST 3: Large bulk insert (200+ rows, triggers batch commit) +# +CREATE TABLE t_large (id INT PRIMARY KEY, payload VARCHAR(200)) ENGINE=TidesDB; +SELECT COUNT(*) AS total FROM t_large; +total +200 +SELECT MIN(id), MAX(id) FROM t_large; +MIN(id) MAX(id) +1 200 +# +# TEST 4: Bulk insert with secondary index +# +CREATE TABLE t_bulk_idx (id INT PRIMARY KEY, cat INT, KEY(cat)) ENGINE=TidesDB; +INSERT INTO t_bulk_idx VALUES +(1, 10), (2, 20), (3, 10), (4, 30), (5, 10), +(6, 20), (7, 10), (8, 30), (9, 10), (10, 20); +SELECT COUNT(*) FROM t_bulk_idx WHERE cat = 10; +COUNT(*) +5 +SELECT COUNT(*) FROM t_bulk_idx WHERE cat = 20; +COUNT(*) +3 +# +# TEST 5: INSERT ... 
SELECT between TidesDB tables +# +CREATE TABLE t_src2 (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +INSERT INTO t_src2 VALUES (1,10), (2,20), (3,30); +CREATE TABLE t_dst2 (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +INSERT INTO t_dst2 SELECT * FROM t_src2; +SELECT * FROM t_dst2 ORDER BY id; +id v +1 10 +2 20 +3 30 +# +# Cleanup +# +DROP TABLE t_bulk, t_source, t_dest, t_large, t_bulk_idx, t_src2, t_dst2; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_max_concurrent_flushes.result b/mysql-test/suite/tidesdb/r/tidesdb_max_concurrent_flushes.result new file mode 100644 index 0000000000000..a5f21f01697b1 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_max_concurrent_flushes.result @@ -0,0 +1,8 @@ +call mtr.add_suppression("\\[TIDESDB\\] tidesdb_max_concurrent_flushes=.* is lower than tidesdb_flush_threads="); +SELECT @@global.tidesdb_flush_threads AS flush_threads, +@@global.tidesdb_max_concurrent_flushes AS max_concurrent_flushes; +flush_threads max_concurrent_flushes +4 2 +# the server error log carries the misalignment warning +FOUND 1 /tidesdb_max_concurrent_flushes=2 is lower than tidesdb_flush_threads=4/ in mysqld.1.err +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_mixed_engine.result b/mysql-test/suite/tidesdb/r/tidesdb_mixed_engine.result new file mode 100644 index 0000000000000..4c30cd055eea0 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_mixed_engine.result @@ -0,0 +1,75 @@ +# +# TEST 1: Cross-engine transaction commit +# +CREATE TABLE t_tdb (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +CREATE TABLE t_inn (id INT PRIMARY KEY, v INT) ENGINE=InnoDB; +BEGIN; +INSERT INTO t_tdb VALUES (1, 100); +INSERT INTO t_inn VALUES (1, 100); +INSERT INTO t_tdb VALUES (2, 200); +INSERT INTO t_inn VALUES (2, 200); +COMMIT; +SELECT * FROM t_tdb ORDER BY id; +id v +1 100 +2 200 +SELECT * FROM t_inn ORDER BY id; +id v +1 100 +2 200 +# +# TEST 2: Cross-engine transaction rollback +# +BEGIN; +INSERT INTO t_tdb VALUES (3, 300); +INSERT INTO t_inn VALUES (3, 300); +ROLLBACK; +SELECT COUNT(*) AS tdb_count FROM t_tdb; +tdb_count +2 +SELECT COUNT(*) AS inn_count FROM t_inn; +inn_count +2 +# +# TEST 3: Cross-engine JOIN query +# +INSERT INTO t_tdb VALUES (3, 300); +INSERT INTO t_inn VALUES (3, 999); +SELECT a.id, a.v AS tdb_val, b.v AS inn_val +FROM t_tdb a JOIN t_inn b ON a.id = b.id +ORDER BY a.id; +id tdb_val inn_val +1 100 100 +2 200 200 +3 300 999 +# +# TEST 4: INSERT ... SELECT across engines +# +CREATE TABLE t_tdb2 (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +INSERT INTO t_tdb2 SELECT * FROM t_inn; +SELECT * FROM t_tdb2 ORDER BY id; +id v +1 100 +2 200 +3 999 +CREATE TABLE t_inn2 (id INT PRIMARY KEY, v INT) ENGINE=InnoDB; +INSERT INTO t_inn2 SELECT * FROM t_tdb; +SELECT * FROM t_inn2 ORDER BY id; +id v +1 100 +2 200 +3 300 +# +# TEST 5: Multi-table UPDATE across engines +# +UPDATE t_tdb a JOIN t_inn b ON a.id = b.id +SET a.v = a.v + 1, b.v = b.v + 1 +WHERE a.id = 1; +SELECT a.v AS tdb_v, b.v AS inn_v FROM t_tdb a, t_inn b WHERE a.id = 1 AND b.id = 1; +tdb_v inn_v +101 101 +# +# Cleanup +# +DROP TABLE t_tdb, t_inn, t_tdb2, t_inn2; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_mrr.result b/mysql-test/suite/tidesdb/r/tidesdb_mrr.result new file mode 100644 index 0000000000000..bea30b41b2adb --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_mrr.result @@ -0,0 +1,85 @@ +SET @saved_opt_switch = @@optimizer_switch; +SET optimizer_switch = 'mrr=on,mrr_sort_keys=on,mrr_cost_based=off'; +# +# TEST 1: IN (...) on PK (clustered-style point lookups) +# +CREATE TABLE t_pk (id INT PRIMARY KEY, v VARCHAR(20)) ENGINE=TidesDB; +INSERT INTO t_pk VALUES (1,'a'),(2,'b'),(3,'c'),(4,'d'),(5,'e'), +(6,'f'),(7,'g'),(8,'h'),(9,'i'),(10,'j'); +# Confirm the optimizer actually picks Rowid-ordered scan (MRR). +EXPLAIN SELECT * FROM t_pk WHERE id IN (7, 2, 9, 3, 5); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t_pk range PRIMARY # 4 NULL 2 Using where +# Unsorted IN-list; MRR must still return the right rows. +SELECT * FROM t_pk WHERE id IN (7, 2, 9, 3, 5) ORDER BY id; +id v +2 b +3 c +5 e +7 g +9 i +# Mix of hits and misses -- missing IDs are silently skipped. +SELECT * FROM t_pk WHERE id IN (11, 4, 99, 1, 42) ORDER BY id; +id v +1 a +4 d +# Single-element IN is still routed through MRR. +SELECT * FROM t_pk WHERE id IN (6); +id v +6 f +# +# TEST 2: IN (...) 
on a unique secondary index +# +CREATE TABLE t_uk ( +id INT PRIMARY KEY, +code INT, +v VARCHAR(20), +UNIQUE KEY u_code (code) +) ENGINE=TidesDB; +INSERT INTO t_uk VALUES (1,100,'a'),(2,200,'b'),(3,300,'c'),(4,400,'d'),(5,500,'e'); +SELECT * FROM t_uk WHERE code IN (300, 100, 500) ORDER BY code; +id code v +1 100 a +3 300 c +5 500 e +SELECT * FROM t_uk WHERE code IN (999, 200, 111) ORDER BY code; +id code v +2 200 b +# +# TEST 3: Large unsorted IN-list (sort-then-seek should still be correct) +# +CREATE TABLE t_big (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +SELECT COUNT(*), MIN(id), MAX(id) FROM t_big +WHERE id IN (37, 199, 2, 88, 150, 1, 73, 112, 200, 5); +COUNT(*) MIN(id) MAX(id) +10 1 200 +# EXPLAIN should mention MRR in Extra for a 10-value IN on a 200-row table. +EXPLAIN SELECT * FROM t_big +WHERE id IN (37, 199, 2, 88, 150, 1, 73, 112, 200, 5); +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t_big range PRIMARY # 4 NULL # Using where +# +# TEST 4: Result is consistent with / without MRR +# +SET optimizer_switch = 'mrr=off'; +SELECT * FROM t_pk WHERE id IN (7, 2, 9, 3, 5) ORDER BY id; +id v +2 b +3 c +5 e +7 g +9 i +SET optimizer_switch = 'mrr=on,mrr_sort_keys=on,mrr_cost_based=off'; +SELECT * FROM t_pk WHERE id IN (7, 2, 9, 3, 5) ORDER BY id; +id v +2 b +3 c +5 e +7 g +9 i +# +# Cleanup +# +DROP TABLE t_pk, t_uk, t_big; +SET optimizer_switch = @saved_opt_switch; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_mvcc_concurrent_update.result b/mysql-test/suite/tidesdb/r/tidesdb_mvcc_concurrent_update.result new file mode 100644 index 0000000000000..d2e57a2e0a6dd --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_mvcc_concurrent_update.result @@ -0,0 +1,32 @@ +call mtr.add_suppression("\\[TIDESDB\\].*hton_commit: tidesdb_txn_commit returned"); +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +DROP TABLE IF EXISTS district; +DROP TABLE IF EXISTS txn_log; +CREATE TABLE district ( +d_w_id INT NOT NULL, +d_id INT NOT NULL, +d_next_o_id INT NOT NULL, +PRIMARY KEY (d_w_id, d_id) +) ENGINE=TidesDB; +CREATE TABLE txn_log ( +id BIGINT NOT NULL AUTO_INCREMENT, +vu INT NOT NULL, +ts BIGINT NOT NULL, +PRIMARY KEY (id) +) ENGINE=TidesDB; +INSERT INTO district VALUES (1, 1, 3001); +SELECT +d_next_o_id - 3001 AS counter_delta, +(SELECT COUNT(*) FROM txn_log) AS commits_logged, +CASE +WHEN d_next_o_id - 3001 = (SELECT COUNT(*) FROM txn_log) +THEN 'OK' + WHEN d_next_o_id - 3001 < (SELECT COUNT(*) FROM txn_log) +THEN 'LOST_UPDATE' + ELSE 'PHANTOM_INCREMENT' + END AS verdict +FROM district WHERE d_w_id=1 AND d_id=1; +counter_delta commits_logged verdict +# # OK +DROP TABLE district; +DROP TABLE txn_log; diff --git a/mysql-test/suite/tidesdb/r/tidesdb_object_store.result b/mysql-test/suite/tidesdb/r/tidesdb_object_store.result new file mode 100644 index 0000000000000..6d5c57c42066d --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_object_store.result @@ -0,0 +1,85 @@ +# +# TEST 1: Basic CRUD over object store +# +CREATE TABLE t_obj ( +id INT NOT NULL PRIMARY KEY, +name VARCHAR(100), +data TEXT +) ENGINE=TidesDB; +INSERT INTO t_obj VALUES (1, 'alpha', REPEAT('A', 500)); +INSERT INTO t_obj VALUES (2, 'beta', REPEAT('B', 500)); +INSERT INTO t_obj VALUES (3, 'gamma', REPEAT('C', 500)); +INSERT INTO t_obj VALUES (4, 'delta', REPEAT('D', 500)); +INSERT INTO t_obj VALUES (5, 'epsilon', REPEAT('E', 500)); +SELECT id, name, 
LENGTH(data) FROM t_obj ORDER BY id; +id name LENGTH(data) +1 alpha 500 +2 beta 500 +3 gamma 500 +4 delta 500 +5 epsilon 500 +# +# TEST 2: UPDATE and DELETE +# +UPDATE t_obj SET name = 'ALPHA', data = REPEAT('X', 1000) WHERE id = 1; +DELETE FROM t_obj WHERE id = 3; +SELECT id, name, LENGTH(data) FROM t_obj ORDER BY id; +id name LENGTH(data) +1 ALPHA 1000 +2 beta 500 +4 delta 500 +5 epsilon 500 +# +# TEST 3: Secondary index over object store +# +CREATE TABLE t_idx ( +id INT NOT NULL PRIMARY KEY, +category INT NOT NULL, +val VARCHAR(200), +KEY idx_cat (category) +) ENGINE=TidesDB; +INSERT INTO t_idx VALUES (1, 10, 'widget'), (2, 20, 'gadget'), (3, 10, 'sprocket'); +INSERT INTO t_idx VALUES (4, 30, 'gizmo'), (5, 10, 'doohickey'); +SELECT id, val FROM t_idx WHERE category = 10 ORDER BY id; +id val +1 widget +3 sprocket +5 doohickey +# +# TEST 4: Transaction commit and rollback +# +BEGIN; +INSERT INTO t_obj VALUES (10, 'txn_test', 'committed'); +COMMIT; +BEGIN; +INSERT INTO t_obj VALUES (11, 'txn_rollback', 'should_not_exist'); +ROLLBACK; +SELECT id, name FROM t_obj WHERE id >= 10 ORDER BY id; +id name +10 txn_test +# +# TEST 5: Bulk insert (triggers flush to SSTables -> S3 upload) +# +CREATE TABLE t_bulk ( +id INT NOT NULL PRIMARY KEY, +payload VARCHAR(500) +) ENGINE=TidesDB; +SELECT COUNT(*) AS bulk_count FROM t_bulk; +bulk_count +200 +# +# TEST 6: OPTIMIZE TABLE (triggers compaction -> S3 re-upload) +# +OPTIMIZE TABLE t_bulk; +Table Op Msg_type Msg_text +test.t_bulk optimize status OK +SELECT COUNT(*) AS after_optimize FROM t_bulk; +after_optimize +200 +# +# Cleanup +# +DROP TABLE t_obj; +DROP TABLE t_idx; +DROP TABLE t_bulk; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_online_ddl.result b/mysql-test/suite/tidesdb/r/tidesdb_online_ddl.result new file mode 100644 index 0000000000000..362fc134110d8 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_online_ddl.result @@ -0,0 +1,188 @@ +# ---- Setup ---- +CREATE TABLE t_ddl ( +id INT PRIMARY KEY, +a INT, +b VARCHAR(100), +c INT DEFAULT 0 +) ENGINE=TidesDB; +INSERT INTO t_ddl VALUES (1, 10, 'alpha', 100); +INSERT INTO t_ddl VALUES (2, 20, 'beta', 200); +INSERT INTO t_ddl VALUES (3, 30, 'gamma', 300); +INSERT INTO t_ddl VALUES (4, 10, 'delta', 400); +INSERT INTO t_ddl VALUES (5, 50, 'epsilon', 500); +# ---- INSTANT: change column default ---- +ALTER TABLE t_ddl ALTER COLUMN c SET DEFAULT 999, ALGORITHM=INSTANT; +INSERT INTO t_ddl (id, a, b) VALUES (6, 60, 'zeta'); +SELECT id, c FROM t_ddl WHERE id = 6; +id c +6 999 +# ---- INSTANT: rename column ---- +ALTER TABLE t_ddl CHANGE b b_name VARCHAR(100), ALGORITHM=INSTANT; +SELECT id, b_name FROM t_ddl WHERE id = 1; +id b_name +1 alpha +# ---- INSTANT: change table option (SYNC_MODE) ---- +ALTER TABLE t_ddl SYNC_MODE='NONE', ALGORITHM=INSTANT; +SHOW CREATE TABLE t_ddl; +Table Create Table +t_ddl CREATE TABLE `t_ddl` ( + `id` int(11) NOT NULL, + `a` int(11) DEFAULT NULL, + `b_name` varchar(100) DEFAULT NULL, + `c` int(11) DEFAULT 999, + PRIMARY KEY (`id`) +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `SYNC_MODE`='NONE' +# ---- INPLACE: add secondary index ---- +ALTER TABLE t_ddl ADD INDEX idx_a (a), ALGORITHM=INPLACE; +SHOW INDEX FROM t_ddl; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment Ignored +t_ddl 0 PRIMARY 1 id A 2 NULL NULL LSM NO +t_ddl 1 idx_a 1 a A 2 NULL NULL YES LSM NO +# Verify index is usable +SELECT id, a FROM t_ddl WHERE a = 10 ORDER BY id; +id a +1 10 +4 10 +SELECT id, a FROM t_ddl WHERE a >= 30 ORDER BY a; +id a +3 30 +5 50 +6 60 +# ---- INPLACE: add another index ---- 
+ALTER TABLE t_ddl ADD INDEX idx_c (c), ALGORITHM=INPLACE; +SHOW INDEX FROM t_ddl; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment Ignored +t_ddl 0 PRIMARY 1 id A 2 NULL NULL LSM NO +t_ddl 1 idx_a 1 a A 2 NULL NULL YES LSM NO +t_ddl 1 idx_c 1 c A 2 NULL NULL YES LSM NO +EXPLAIN SELECT id, c FROM t_ddl WHERE c = 200; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t_ddl ref idx_c idx_c 5 const 1 Using index +SELECT id, c FROM t_ddl WHERE c = 200; +id c +2 200 +# ---- INPLACE: drop index ---- +ALTER TABLE t_ddl DROP INDEX idx_a, ALGORITHM=INPLACE; +SHOW INDEX FROM t_ddl; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment Ignored +t_ddl 0 PRIMARY 1 id A 2 NULL NULL LSM NO +t_ddl 1 idx_c 1 c A 2 NULL NULL YES LSM NO +# Verify remaining index still works +SELECT id, c FROM t_ddl WHERE c = 300; +id c +3 300 +# ---- INPLACE: add + drop in one statement ---- +ALTER TABLE t_ddl ADD INDEX idx_a2 (a), DROP INDEX idx_c, ALGORITHM=INPLACE; +SHOW INDEX FROM t_ddl; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment Ignored +t_ddl 0 PRIMARY 1 id A 2 NULL NULL LSM NO +t_ddl 1 idx_a2 1 a A 2 NULL NULL YES LSM NO +EXPLAIN SELECT id, a FROM t_ddl WHERE a = 20; +id select_type table type possible_keys key key_len ref rows Extra +1 SIMPLE t_ddl ref idx_a2 idx_a2 5 const 1 Using index +SELECT id, a FROM t_ddl WHERE a = 20; +id a +2 20 +# ---- INSTANT: add column (NOT NULL DEFAULT) ---- +ALTER TABLE t_ddl ADD COLUMN d INT NOT NULL DEFAULT 0, ALGORITHM=INSTANT; +SELECT id, d FROM t_ddl WHERE id = 1; +id d +1 0 +# ---- Verify old rows readable after ADD COLUMN ---- +SELECT id, a, b_name, c, d FROM t_ddl ORDER BY id; +id a b_name c d +1 10 alpha 100 0 +2 20 beta 200 0 +3 30 gamma 300 0 +4 10 delta 400 0 +5 50 epsilon 500 0 +6 60 
zeta 999 0 +# ---- Insert with new schema and verify ---- +INSERT INTO t_ddl VALUES (7, 70, 'eta', 700, 42); +SELECT id, d FROM t_ddl WHERE id IN (1, 7) ORDER BY id; +id d +1 0 +7 42 +# ---- INSTANT: drop column ---- +ALTER TABLE t_ddl DROP COLUMN d, ALGORITHM=INSTANT; +SELECT * FROM t_ddl WHERE id = 1; +id a b_name c +1 10 alpha 100 +# ---- Verify all rows readable after DROP COLUMN ---- +SELECT id, a, b_name, c FROM t_ddl ORDER BY id; +id a b_name c +1 10 alpha 100 +2 20 beta 200 +3 30 gamma 300 +4 10 delta 400 +5 50 epsilon 500 +6 60 zeta 999 +7 70 eta 700 +# ---- Cleanup ---- +DROP TABLE t_ddl; +# ---- Test with data and hidden PK (no explicit PK) ---- +CREATE TABLE t_nopk ( +a INT, +b VARCHAR(50) +) ENGINE=TidesDB; +INSERT INTO t_nopk VALUES (1, 'one'); +INSERT INTO t_nopk VALUES (2, 'two'); +INSERT INTO t_nopk VALUES (3, 'three'); +# Add index on hidden-PK table +ALTER TABLE t_nopk ADD INDEX idx_a (a), ALGORITHM=INPLACE; +SELECT a, b FROM t_nopk WHERE a = 2; +a b +2 two +# Drop it +ALTER TABLE t_nopk DROP INDEX idx_a, ALGORITHM=INPLACE; +DROP TABLE t_nopk; +# ---- ADD UNIQUE must reject duplicates ---- +CREATE TABLE t_dup ( +i INT NOT NULL, +j INT NOT NULL DEFAULT 0 +) ENGINE=TidesDB; +INSERT INTO t_dup VALUES (1, 0); +INSERT INTO t_dup VALUES (2, 0); +SELECT * FROM t_dup ORDER BY i; +i j +1 0 +2 0 +ALTER TABLE t_dup ADD UNIQUE unq_j (j); +ERROR 23000: Duplicate entry '0' for key 'unq_j' +SELECT * FROM t_dup ORDER BY i; +i j +1 0 +2 0 +SELECT COUNT(*) FROM t_dup; +COUNT(*) +2 +DROP TABLE t_dup; +# ---- ADD FULLTEXT must back-fill pre-existing rows ---- +CREATE TABLE t_ft ( +id INT PRIMARY KEY, +body VARCHAR(200) +) ENGINE=TidesDB; +INSERT INTO t_ft VALUES (1, 'tides db rocks'), (2, 'sql plugin lives'), (3, 'tides again'); +ALTER TABLE t_ft ADD FULLTEXT (body), ALGORITHM=INPLACE; +ERROR 0A000: ALGORITHM=INPLACE is not supported. Reason: TidesDB cannot add FULLTEXT index inplace. 
Try ALGORITHM=COPY +ALTER TABLE t_ft ADD FULLTEXT (body); +SELECT id FROM t_ft WHERE MATCH(body) AGAINST('tides') ORDER BY id; +id +1 +3 +DROP TABLE t_ft; +# ---- ADD SPATIAL must back-fill pre-existing rows ---- +CREATE TABLE t_sp ( +id INT PRIMARY KEY, +g GEOMETRY NOT NULL +) ENGINE=TidesDB; +INSERT INTO t_sp VALUES (1, ST_GeomFromText('POINT(0 0)')); +INSERT INTO t_sp VALUES (2, ST_GeomFromText('POINT(10 10)')); +ALTER TABLE t_sp ADD SPATIAL INDEX (g), ALGORITHM=INPLACE; +ERROR 0A000: ALGORITHM=INPLACE is not supported. Reason: TidesDB cannot add SPATIAL index inplace. Try ALGORITHM=COPY +ALTER TABLE t_sp ADD SPATIAL INDEX (g); +SELECT id FROM t_sp WHERE MBRWithin(g, ST_GeomFromText('POLYGON((-1 -1, -1 5, 5 5, 5 -1, -1 -1))')) +ORDER BY id; +id +1 +DROP TABLE t_sp; diff --git a/mysql-test/suite/tidesdb/r/tidesdb_options.result b/mysql-test/suite/tidesdb/r/tidesdb_options.result new file mode 100644 index 0000000000000..d6e6b672a71d9 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_options.result @@ -0,0 +1,258 @@ +# +# === Setup: install the TIDESDB engine plugin === +# +# +# ============================================ +# TEST 1: System variables - verify defaults +# ============================================ +# +SHOW VARIABLES LIKE 'tidesdb_flush_threads'; +Variable_name Value +tidesdb_flush_threads 4 +SHOW VARIABLES LIKE 'tidesdb_compaction_threads'; +Variable_name Value +tidesdb_compaction_threads 4 +SHOW VARIABLES LIKE 'tidesdb_log_level'; +Variable_name Value +tidesdb_log_level DEBUG +SHOW VARIABLES LIKE 'tidesdb_block_cache_size'; +Variable_name Value +tidesdb_block_cache_size 268435456 +SHOW VARIABLES LIKE 'tidesdb_max_open_sstables'; +Variable_name Value +tidesdb_max_open_sstables 256 +SHOW VARIABLES LIKE 'tidesdb_max_memory_usage'; +Variable_name Value +tidesdb_max_memory_usage 0 +# +# ============================================ +# TEST 2: CREATE TABLE with default options +# ============================================ +# +CREATE TABLE 
t_defaults (id INT, val VARCHAR(50)) ENGINE=TIDESDB; +SHOW CREATE TABLE t_defaults; +Table Create Table +t_defaults CREATE TABLE `t_defaults` ( + `id` int(11) DEFAULT NULL, + `val` varchar(50) DEFAULT NULL +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci +INSERT INTO t_defaults VALUES (1, 'default_opts'); +SELECT * FROM t_defaults; +id val +1 default_opts +DROP TABLE t_defaults; +# +# ============================================ +# TEST 3: CREATE TABLE with custom compression +# ============================================ +# +CREATE TABLE t_none (id INT, val VARCHAR(50)) ENGINE=TIDESDB COMPRESSION='NONE'; +SHOW CREATE TABLE t_none; +Table Create Table +t_none CREATE TABLE `t_none` ( + `id` int(11) DEFAULT NULL, + `val` varchar(50) DEFAULT NULL +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `COMPRESSION`='NONE' +INSERT INTO t_none VALUES (1, 'no compression'); +SELECT * FROM t_none; +id val +1 no compression +DROP TABLE t_none; +CREATE TABLE t_zstd (id INT, val VARCHAR(50)) ENGINE=TIDESDB COMPRESSION='ZSTD'; +SHOW CREATE TABLE t_zstd; +Table Create Table +t_zstd CREATE TABLE `t_zstd` ( + `id` int(11) DEFAULT NULL, + `val` varchar(50) DEFAULT NULL +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `COMPRESSION`='ZSTD' +INSERT INTO t_zstd VALUES (1, 'zstd compressed'); +SELECT * FROM t_zstd; +id val +1 zstd compressed +DROP TABLE t_zstd; +# +# ============================================ +# TEST 4: CREATE TABLE with custom bloom filter +# ============================================ +# +CREATE TABLE t_nobloom (id INT, val VARCHAR(50)) ENGINE=TIDESDB BLOOM_FILTER=0; +SHOW CREATE TABLE t_nobloom; +Table Create Table +t_nobloom CREATE TABLE `t_nobloom` ( + `id` int(11) DEFAULT NULL, + `val` varchar(50) DEFAULT NULL +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `BLOOM_FILTER`=0 +INSERT INTO t_nobloom VALUES (1, 'no bloom'); +SELECT * FROM t_nobloom; +id val +1 no bloom +DROP TABLE 
t_nobloom; +CREATE TABLE t_lowfpr (id INT, val VARCHAR(50)) ENGINE=TIDESDB BLOOM_FPR=10; +SHOW CREATE TABLE t_lowfpr; +Table Create Table +t_lowfpr CREATE TABLE `t_lowfpr` ( + `id` int(11) DEFAULT NULL, + `val` varchar(50) DEFAULT NULL +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `BLOOM_FPR`=10 +INSERT INTO t_lowfpr VALUES (1, 'low fpr 0.1%'); +SELECT * FROM t_lowfpr; +id val +1 low fpr 0.1% +DROP TABLE t_lowfpr; +# +# ============================================ +# TEST 5: CREATE TABLE with custom write buffer +# ============================================ +# +CREATE TABLE t_bigbuf (id INT, val VARCHAR(50)) ENGINE=TIDESDB WRITE_BUFFER_SIZE=16777216; +SHOW CREATE TABLE t_bigbuf; +Table Create Table +t_bigbuf CREATE TABLE `t_bigbuf` ( + `id` int(11) DEFAULT NULL, + `val` varchar(50) DEFAULT NULL +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `WRITE_BUFFER_SIZE`=16777216 +INSERT INTO t_bigbuf VALUES (1, '16MB write buffer'); +SELECT * FROM t_bigbuf; +id val +1 16MB write buffer +DROP TABLE t_bigbuf; +# +# ============================================ +# TEST 6: CREATE TABLE with sync mode options +# ============================================ +# +CREATE TABLE t_syncnone (id INT) ENGINE=TIDESDB SYNC_MODE='NONE'; +Warnings: +Warning 1105 [TIDESDB] Table SYNC_MODE=NONE governs SSTable file sync only. 
Under tidesdb_unified_memtable=ON the shared WAL is fsynced according to tidesdb_unified_memtable_sync_mode=FULL, so the table option does not change WAL durability for this table +SHOW CREATE TABLE t_syncnone; +Table Create Table +t_syncnone CREATE TABLE `t_syncnone` ( + `id` int(11) DEFAULT NULL +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `SYNC_MODE`='NONE' +INSERT INTO t_syncnone VALUES (1); +SELECT * FROM t_syncnone; +id +1 +DROP TABLE t_syncnone; +CREATE TABLE t_syncint (id INT) ENGINE=TIDESDB SYNC_MODE='INTERVAL' SYNC_INTERVAL_US=500000; +Warnings: +Warning 1105 [TIDESDB] Table SYNC_MODE=INTERVAL governs SSTable file sync only. Under tidesdb_unified_memtable=ON the shared WAL is fsynced according to tidesdb_unified_memtable_sync_mode=FULL, so the table option does not change WAL durability for this table +SHOW CREATE TABLE t_syncint; +Table Create Table +t_syncint CREATE TABLE `t_syncint` ( + `id` int(11) DEFAULT NULL +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `SYNC_MODE`='INTERVAL' `SYNC_INTERVAL_US`=500000 +INSERT INTO t_syncint VALUES (1); +SELECT * FROM t_syncint; +id +1 +DROP TABLE t_syncint; +# +# ============================================ +# TEST 7: CREATE TABLE with isolation level +# ============================================ +# +CREATE TABLE t_rc (id INT, val VARCHAR(50)) ENGINE=TIDESDB ISOLATION_LEVEL='READ_COMMITTED'; +SHOW CREATE TABLE t_rc; +Table Create Table +t_rc CREATE TABLE `t_rc` ( + `id` int(11) DEFAULT NULL, + `val` varchar(50) DEFAULT NULL +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `ISOLATION_LEVEL`='READ_COMMITTED' +INSERT INTO t_rc VALUES (1, 'read committed'); +SELECT * FROM t_rc; +id val +1 read committed +DROP TABLE t_rc; +CREATE TABLE t_ser (id INT, val VARCHAR(50)) ENGINE=TIDESDB ISOLATION_LEVEL='SERIALIZABLE'; +SHOW CREATE TABLE t_ser; +Table Create Table +t_ser CREATE TABLE `t_ser` ( + `id` int(11) DEFAULT NULL, + `val` varchar(50) DEFAULT NULL +) 
ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `ISOLATION_LEVEL`='SERIALIZABLE' +INSERT INTO t_ser VALUES (1, 'serializable'); +SELECT * FROM t_ser; +id val +1 serializable +DROP TABLE t_ser; +# +# ============================================ +# TEST 8: CREATE TABLE with B+tree format +# ============================================ +# +CREATE TABLE t_btree (id INT, val VARCHAR(50)) ENGINE=TIDESDB USE_BTREE=1; +SHOW CREATE TABLE t_btree; +Table Create Table +t_btree CREATE TABLE `t_btree` ( + `id` int(11) DEFAULT NULL, + `val` varchar(50) DEFAULT NULL +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `USE_BTREE`=1 +INSERT INTO t_btree VALUES (1, 'btree format'); +SELECT * FROM t_btree; +id val +1 btree format +DROP TABLE t_btree; +# +# ============================================ +# TEST 9: CREATE TABLE with multiple options +# ============================================ +# +CREATE TABLE t_multi ( +id INT, +val VARCHAR(100) +) ENGINE=TIDESDB +COMPRESSION='ZSTD' + WRITE_BUFFER_SIZE=8388608 +BLOOM_FILTER=1 +BLOOM_FPR=50 +BLOCK_INDEXES=1 +SYNC_MODE='FULL' + ISOLATION_LEVEL='REPEATABLE_READ' + LEVEL_SIZE_RATIO=8 +MIN_LEVELS=3 +SKIP_LIST_MAX_LEVEL=16 +SKIP_LIST_PROBABILITY=50; +SHOW CREATE TABLE t_multi; +Table Create Table +t_multi CREATE TABLE `t_multi` ( + `id` int(11) DEFAULT NULL, + `val` varchar(100) DEFAULT NULL +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `COMPRESSION`='ZSTD' `WRITE_BUFFER_SIZE`=8388608 `BLOOM_FILTER`=1 `BLOOM_FPR`=50 `BLOCK_INDEXES`=1 `SYNC_MODE`='FULL' `ISOLATION_LEVEL`='REPEATABLE_READ' `LEVEL_SIZE_RATIO`=8 `MIN_LEVELS`=3 `SKIP_LIST_MAX_LEVEL`=16 `SKIP_LIST_PROBABILITY`=50 +INSERT INTO t_multi VALUES (1, 'multi-option table'); +INSERT INTO t_multi VALUES (2, 'second row'); +SELECT * FROM t_multi; +id val +1 multi-option table +2 second row +UPDATE t_multi SET val = 'updated' WHERE id = 1; +SELECT * FROM t_multi; +id val +1 updated +2 second row +DELETE FROM t_multi WHERE id = 2; 
+SELECT * FROM t_multi; +id val +1 updated +DROP TABLE t_multi; +# +# ============================================ +# TEST 10: Default isolation is REPEATABLE_READ +# ============================================ +# +CREATE TABLE t_default_iso (id INT) ENGINE=TIDESDB; +SHOW CREATE TABLE t_default_iso; +Table Create Table +t_default_iso CREATE TABLE `t_default_iso` ( + `id` int(11) DEFAULT NULL +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci +INSERT INTO t_default_iso VALUES (1), (2), (3); +SELECT * FROM t_default_iso; +id +1 +2 +3 +DROP TABLE t_default_iso; +# +# +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_partition.result b/mysql-test/suite/tidesdb/r/tidesdb_partition.result new file mode 100644 index 0000000000000..a2e7130dc03fb --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_partition.result @@ -0,0 +1,301 @@ +# +# ============================================ +# TEST 1: HASH partitioning +# ============================================ +# +CREATE TABLE t_hash ( +id INT NOT NULL, +val VARCHAR(50), +PRIMARY KEY (id) +) ENGINE=TIDESDB +PARTITION BY HASH(id) PARTITIONS 4; +INSERT INTO t_hash VALUES (1,'a'),(2,'b'),(3,'c'),(4,'d'),(5,'e'),(6,'f'),(7,'g'),(8,'h'); +SELECT * FROM t_hash ORDER BY id; +id val +1 a +2 b +3 c +4 d +5 e +6 f +7 g +8 h +SELECT COUNT(*) AS total FROM t_hash; +total +8 +# Update across potential partition boundary +UPDATE t_hash SET val = 'updated' WHERE id = 3; +SELECT * FROM t_hash WHERE id = 3; +id val +3 updated +# Delete +DELETE FROM t_hash WHERE id IN (2, 5); +SELECT * FROM t_hash ORDER BY id; +id val +1 a +3 updated +4 d +6 f +7 g +8 h +DROP TABLE t_hash; +# +# ============================================ +# TEST 2: KEY partitioning +# ============================================ +# +CREATE TABLE t_key ( +id INT NOT NULL, +name VARCHAR(50), +PRIMARY KEY (id) +) ENGINE=TIDESDB +PARTITION BY KEY(id) PARTITIONS 3; +INSERT INTO t_key VALUES 
(1,'alice'),(2,'bob'),(3,'charlie'),(4,'dave'),(5,'eve'),(6,'frank'); +SELECT * FROM t_key ORDER BY id; +id name +1 alice +2 bob +3 charlie +4 dave +5 eve +6 frank +DELETE FROM t_key WHERE id = 4; +SELECT * FROM t_key ORDER BY id; +id name +1 alice +2 bob +3 charlie +5 eve +6 frank +DROP TABLE t_key; +# +# ============================================ +# TEST 3: RANGE partitioning +# ============================================ +# +CREATE TABLE t_range ( +id INT NOT NULL, +val VARCHAR(50), +PRIMARY KEY (id) +) ENGINE=TIDESDB +PARTITION BY RANGE(id) ( +PARTITION p0 VALUES LESS THAN (10), +PARTITION p1 VALUES LESS THAN (20), +PARTITION p2 VALUES LESS THAN (30), +PARTITION p3 VALUES LESS THAN MAXVALUE +); +INSERT INTO t_range VALUES (1,'r0'),(5,'r0'),(9,'r0'); +INSERT INTO t_range VALUES (10,'r1'),(15,'r1'),(19,'r1'); +INSERT INTO t_range VALUES (20,'r2'),(25,'r2'); +INSERT INTO t_range VALUES (30,'r3'),(50,'r3'),(100,'r3'); +SELECT * FROM t_range ORDER BY id; +id val +1 r0 +5 r0 +9 r0 +10 r1 +15 r1 +19 r1 +20 r2 +25 r2 +30 r3 +50 r3 +100 r3 +SELECT COUNT(*) AS total FROM t_range; +total +11 +# Query that should hit only partition p1 +SELECT * FROM t_range WHERE id >= 10 AND id < 20 ORDER BY id; +id val +10 r1 +15 r1 +19 r1 +# Delete from specific range +DELETE FROM t_range WHERE id >= 20 AND id < 30; +SELECT * FROM t_range ORDER BY id; +id val +1 r0 +5 r0 +9 r0 +10 r1 +15 r1 +19 r1 +30 r3 +50 r3 +100 r3 +# Update across range boundary +UPDATE t_range SET val = 'moved' WHERE id = 5; +SELECT * FROM t_range WHERE id = 5; +id val +5 moved +DROP TABLE t_range; +# +# ============================================ +# TEST 4: LIST partitioning +# ============================================ +# +CREATE TABLE t_list ( +id INT NOT NULL, +region INT NOT NULL, +name VARCHAR(50), +PRIMARY KEY (id, region) +) ENGINE=TIDESDB +PARTITION BY LIST(region) ( +PARTITION p_east VALUES IN (1, 2, 3), +PARTITION p_west VALUES IN (4, 5, 6), +PARTITION p_central VALUES IN (7, 8, 9) +); +INSERT 
INTO t_list VALUES (1,1,'NY'),(2,2,'NJ'),(3,3,'CT'); +INSERT INTO t_list VALUES (4,4,'CA'),(5,5,'OR'),(6,6,'WA'); +INSERT INTO t_list VALUES (7,7,'IL'),(8,8,'OH'),(9,9,'MI'); +SELECT * FROM t_list ORDER BY id; +id region name +1 1 NY +2 2 NJ +3 3 CT +4 4 CA +5 5 OR +6 6 WA +7 7 IL +8 8 OH +9 9 MI +# Query specific list partition +SELECT * FROM t_list WHERE region IN (4,5,6) ORDER BY id; +id region name +4 4 CA +5 5 OR +6 6 WA +DELETE FROM t_list WHERE region = 8; +SELECT * FROM t_list ORDER BY id; +id region name +1 1 NY +2 2 NJ +3 3 CT +4 4 CA +5 5 OR +6 6 WA +7 7 IL +9 9 MI +DROP TABLE t_list; +# +# ============================================ +# TEST 5: RANGE COLUMNS partitioning +# ============================================ +# +CREATE TABLE t_range_col ( +id INT NOT NULL, +created DATE NOT NULL, +val VARCHAR(50), +PRIMARY KEY (id, created) +) ENGINE=TIDESDB +PARTITION BY RANGE COLUMNS(created) ( +PARTITION p_2024 VALUES LESS THAN ('2025-01-01'), +PARTITION p_2025 VALUES LESS THAN ('2026-01-01'), +PARTITION p_future VALUES LESS THAN MAXVALUE +); +INSERT INTO t_range_col VALUES (1,'2024-06-15','old'),(2,'2024-12-31','old'); +INSERT INTO t_range_col VALUES (3,'2025-03-10','current'),(4,'2025-11-20','current'); +INSERT INTO t_range_col VALUES (5,'2026-05-01','future'); +SELECT * FROM t_range_col ORDER BY created; +id created val +1 2024-06-15 old +2 2024-12-31 old +3 2025-03-10 current +4 2025-11-20 current +5 2026-05-01 future +# Query specific partition by date range +SELECT * FROM t_range_col WHERE created >= '2025-01-01' AND created < '2026-01-01' ORDER BY id; +id created val +3 2025-03-10 current +4 2025-11-20 current +DROP TABLE t_range_col; +# +# ============================================ +# TEST 6: Partition with secondary index +# ============================================ +# +CREATE TABLE t_part_idx ( +id INT NOT NULL, +category INT, +name VARCHAR(50), +PRIMARY KEY (id), +KEY idx_cat (category) +) ENGINE=TIDESDB +PARTITION BY HASH(id) PARTITIONS 3; 
+INSERT INTO t_part_idx VALUES (1,10,'a'),(2,20,'b'),(3,10,'c'),(4,30,'d'),(5,20,'e'),(6,10,'f'); +# Scan via secondary index across partitions +SELECT * FROM t_part_idx WHERE category = 10 ORDER BY id; +id category name +1 10 a +3 10 c +6 10 f +SELECT * FROM t_part_idx WHERE category = 20 ORDER BY id; +id category name +2 20 b +5 20 e +DROP TABLE t_part_idx; +# +# ============================================ +# TEST 7: ALTER TABLE add/drop partition (RANGE) +# ============================================ +# +CREATE TABLE t_alter_part ( +id INT NOT NULL, +val VARCHAR(50), +PRIMARY KEY (id) +) ENGINE=TIDESDB +PARTITION BY RANGE(id) ( +PARTITION p0 VALUES LESS THAN (100), +PARTITION p1 VALUES LESS THAN (200) +); +INSERT INTO t_alter_part VALUES (1,'lo'),(50,'lo'),(100,'hi'),(150,'hi'); +SELECT * FROM t_alter_part ORDER BY id; +id val +1 lo +50 lo +100 hi +150 hi +# Add a new partition +ALTER TABLE t_alter_part ADD PARTITION (PARTITION p2 VALUES LESS THAN MAXVALUE); +INSERT INTO t_alter_part VALUES (200,'new'),(300,'new'); +SELECT * FROM t_alter_part ORDER BY id; +id val +1 lo +50 lo +100 hi +150 hi +200 new +300 new +# Drop a partition (removes data in that range) +ALTER TABLE t_alter_part DROP PARTITION p1; +SELECT * FROM t_alter_part ORDER BY id; +id val +1 lo +50 lo +200 new +300 new +DROP TABLE t_alter_part; +# +# ============================================ +# TEST 8: SHOW CREATE TABLE with partitions +# ============================================ +# +CREATE TABLE t_show_part ( +id INT NOT NULL, +val VARCHAR(50), +PRIMARY KEY (id) +) ENGINE=TIDESDB +PARTITION BY HASH(id) PARTITIONS 2; +SHOW CREATE TABLE t_show_part; +Table Create Table +t_show_part CREATE TABLE `t_show_part` ( + `id` int(11) NOT NULL, + `val` varchar(50) DEFAULT NULL, + PRIMARY KEY (`id`) +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci + PARTITION BY HASH (`id`) +PARTITIONS 2 +DROP TABLE t_show_part; +# +# +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_per_index_btree.result b/mysql-test/suite/tidesdb/r/tidesdb_per_index_btree.result new file mode 100644 index 0000000000000..3400c01df39f6 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_per_index_btree.result @@ -0,0 +1,42 @@ +# +# TEST 1: Per-index USE_BTREE on secondary index +# +CREATE TABLE t1 ( +id INT NOT NULL PRIMARY KEY, +a INT, +b INT, +KEY idx_a (a) USE_BTREE=1, +KEY idx_b (b) +) ENGINE=TidesDB; +INSERT INTO t1 VALUES (1,10,100),(2,20,200),(3,30,300); +# idx_a should show BTREE, idx_b should show LSM +SHOW KEYS FROM t1; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment Ignored +t1 0 PRIMARY 1 id A 2 NULL NULL LSM NO +t1 1 idx_a 1 a A 2 NULL NULL YES BTREE NO +t1 1 idx_b 1 b A 2 NULL NULL YES LSM NO +SELECT * FROM t1 WHERE a = 20; +id a b +2 20 200 +SELECT * FROM t1 WHERE b = 200; +id a b +2 20 200 +DROP TABLE t1; +# +# TEST 2: Table-level USE_BTREE=1 with per-index override +# +CREATE TABLE t2 ( +id INT NOT NULL PRIMARY KEY, +x INT, +KEY idx_x (x) USE_BTREE=0 +) ENGINE=TidesDB USE_BTREE=1; +# PK and idx_x should both show BTREE (table default), but idx_x USE_BTREE=0 +# Note: per-index USE_BTREE=0 does NOT override table-level to LSM -- it just +# means the index itself didn't request BTREE; the table default still applies. +SHOW KEYS FROM t2; +Table Non_unique Key_name Seq_in_index Column_name Collation Cardinality Sub_part Packed Null Index_type Comment Index_comment Ignored +t2 0 PRIMARY 1 id A 2 NULL NULL BTREE NO +t2 1 idx_x 1 x A 2 NULL NULL YES BTREE NO +DROP TABLE t2; +# +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_chain_bounded.result b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_chain_bounded.result new file mode 100644 index 0000000000000..7720f7fbabb83 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_chain_bounded.result @@ -0,0 +1,26 @@ +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +CREATE TABLE churn ( +id INT NOT NULL PRIMARY KEY, +val INT +) ENGINE=TidesDB; +connect conA, localhost, root,,; +connect conB, localhost, root,,; +# +# Each session churns 2500 unique PKs in batches of 50. Every +# batch commits, releasing all its row locks; the next batch +# acquires fresh locks that should land on freelisted slots. +# +connection default; +SELECT @verdict; +@verdict +CHAIN_BOUNDED +SELECT COUNT(*) FROM churn; +COUNT(*) +0 +SELECT @recycled_some; +@recycled_some +RECYCLED +disconnect conA; +disconnect conB; +DROP TABLE churn; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_deadlock_cycle.result b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_deadlock_cycle.result new file mode 100644 index 0000000000000..d38908e4278c1 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_deadlock_cycle.result @@ -0,0 +1,35 @@ +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +CREATE TABLE c ( +id INT PRIMARY KEY, +v INT NOT NULL +) ENGINE=TidesDB; +INSERT INTO c VALUES (1, 10), (2, 20); +connect a, localhost, root,,; +connect b, localhost, root,,; +connection a; +BEGIN; +UPDATE c SET v = v + 1 WHERE id = 1; +connection b; +BEGIN; +UPDATE c SET v = v + 1 WHERE id = 2; +connection default; +connection a; +UPDATE c SET v = v + 1 WHERE id = 2; +connection default; +connection b; +UPDATE c SET v = v + 1 WHERE id = 1; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +ROLLBACK; +connection a; +COMMIT; +connection default; +# Row 1 incremented by T1 only (T2 aborted); row 2 incremented by T1 only. 
+SELECT * FROM c ORDER BY id; +id v +1 11 +2 21 +disconnect a; +disconnect b; +connection default; +DROP TABLE c; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_forupdate.result b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_forupdate.result new file mode 100644 index 0000000000000..0f56391b1d50f --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_forupdate.result @@ -0,0 +1,78 @@ +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +# +# Setup: TPC-C district-like table +# +CREATE TABLE district ( +d_w_id INT NOT NULL, +d_id INT NOT NULL, +d_next_o_id INT NOT NULL, +d_tax DECIMAL(4,4), +PRIMARY KEY (d_w_id, d_id) +) ENGINE=TidesDB; +INSERT INTO district VALUES (1, 1, 3001, 0.1000); +# +# TEST 1: Two concurrent SELECT FOR UPDATE + UPDATE +# on the same row. Both should succeed with pessimistic +# locking serializing access. Counter = 3001 + 2 = 3003 +# +connect conA, localhost, root,,; +connect conB, localhost, root,,; +connection conA; +BEGIN; +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1 FOR UPDATE; +d_next_o_id +3001 +connection conB; +UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; +connection conA; +UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; +COMMIT; +connection conB; +connection default; +# Both succeeded: 3001 + 1 (conA) + 1 (conB) = 3003 +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1; +d_next_o_id +3003 +# +# TEST 2: Stored procedure with SELECT FOR UPDATE +# Mimics TPC-C NEWORD pattern inside a CALL +# +CREATE PROCEDURE neword_mini(IN p_w_id INT, IN p_d_id INT) +BEGIN +DECLARE v_next_o_id INT; +SELECT d_next_o_id INTO v_next_o_id +FROM district WHERE d_w_id = p_w_id AND d_id = p_d_id FOR UPDATE; +UPDATE district SET d_next_o_id = v_next_o_id + 1 +WHERE d_w_id = p_w_id AND d_id = p_d_id; +END| +UPDATE district SET d_next_o_id = 5001 WHERE d_w_id=1 AND d_id=1; +connection conA; +BEGIN; +CALL neword_mini(1, 1); +connection 
conB; +CALL neword_mini(1, 1); +connection conA; +COMMIT; +connection conB; +connection default; +# Both CALL succeeded: 5001 + 1 + 1 = 5003 +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1; +d_next_o_id +5003 +# +# TEST 3: Serial counter increment (10 iterations) +# +UPDATE district SET d_next_o_id = 6001 WHERE d_w_id=1 AND d_id=1; +# Should be 6001 + 10 = 6011 +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1; +d_next_o_id +6011 +# +# Cleanup +# +disconnect conA; +disconnect conB; +connection default; +DROP PROCEDURE neword_mini; +DROP TABLE district; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_insert_lock.result b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_insert_lock.result new file mode 100644 index 0000000000000..f43d8274cc63e --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_insert_lock.result @@ -0,0 +1,181 @@ +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +# +# Setup +# +CREATE TABLE t ( +i INT, +PRIMARY KEY (i) +) ENGINE=TidesDB; +INSERT INTO t VALUES (1), (2), (3), (4), (5); +connect conA, localhost, root,,; +connect conB, localhost, root,,; +# +# TEST 1: SELECT FOR UPDATE on non-existing row blocks DELETE +# Connection A locks i=15 (does not exist). +# Connection B deletes i=2 (succeeds immediately), +# then tries to delete i=15 (must block). +# +connection conA; +BEGIN; +SELECT * FROM t WHERE i = 15 FOR UPDATE; +i +connection conB; +DELETE FROM t WHERE i = 2; +DELETE FROM t WHERE i = 15; +connection conA; +COMMIT; +connection conB; +connection default; +# i=2 and i=15 both deleted (i=15 was no-op but lock was respected) +SELECT * FROM t ORDER BY i; +i +1 +3 +4 +5 +# +# TEST 2: DELETE acquires a lock that blocks another DELETE +# Connection A deletes i=3 inside a transaction. +# Connection B deletes i=4 (succeeds immediately), +# then tries to delete i=3 (must block until A commits). 
+# +DROP TABLE t; +CREATE TABLE t (i INT, PRIMARY KEY (i)) ENGINE=TidesDB; +INSERT INTO t VALUES (1), (2), (3), (4), (5); +connection conA; +BEGIN; +DELETE FROM t WHERE i = 3; +connection conB; +DELETE FROM t WHERE i = 4; +DELETE FROM t WHERE i = 3; +connection conA; +COMMIT; +connection conB; +connection default; +# i=3 and i=4 both deleted +SELECT * FROM t ORDER BY i; +i +1 +2 +5 +# +# TEST 3: UPDATE acquires a lock that blocks another UPDATE +# +DROP TABLE t; +CREATE TABLE t (i INT, v INT, PRIMARY KEY (i)) ENGINE=TidesDB; +INSERT INTO t VALUES (1, 10), (2, 20), (3, 30); +connection conA; +BEGIN; +UPDATE t SET v = 99 WHERE i = 3; +connection conB; +UPDATE t SET v = 88 WHERE i = 2; +UPDATE t SET v = 77 WHERE i = 3; +connection conA; +COMMIT; +connection conB; +connection default; +# conA set v=99, then conB overwrote with v=77 +SELECT * FROM t ORDER BY i; +i v +1 10 +2 88 +3 77 +# +# TEST 4: INSERT blocked by SELECT FOR UPDATE on non-existing key +# This is the critical fix -- previously INSERT bypassed the lock. +# Connection A does SELECT FOR UPDATE on i=15 (non-existing). +# Connection B tries INSERT i=15 (must block until A commits). +# +DROP TABLE t; +CREATE TABLE t (i INT, PRIMARY KEY (i)) ENGINE=TidesDB; +INSERT INTO t VALUES (1), (2), (3), (4), (5); +connection conA; +BEGIN; +SELECT * FROM t WHERE i = 15 FOR UPDATE; +i +connection conB; +INSERT INTO t VALUES (15); +connection conA; +COMMIT; +connection conB; +connection default; +# i=15 now exists (inserted by conB after conA released the lock) +SELECT * FROM t WHERE i >= 10 ORDER BY i; +i +15 +# +# TEST 5: INSERT blocked by DELETE on existing row +# Connection A deletes i=3 inside a transaction. +# Connection B tries to INSERT i=3 (must block). 
+# +DROP TABLE t; +CREATE TABLE t (i INT, PRIMARY KEY (i)) ENGINE=TidesDB; +INSERT INTO t VALUES (1), (2), (3), (4), (5); +connection conA; +BEGIN; +DELETE FROM t WHERE i = 3; +connection conB; +INSERT INTO t VALUES (3); +connection conA; +COMMIT; +connection conB; +connection default; +# i=3 was deleted by conA, then re-inserted by conB +SELECT * FROM t ORDER BY i; +i +1 +2 +3 +4 +5 +# +# TEST 6: Concurrent INSERTs on different keys do not block +# +DROP TABLE t; +CREATE TABLE t (i INT, PRIMARY KEY (i)) ENGINE=TidesDB; +connection conA; +BEGIN; +INSERT INTO t VALUES (100); +connection conB; +INSERT INTO t VALUES (200); +connection conA; +COMMIT; +connection default; +# Both inserts succeeded without blocking +SELECT * FROM t ORDER BY i; +i +100 +200 +# +# TEST 7: Autocommit UPDATE blocked by SELECT FOR UPDATE +# +DROP TABLE t; +CREATE TABLE t (i INT, PRIMARY KEY (i)) ENGINE=TidesDB; +INSERT INTO t VALUES (1), (2), (3), (4), (5); +connection conA; +BEGIN; +SELECT * FROM t WHERE i = 3 FOR UPDATE; +i +3 +connection conB; +UPDATE t SET i = 33 WHERE i = 3; +connection conA; +COMMIT; +connection conB; +connection default; +# conA released lock, then conB's autocommit UPDATE renamed i=3 to i=33 +SELECT * FROM t ORDER BY i; +i +1 +2 +4 +5 +33 +# +# Cleanup +# +disconnect conA; +disconnect conB; +connection default; +DROP TABLE t; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_killwait.result b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_killwait.result new file mode 100644 index 0000000000000..16cb2d2c7a3d7 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_killwait.result @@ -0,0 +1,33 @@ +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +CREATE TABLE k ( +id INT PRIMARY KEY, +v INT NOT NULL +) ENGINE=TidesDB; +INSERT INTO k VALUES (1, 100); +connect a, localhost, root,,; +connect b, localhost, root,,; +connect killer, localhost, root,,; +connection a; +BEGIN; +UPDATE k SET v = v + 1 WHERE id = 1; +connection b; +BEGIN; +UPDATE k SET v = v + 1 WHERE id = 1; +connection killer; +# KILL QUERY issued against the blocked UPDATE on connection b. +connection b; +Got one of the listed errors +ROLLBACK; +connection a; +COMMIT; +connection default; +# Row 1 incremented by T1 only. +SELECT * FROM k WHERE id = 1; +id v +1 101 +disconnect a; +disconnect b; +disconnect killer; +connection default; +DROP TABLE k; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_reentry.result b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_reentry.result new file mode 100644 index 0000000000000..9562d34e557f2 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_reentry.result @@ -0,0 +1,30 @@ +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +CREATE TABLE r ( +id INT PRIMARY KEY, +v INT NOT NULL +) ENGINE=TidesDB; +INSERT INTO r VALUES (1, 100); +connect a, localhost, root,,; +connection a; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT v FROM r WHERE id = 1 FOR UPDATE; +v +100 +SELECT v FROM r WHERE id = 1 FOR UPDATE; +v +100 +UPDATE r SET v = v + 1 WHERE id = 1; +SELECT v FROM r WHERE id = 1; +v +101 +COMMIT; +connection default; +# Row 1 incremented exactly once. +SELECT * FROM r WHERE id = 1; +id v +1 101 +disconnect a; +connection default; +DROP TABLE r; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_shared.result b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_shared.result new file mode 100644 index 0000000000000..c35c620565246 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_shared.result @@ -0,0 +1,130 @@ +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +CREATE TABLE acct ( +id INT PRIMARY KEY, +bal INT NOT NULL +) ENGINE=TidesDB; +INSERT INTO acct VALUES (1, 100); +connect s1, localhost, root,,; +connect s2, localhost, root,,; +connect s3, localhost, root,,; +# +# TEST 1: S / S compatible under REPEATABLE-READ +# Both s1 and s2 acquire S on the same row, neither blocks. +# +connection s1; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT bal FROM acct WHERE id = 1; +bal +100 +connection s2; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT bal FROM acct WHERE id = 1; +bal +100 +# Both holders of S read successfully -- no deadlock, no block. +connection default; +SELECT bal FROM acct WHERE id = 1; +bal +100 +connection s1; +COMMIT; +connection s2; +COMMIT; +# +# TEST 2: X waits for S readers, then proceeds +# s1 + s2 hold S; s3 fires UPDATE that must wait until +# both readers release. +# +connection s1; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT bal FROM acct WHERE id = 1; +bal +100 +connection s2; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT bal FROM acct WHERE id = 1; +bal +100 +connection s3; +BEGIN; +UPDATE acct SET bal = bal + 50 WHERE id = 1; +connection s1; +COMMIT; +connection s2; +COMMIT; +connection s3; +COMMIT; +connection default; +# 100 + 50 = 150 +SELECT bal FROM acct WHERE id = 1; +bal +150 +# +# TEST 3: writer fairness -- new S blocks behind a waiting X +# s1 holds S; s2 fires UPDATE (X-waiting); s3 fires a +# SELECT under REPEATABLE-READ that wants S. s3 must +# NOT jump ahead of s2's queued X. 
+# +UPDATE acct SET bal = 200 WHERE id = 1; +connection s1; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT bal FROM acct WHERE id = 1; +bal +200 +connection default; +connection s2; +BEGIN; +UPDATE acct SET bal = bal + 1 WHERE id = 1; +connection default; +connection s3; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT bal FROM acct WHERE id = 1; +connection s1; +COMMIT; +connection s2; +COMMIT; +connection s3; +COMMIT; +connection default; +# s2 incremented 200 -> 201; s3 then read either 200 or 201 (both valid) +SELECT bal FROM acct WHERE id = 1; +bal +201 +# +# TEST 4: READ-COMMITTED reads take no lock +# s1 holds an uncommitted X via UPDATE; s2 under RC reads +# the latest committed value without blocking. +# +UPDATE acct SET bal = 300 WHERE id = 1; +connection s1; +BEGIN; +UPDATE acct SET bal = bal + 100 WHERE id = 1; +connection s2; +SET SESSION transaction_isolation = 'READ-COMMITTED'; +BEGIN; +SELECT bal FROM acct WHERE id = 1; +bal +300 +COMMIT; +connection s1; +COMMIT; +connection default; +# 300 + 100 = 400 +SELECT bal FROM acct WHERE id = 1; +bal +400 +# +# Cleanup +# +disconnect s1; +disconnect s2; +disconnect s3; +connection default; +DROP TABLE acct; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_timeout.result b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_timeout.result new file mode 100644 index 0000000000000..4b2e2b6962d2b --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_timeout.result @@ -0,0 +1,32 @@ +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +CREATE TABLE w ( +id INT PRIMARY KEY, +v INT NOT NULL +) ENGINE=TidesDB; +INSERT INTO w VALUES (1, 100); +connect a, localhost, root,,; +connect b, localhost, root,,; +connection default; +connection a; +BEGIN; +UPDATE w SET v = v + 1 WHERE id = 1; +connection b; +SET SESSION tidesdb_lock_wait_timeout_ms = 300; +BEGIN; +UPDATE w SET v = v + 1 WHERE id = 1; +ERROR HY000: Lock wait timeout exceeded; try restarting transaction +ROLLBACK; +connection a; +COMMIT; +connection default; +timeout_delta +1 +# Row 1 incremented by T1 only. +SELECT * FROM w WHERE id = 1; +id v +1 101 +disconnect a; +disconnect b; +connection default; +DROP TABLE w; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_upgrade.result b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_upgrade.result new file mode 100644 index 0000000000000..e68977ceafb94 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_pessimistic_upgrade.result @@ -0,0 +1,54 @@ +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +CREATE TABLE u ( +id INT PRIMARY KEY, +v INT NOT NULL +) ENGINE=TidesDB; +INSERT INTO u VALUES (1, 100); +connect a, localhost, root,,; +connect b, localhost, root,,; +# +# Scenario 1, sole holder upgrades cleanly. +# +connection a; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT v FROM u WHERE id = 1; +v +100 +UPDATE u SET v = v + 10 WHERE id = 1; +COMMIT; +connection default; +SELECT * FROM u WHERE id = 1; +id v +1 110 +# +# Scenario 2, two S holders, one tries to upgrade, must be rejected. 
+# +connection a; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT v FROM u WHERE id = 1; +v +110 +connection b; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT v FROM u WHERE id = 1; +v +110 +connection a; +UPDATE u SET v = v + 1 WHERE id = 1; +ERROR 40001: Deadlock found when trying to get lock; try restarting transaction +ROLLBACK; +connection b; +COMMIT; +connection default; +# Row 1 unchanged from scenario 2. +SELECT * FROM u WHERE id = 1; +id v +1 110 +disconnect a; +disconnect b; +connection default; +DROP TABLE u; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_pk_index.result b/mysql-test/suite/tidesdb/r/tidesdb_pk_index.result new file mode 100644 index 0000000000000..1d631f4a4c633 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_pk_index.result @@ -0,0 +1,192 @@ +DROP TABLE IF EXISTS t_pk, t_autoinc, t_secidx, t_combined; +# +# ============================================ +# TEST 1: PRIMARY KEY - point lookups & range +# ============================================ +# +CREATE TABLE t_pk ( +id INT NOT NULL PRIMARY KEY, +val VARCHAR(50) +) ENGINE=TIDESDB; +INSERT INTO t_pk VALUES (10, 'ten'), (20, 'twenty'), (30, 'thirty'); +# Point lookup by PK +SELECT * FROM t_pk WHERE id = 20; +id val +20 twenty +# Range scan on PK +SELECT * FROM t_pk WHERE id >= 15 AND id <= 25; +id val +20 twenty +# Full scan (should still work) +SELECT * FROM t_pk ORDER BY id; +id val +10 ten +20 twenty +30 thirty +# UPDATE via PK lookup +UPDATE t_pk SET val = 'TWO-ZERO' WHERE id = 20; +SELECT * FROM t_pk WHERE id = 20; +id val +20 TWO-ZERO +# DELETE via PK lookup +DELETE FROM t_pk WHERE id = 10; +SELECT * FROM t_pk ORDER BY id; +id val +20 TWO-ZERO +30 thirty +DROP TABLE t_pk; +# +# ============================================ +# TEST 2: AUTO_INCREMENT +# ============================================ +# +CREATE TABLE t_autoinc ( +id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, +name VARCHAR(50) +) ENGINE=TIDESDB; +INSERT INTO 
t_autoinc (name) VALUES ('alice'); +INSERT INTO t_autoinc (name) VALUES ('bob'); +INSERT INTO t_autoinc (name) VALUES ('carol'); +SELECT * FROM t_autoinc ORDER BY id; +id name +1 alice +2 bob +3 carol +# Explicit id should also work +INSERT INTO t_autoinc (id, name) VALUES (100, 'dave'); +SELECT * FROM t_autoinc WHERE id = 100; +id name +100 dave +# Next auto-inc should continue past 100 +INSERT INTO t_autoinc (name) VALUES ('eve'); +SELECT * FROM t_autoinc ORDER BY id; +id name +1 alice +2 bob +3 carol +100 dave +101 eve +DROP TABLE t_autoinc; +# +# ============================================ +# TEST 3: Secondary index (KEY) +# ============================================ +# +CREATE TABLE t_secidx ( +id INT NOT NULL PRIMARY KEY, +k INT NOT NULL, +val VARCHAR(50), +KEY k_idx (k) +) ENGINE=TIDESDB; +INSERT INTO t_secidx VALUES (1, 100, 'a'), (2, 200, 'b'), (3, 100, 'c'), (4, 300, 'd'); +# Lookup via secondary index +SELECT * FROM t_secidx WHERE k = 100 ORDER BY id; +id k val +1 100 a +3 100 c +SELECT * FROM t_secidx WHERE k = 200; +id k val +2 200 b +# Range on secondary index +SELECT * FROM t_secidx WHERE k >= 200 ORDER BY k; +id k val +2 200 b +4 300 d +# UPDATE a row and verify secondary index is maintained +UPDATE t_secidx SET k = 999 WHERE id = 2; +SELECT * FROM t_secidx WHERE k = 200; +id k val +SELECT * FROM t_secidx WHERE k = 999; +id k val +2 999 b +# DELETE and verify index entry removed +DELETE FROM t_secidx WHERE id = 3; +SELECT * FROM t_secidx WHERE k = 100 ORDER BY id; +id k val +1 100 a +DROP TABLE t_secidx; +# +# ============================================ +# TEST 4: Combined PK + AUTO_INCREMENT + secondary index +# (sysbench-like schema) +# ============================================ +# +CREATE TABLE t_combined ( +id INT NOT NULL AUTO_INCREMENT, +k INT NOT NULL DEFAULT 0, +c CHAR(120) NOT NULL DEFAULT '', +pad CHAR(60) NOT NULL DEFAULT '', +PRIMARY KEY (id), +KEY k_1 (k) +) ENGINE=TIDESDB; +# Insert rows (sysbench-style) +INSERT INTO t_combined 
(k, c, pad) VALUES +(1, REPEAT('a', 120), REPEAT('x', 60)), +(2, REPEAT('b', 120), REPEAT('y', 60)), +(3, REPEAT('c', 120), REPEAT('z', 60)), +(1, REPEAT('d', 120), REPEAT('w', 60)); +SELECT id, k, LENGTH(c) AS c_len, LENGTH(pad) AS pad_len FROM t_combined ORDER BY id; +id k c_len pad_len +1 1 120 60 +2 2 120 60 +3 3 120 60 +4 1 120 60 +# Point select by PK (sysbench oltp_point_select) +SELECT id, k FROM t_combined WHERE id = 2; +id k +2 2 +# Range select by PK +SELECT id, k FROM t_combined WHERE id BETWEEN 2 AND 3 ORDER BY id; +id k +2 2 +3 3 +# Lookup via secondary index +SELECT id, k FROM t_combined WHERE k = 1 ORDER BY id; +id k +1 1 +4 1 +# Update indexed column (sysbench oltp_update_index) +UPDATE t_combined SET k = k + 1 WHERE id = 1; +SELECT id, k FROM t_combined WHERE id = 1; +id k +1 2 +# Verify old index entry gone, new one present +SELECT id, k FROM t_combined WHERE k = 1 ORDER BY id; +id k +4 1 +SELECT id, k FROM t_combined WHERE k = 2 ORDER BY id; +id k +1 2 +2 2 +# Delete +DELETE FROM t_combined WHERE id = 3; +SELECT COUNT(*) AS cnt FROM t_combined; +cnt +3 +# TRUNCATE +TRUNCATE TABLE t_combined; +SELECT COUNT(*) AS cnt FROM t_combined; +cnt +0 +DROP TABLE t_combined; +# +# ============================================ +# TEST 5: BIGINT PRIMARY KEY +# ============================================ +# +CREATE TABLE t_bigpk ( +id BIGINT NOT NULL PRIMARY KEY, +val VARCHAR(20) +) ENGINE=TIDESDB; +INSERT INTO t_bigpk VALUES (9223372036854775806, 'near_max'); +INSERT INTO t_bigpk VALUES (1, 'one'); +INSERT INTO t_bigpk VALUES (9223372036854775807, 'max'); +SELECT * FROM t_bigpk ORDER BY id; +id val +1 one +9223372036854775806 near_max +9223372036854775807 max +SELECT * FROM t_bigpk WHERE id = 9223372036854775807; +id val +9223372036854775807 max +DROP TABLE t_bigpk; diff --git a/mysql-test/suite/tidesdb/r/tidesdb_rename.result b/mysql-test/suite/tidesdb/r/tidesdb_rename.result new file mode 100644 index 0000000000000..353adf3aa1305 --- /dev/null +++ 
b/mysql-test/suite/tidesdb/r/tidesdb_rename.result @@ -0,0 +1,212 @@ +# +# === Setup: install the TIDESDB engine plugin === +# +# +# ============================================ +# TEST 1: Basic RENAME TABLE +# ============================================ +# +CREATE TABLE t_orig (id INT PRIMARY KEY, val VARCHAR(50)) ENGINE=TIDESDB; +INSERT INTO t_orig VALUES (1, 'alpha'), (2, 'beta'), (3, 'gamma'); +SELECT * FROM t_orig ORDER BY id; +id val +1 alpha +2 beta +3 gamma +RENAME TABLE t_orig TO t_renamed; +SELECT * FROM t_orig; +ERROR 42S02: Table 'test.t_orig' doesn't exist +SELECT * FROM t_renamed ORDER BY id; +id val +1 alpha +2 beta +3 gamma +INSERT INTO t_renamed VALUES (4, 'delta'); +UPDATE t_renamed SET val = 'BETA' WHERE id = 2; +DELETE FROM t_renamed WHERE id = 3; +SELECT * FROM t_renamed ORDER BY id; +id val +1 alpha +2 BETA +4 delta +DROP TABLE t_renamed; +# +# ============================================ +# TEST 2: RENAME TABLE with secondary index +# ============================================ +# +CREATE TABLE t_idx ( +id INT PRIMARY KEY, +name VARCHAR(50) NOT NULL, +KEY idx_name (name) +) ENGINE=TIDESDB; +INSERT INTO t_idx VALUES (1, 'alice'), (2, 'bob'), (3, 'charlie'), (4, 'alice'); +SELECT id, name FROM t_idx WHERE name = 'alice' ORDER BY id; +id name +1 alice +4 alice +RENAME TABLE t_idx TO t_idx_new; +SELECT id, name FROM t_idx_new WHERE name = 'alice' ORDER BY id; +id name +1 alice +4 alice +SELECT id, name FROM t_idx_new WHERE name = 'bob'; +id name +2 bob +INSERT INTO t_idx_new VALUES (5, 'bob'); +SELECT id, name FROM t_idx_new WHERE name = 'bob' ORDER BY id; +id name +2 bob +5 bob +DROP TABLE t_idx_new; +# +# ============================================ +# TEST 3: ALTER TABLE changes table options +# ============================================ +# +CREATE TABLE t_alter (id INT PRIMARY KEY, val VARCHAR(100)) ENGINE=TIDESDB; +INSERT INTO t_alter VALUES (1, 'before'), (2, 'alter'), (3, 'table'); +SELECT * FROM t_alter ORDER BY id; +id val +1 before 
+2 alter +3 table +SHOW CREATE TABLE t_alter; +Table Create Table +t_alter CREATE TABLE `t_alter` ( + `id` int(11) NOT NULL, + `val` varchar(100) DEFAULT NULL, + PRIMARY KEY (`id`) +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci +ALTER TABLE t_alter SYNC_MODE='NONE'; +SHOW CREATE TABLE t_alter; +Table Create Table +t_alter CREATE TABLE `t_alter` ( + `id` int(11) NOT NULL, + `val` varchar(100) DEFAULT NULL, + PRIMARY KEY (`id`) +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `SYNC_MODE`='NONE' +SELECT * FROM t_alter ORDER BY id; +id val +1 before +2 alter +3 table +INSERT INTO t_alter VALUES (4, 'after_alter'); +UPDATE t_alter SET val = 'ALTERED' WHERE id = 2; +DELETE FROM t_alter WHERE id = 1; +SELECT * FROM t_alter ORDER BY id; +id val +2 ALTERED +3 table +4 after_alter +DROP TABLE t_alter; +# +# ============================================ +# TEST 4: ALTER TABLE ADD COLUMN (schema change) +# ============================================ +# +CREATE TABLE t_schema (id INT PRIMARY KEY, val VARCHAR(50)) ENGINE=TIDESDB; +INSERT INTO t_schema VALUES (1, 'one'), (2, 'two'); +ALTER TABLE t_schema ADD COLUMN extra INT DEFAULT 0; +SHOW CREATE TABLE t_schema; +Table Create Table +t_schema CREATE TABLE `t_schema` ( + `id` int(11) NOT NULL, + `val` varchar(50) DEFAULT NULL, + `extra` int(11) DEFAULT 0, + PRIMARY KEY (`id`) +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci +SELECT * FROM t_schema ORDER BY id; +id val extra +1 one NULL +2 two NULL +INSERT INTO t_schema VALUES (3, 'three', 99); +SELECT * FROM t_schema ORDER BY id; +id val extra +1 one NULL +2 two NULL +3 three 99 +DROP TABLE t_schema; +# +# ============================================ +# TEST 5: ALTER TABLE with secondary index +# ============================================ +# +CREATE TABLE t_altidx ( +id INT PRIMARY KEY, +name VARCHAR(50) NOT NULL, +KEY idx_name (name) +) ENGINE=TIDESDB; +INSERT INTO t_altidx VALUES (1, 'alice'), (2, 'bob'), (3, 
'charlie'); +SELECT id FROM t_altidx WHERE name = 'bob'; +id +2 +ALTER TABLE t_altidx SYNC_MODE='NONE'; +SELECT id FROM t_altidx WHERE name = 'bob'; +id +2 +SELECT id FROM t_altidx WHERE name = 'alice'; +id +1 +SELECT * FROM t_altidx ORDER BY id; +id name +1 alice +2 bob +3 charlie +INSERT INTO t_altidx VALUES (4, 'alice'); +SELECT id FROM t_altidx WHERE name = 'alice' ORDER BY id; +id +1 +4 +DROP TABLE t_altidx; +# +# ============================================ +# TEST 6: Double rename +# ============================================ +# +CREATE TABLE t_a (id INT PRIMARY KEY, val INT) ENGINE=TIDESDB; +INSERT INTO t_a VALUES (1, 10), (2, 20); +RENAME TABLE t_a TO t_b; +SELECT * FROM t_b ORDER BY id; +id val +1 10 +2 20 +RENAME TABLE t_b TO t_c; +SELECT * FROM t_c ORDER BY id; +id val +1 10 +2 20 +SELECT * FROM t_a; +ERROR 42S02: Table 'test.t_a' doesn't exist +SELECT * FROM t_b; +ERROR 42S02: Table 'test.t_b' doesn't exist +DROP TABLE t_c; +# +# ============================================ +# TEST 7: ALTER TABLE without explicit PK (hidden PK) +# ============================================ +# +CREATE TABLE t_nopk (val VARCHAR(50)) ENGINE=TIDESDB; +INSERT INTO t_nopk VALUES ('row1'), ('row2'), ('row3'); +SELECT * FROM t_nopk; +val +row1 +row2 +row3 +ALTER TABLE t_nopk SYNC_MODE='NONE'; +SELECT * FROM t_nopk; +val +row1 +row2 +row3 +INSERT INTO t_nopk VALUES ('row4'); +SELECT * FROM t_nopk; +val +row1 +row2 +row3 +row4 +DROP TABLE t_nopk; +# +# +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_replace_iodku.result b/mysql-test/suite/tidesdb/r/tidesdb_replace_iodku.result new file mode 100644 index 0000000000000..7725c54593185 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_replace_iodku.result @@ -0,0 +1,200 @@ +# +# ============================================ +# TEST 1: REPLACE INTO - PK only table +# ============================================ +# +CREATE TABLE t_rep ( +id INT NOT NULL PRIMARY KEY, +val VARCHAR(50) +) ENGINE=TIDESDB; +INSERT INTO t_rep VALUES (1, 'one'), (2, 'two'), (3, 'three'); +SELECT * FROM t_rep ORDER BY id; +id val +1 one +2 two +3 three +# REPLACE existing row (id=2) +REPLACE INTO t_rep VALUES (2, 'TWO-replaced'); +SELECT * FROM t_rep ORDER BY id; +id val +1 one +2 TWO-replaced +3 three +# REPLACE non-existing row (id=4) +REPLACE INTO t_rep VALUES (4, 'four-new'); +SELECT * FROM t_rep ORDER BY id; +id val +1 one +2 TWO-replaced +3 three +4 four-new +# REPLACE multiple rows at once +REPLACE INTO t_rep VALUES (1, 'ONE-replaced'), (3, 'THREE-replaced'), (5, 'five-new'); +SELECT * FROM t_rep ORDER BY id; +id val +1 ONE-replaced +2 TWO-replaced +3 THREE-replaced +4 four-new +5 five-new +DROP TABLE t_rep; +# +# ============================================ +# TEST 2: REPLACE INTO - PK + secondary index +# (verifies old secondary index entries are +# properly cleaned up) +# ============================================ +# +CREATE TABLE t_rep_idx ( +id INT NOT NULL PRIMARY KEY, +k INT NOT NULL, +val VARCHAR(50), +KEY k_idx (k) +) ENGINE=TIDESDB; +INSERT INTO t_rep_idx VALUES (1, 100, 'a'), (2, 200, 'b'), (3, 100, 'c'); +# Before REPLACE: k=100 has 2 rows +SELECT * FROM t_rep_idx WHERE k = 100 ORDER BY id; +id k val +1 100 a +3 100 c +# REPLACE id=1, changing k from 100 to 999 +REPLACE INTO t_rep_idx VALUES (1, 999, 'a-replaced'); +SELECT * FROM t_rep_idx ORDER BY id; +id k val +1 999 a-replaced +2 200 b +3 100 c +# After REPLACE: k=100 should have only 1 row (id=3) +SELECT * FROM 
t_rep_idx WHERE k = 100 ORDER BY id; +id k val +3 100 c +# k=999 should have 1 row (id=1) +SELECT * FROM t_rep_idx WHERE k = 999; +id k val +1 999 a-replaced +# REPLACE id=3, keeping k=100 +REPLACE INTO t_rep_idx VALUES (3, 100, 'c-replaced'); +SELECT * FROM t_rep_idx WHERE k = 100 ORDER BY id; +id k val +3 100 c-replaced +DROP TABLE t_rep_idx; +# +# ============================================ +# TEST 3: INSERT ON DUPLICATE KEY UPDATE - PK +# ============================================ +# +CREATE TABLE t_iodku ( +id INT NOT NULL PRIMARY KEY, +val INT NOT NULL DEFAULT 0 +) ENGINE=TIDESDB; +INSERT INTO t_iodku VALUES (1, 100), (2, 200), (3, 300); +SELECT * FROM t_iodku ORDER BY id; +id val +1 100 +2 200 +3 300 +# IODKU: duplicate on id=2 => update val +INSERT INTO t_iodku VALUES (2, 0) ON DUPLICATE KEY UPDATE val = val + 1; +SELECT * FROM t_iodku ORDER BY id; +id val +1 100 +2 201 +3 300 +# IODKU: no duplicate on id=4 => insert +INSERT INTO t_iodku VALUES (4, 400) ON DUPLICATE KEY UPDATE val = val + 1; +SELECT * FROM t_iodku ORDER BY id; +id val +1 100 +2 201 +3 300 +4 400 +# IODKU: multiple rows (some dups, some new) +INSERT INTO t_iodku VALUES (1, 0), (5, 500), (3, 0) +ON DUPLICATE KEY UPDATE val = val + 10; +SELECT * FROM t_iodku ORDER BY id; +id val +1 110 +2 201 +3 310 +4 400 +5 500 +DROP TABLE t_iodku; +# +# ============================================ +# TEST 4: IODKU with secondary index +# ============================================ +# +CREATE TABLE t_iodku_idx ( +id INT NOT NULL PRIMARY KEY, +k INT NOT NULL, +val VARCHAR(50), +KEY k_idx (k) +) ENGINE=TIDESDB; +INSERT INTO t_iodku_idx VALUES (1, 10, 'orig-1'), (2, 20, 'orig-2'); +# IODKU duplicate on PK, changes indexed column k +INSERT INTO t_iodku_idx VALUES (1, 99, 'new-1') +ON DUPLICATE KEY UPDATE k = VALUES(k), val = VALUES(val); +SELECT * FROM t_iodku_idx ORDER BY id; +id k val +1 99 new-1 +2 20 orig-2 +# Old k=10 should be gone, k=99 should have id=1 +SELECT * FROM t_iodku_idx WHERE k = 10; +id k 
val +SELECT * FROM t_iodku_idx WHERE k = 99; +id k val +1 99 new-1 +DROP TABLE t_iodku_idx; +# +# ============================================ +# TEST 5: IODKU with unique secondary index +# ============================================ +# +CREATE TABLE t_iodku_uniq ( +id INT NOT NULL PRIMARY KEY, +email VARCHAR(100) NOT NULL, +cnt INT NOT NULL DEFAULT 0, +UNIQUE KEY uk_email (email) +) ENGINE=TIDESDB; +INSERT INTO t_iodku_uniq VALUES (1, 'alice@test.com', 1); +INSERT INTO t_iodku_uniq VALUES (2, 'bob@test.com', 1); +# IODKU conflict on unique secondary index (email) +INSERT INTO t_iodku_uniq VALUES (3, 'alice@test.com', 1) +ON DUPLICATE KEY UPDATE cnt = cnt + 1; +SELECT * FROM t_iodku_uniq ORDER BY id; +id email cnt +1 alice@test.com 2 +2 bob@test.com 1 +DROP TABLE t_iodku_uniq; +# +# ============================================ +# TEST 6: REPLACE with AUTO_INCREMENT +# ============================================ +# +CREATE TABLE t_rep_auto ( +id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, +val VARCHAR(50) +) ENGINE=TIDESDB; +INSERT INTO t_rep_auto (val) VALUES ('first'), ('second'), ('third'); +SELECT * FROM t_rep_auto ORDER BY id; +id val +1 first +2 second +3 third +REPLACE INTO t_rep_auto VALUES (2, 'second-replaced'); +SELECT * FROM t_rep_auto ORDER BY id; +id val +1 first +2 second-replaced +3 third +# Next auto_inc should be > 3 +INSERT INTO t_rep_auto (val) VALUES ('fourth'); +SELECT * FROM t_rep_auto ORDER BY id; +id val +1 first +2 second-replaced +3 third +4 fourth +DROP TABLE t_rep_auto; +# +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_savepoint.result b/mysql-test/suite/tidesdb/r/tidesdb_savepoint.result new file mode 100644 index 0000000000000..a7f2ea4f7bc28 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_savepoint.result @@ -0,0 +1,25 @@ +# +# ============================================ +# TEST: SQL SAVEPOINT support +# ============================================ +# +CREATE TABLE t_sp ( +id INT PRIMARY KEY, +v INT +) ENGINE=TIDESDB; +# SAVEPOINT should work inside an explicit transaction +START TRANSACTION; +INSERT INTO t_sp VALUES (1, 10); +SAVEPOINT a; +INSERT INTO t_sp VALUES (2, 20); +ROLLBACK TO SAVEPOINT a; +INSERT INTO t_sp VALUES (3, 30); +RELEASE SAVEPOINT a; +COMMIT; +SELECT * FROM t_sp ORDER BY id; +id v +1 10 +3 30 +DROP TABLE t_sp; +# +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_single_delete.result b/mysql-test/suite/tidesdb/r/tidesdb_single_delete.result new file mode 100644 index 0000000000000..42bc42614dfd1 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_single_delete.result @@ -0,0 +1,226 @@ +# +# === sysvar: default is OFF === +# +SHOW VARIABLES LIKE 'tidesdb_single_delete_primary'; +Variable_name Value +tidesdb_single_delete_primary OFF +SELECT @@SESSION.tidesdb_single_delete_primary; +@@SESSION.tidesdb_single_delete_primary +0 +# +# === Secondary-index single-delete is always on (no flag needed). === +# Reads must remain correct across INSERT, SELECT, UPDATE, DELETE on a +# table with multiple secondary indexes. This exercises update_row's +# old-entry delete path and delete_row's secondary-index dispatch loop. 
+# +CREATE TABLE t_sec ( +pk BIGINT PRIMARY KEY, +c0 INT, +c1 INT, +c2 INT, +KEY k0 (c0), +KEY k1 (c1), +KEY k2 (c2) +) ENGINE=TIDESDB; +INSERT INTO t_sec VALUES (1,10,100,1000),(2,20,200,2000),(3,30,300,3000); +SELECT * FROM t_sec ORDER BY pk; +pk c0 c1 c2 +1 10 100 1000 +2 20 200 2000 +3 30 300 3000 +SELECT pk FROM t_sec WHERE c0 = 20; +pk +2 +SELECT pk FROM t_sec WHERE c1 = 300; +pk +3 +SELECT pk FROM t_sec WHERE c2 = 1000; +pk +1 +UPDATE t_sec SET c0 = 11, c1 = 111 WHERE pk = 1; +SELECT * FROM t_sec ORDER BY pk; +pk c0 c1 c2 +1 11 111 1000 +2 20 200 2000 +3 30 300 3000 +SELECT pk FROM t_sec WHERE c0 = 10; +pk +SELECT pk FROM t_sec WHERE c0 = 11; +pk +1 +SELECT pk FROM t_sec WHERE c1 = 100; +pk +SELECT pk FROM t_sec WHERE c1 = 111; +pk +1 +DELETE FROM t_sec WHERE pk = 2; +SELECT * FROM t_sec ORDER BY pk; +pk c0 c1 c2 +1 11 111 1000 +3 30 300 3000 +SELECT pk FROM t_sec WHERE c0 = 20; +pk +SELECT pk FROM t_sec WHERE c1 = 200; +pk +DELETE FROM t_sec; +SELECT COUNT(*) FROM t_sec; +COUNT(*) +0 +# +# REPLACE INTO on a table with secondary indexes: the server routes +# through delete_row + write_row, so each specific (col_vals, pk) is +# still put-once-delete-once. Secondary-index single-delete stays +# safe. +# +INSERT INTO t_sec VALUES (5,50,500,5000); +REPLACE INTO t_sec VALUES (5,55,555,5555); +SELECT * FROM t_sec WHERE pk = 5; +pk c0 c1 c2 +5 55 555 5555 +SELECT pk FROM t_sec WHERE c0 = 50; +pk +SELECT pk FROM t_sec WHERE c0 = 55; +pk +5 +DROP TABLE t_sec; +# +# === Primary-CF single-delete under the sysvar: insert-then-delete. === +# The contract holds because we only INSERT and DELETE -- no UPDATE, +# no REPLACE. Reads must agree with the non-sysvar baseline. 
+# +SET SESSION tidesdb_single_delete_primary = 1; +SELECT @@SESSION.tidesdb_single_delete_primary; +@@SESSION.tidesdb_single_delete_primary +1 +CREATE TABLE t_pri ( +pk BIGINT PRIMARY KEY, +v VARCHAR(32) +) ENGINE=TIDESDB; +INSERT INTO t_pri VALUES (1,'a'),(2,'b'),(3,'c'),(4,'d'),(5,'e'); +SELECT * FROM t_pri ORDER BY pk; +pk v +1 a +2 b +3 c +4 d +5 e +DELETE FROM t_pri WHERE pk IN (2,4); +SELECT * FROM t_pri ORDER BY pk; +pk v +1 a +3 c +5 e +DELETE FROM t_pri; +SELECT COUNT(*) FROM t_pri; +COUNT(*) +0 +# +# Insert a fresh batch, delete every row, read nothing back. This +# matches the iibench-shaped workload. +# +INSERT INTO t_pri VALUES (10,'x'),(20,'y'),(30,'z'),(40,'w'),(50,'v'); +SELECT COUNT(*) FROM t_pri; +COUNT(*) +5 +DELETE FROM t_pri; +SELECT COUNT(*) FROM t_pri; +COUNT(*) +0 +DROP TABLE t_pri; +# +# === Primary-CF single-delete with secondary indexes present. === +# Secondary-index SD is already unconditional; primary-CF SD is gated +# on the sysvar. Together they cover all four CFs per delete on +# Mark's num_secondary_indexes=3 table shape. +# +CREATE TABLE t_mark ( +transactionid BIGINT PRIMARY KEY, +c0 INT, +c1 INT, +c2 INT, +KEY (c0), +KEY (c1), +KEY (c2) +) ENGINE=TIDESDB; +INSERT INTO t_mark VALUES (1,10,100,1000),(2,20,200,2000),(3,30,300,3000), +(4,40,400,4000),(5,50,500,5000); +SELECT COUNT(*) FROM t_mark; +COUNT(*) +5 +SELECT transactionid FROM t_mark WHERE c1 = 300; +transactionid +3 +DELETE FROM t_mark WHERE transactionid >= 2 ORDER BY transactionid ASC LIMIT 2; +SELECT transactionid FROM t_mark ORDER BY transactionid; +transactionid +1 +4 +5 +SELECT transactionid FROM t_mark WHERE c0 = 20; +transactionid +SELECT transactionid FROM t_mark WHERE c2 = 3000; +transactionid +DELETE FROM t_mark; +SELECT COUNT(*) FROM t_mark; +COUNT(*) +0 +DROP TABLE t_mark; +SET SESSION tidesdb_single_delete_primary = 0; +# +# === Sysvar OFF across UPDATE + REPLACE paths (safety baseline). 
=== +# Any workload that uses UPDATE non-PK / REPLACE INTO on no-secondary +# tables must stay correct with the sysvar OFF, because primary-CF SD +# is unsafe under those patterns. Secondary-index SD is independent +# of the sysvar. +# +CREATE TABLE t_upd ( +pk BIGINT PRIMARY KEY, +c0 INT, +KEY (c0) +) ENGINE=TIDESDB; +INSERT INTO t_upd VALUES (1,100),(2,200),(3,300); +UPDATE t_upd SET c0 = 999 WHERE pk = 2; +SELECT * FROM t_upd ORDER BY pk; +pk c0 +1 100 +2 999 +3 300 +SELECT pk FROM t_upd WHERE c0 = 200; +pk +SELECT pk FROM t_upd WHERE c0 = 999; +pk +2 +DELETE FROM t_upd WHERE pk = 2; +SELECT * FROM t_upd ORDER BY pk; +pk c0 +1 100 +3 300 +SELECT pk FROM t_upd WHERE c0 = 999; +pk +DROP TABLE t_upd; +# +# REPLACE INTO on a no-secondary table follows the line-5143 "overwrite +# silently" fast path. With sysvar OFF (default), subsequent DELETEs +# remain correct because the regular tombstone is used. +# +CREATE TABLE t_rep ( +pk BIGINT PRIMARY KEY, +v VARCHAR(32) +) ENGINE=TIDESDB; +INSERT INTO t_rep VALUES (1,'first'); +REPLACE INTO t_rep VALUES (1,'second'); +SELECT * FROM t_rep; +pk v +1 second +DELETE FROM t_rep WHERE pk = 1; +SELECT COUNT(*) FROM t_rep; +COUNT(*) +0 +SELECT * FROM t_rep; +pk v +INSERT INTO t_rep VALUES (1,'third'); +SELECT * FROM t_rep; +pk v +1 third +DROP TABLE t_rep; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_spatial.result b/mysql-test/suite/tidesdb/r/tidesdb_spatial.result new file mode 100644 index 0000000000000..a08a7ed8e5cda --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_spatial.result @@ -0,0 +1,88 @@ +# +# Setup +# +CREATE TABLE places ( +id INT NOT NULL PRIMARY KEY, +name VARCHAR(100), +loc GEOMETRY NOT NULL, +SPATIAL INDEX (loc) +) ENGINE=TidesDB; +INSERT INTO places VALUES (1, 'NYC', ST_GeomFromText('POINT(40.7128 -74.0060)')); +INSERT INTO places VALUES (2, 'LA', ST_GeomFromText('POINT(34.0522 -118.2437)')); +INSERT INTO places VALUES (3, 'Chicago', ST_GeomFromText('POINT(41.8781 -87.6298)')); +INSERT INTO places VALUES (4, 'Houston', ST_GeomFromText('POINT(29.7604 -95.3698)')); +INSERT INTO places VALUES (5, 'Phoenix', ST_GeomFromText('POINT(33.4484 -112.074)')); +# +# TEST 1: MBRIntersects - find cities near northeast US +# +SELECT name FROM places +WHERE MBRIntersects(loc, +ST_GeomFromText('POLYGON((39 -76, 43 -76, 43 -72, 39 -72, 39 -76))')) +ORDER BY name; +name +NYC +# +# TEST 2: MBRContains - all cities within big US box +# +SELECT name FROM places +WHERE MBRContains( +ST_GeomFromText('POLYGON((25 -125, 45 -125, 45 -70, 25 -70, 25 -125))'), +loc) +ORDER BY name; +name +Chicago +Houston +LA +NYC +Phoenix +# +# TEST 3: MBRWithin - same as above using MBRWithin +# +SELECT name FROM places +WHERE MBRWithin(loc, +ST_GeomFromText('POLYGON((25 -125, 45 -125, 45 -70, 25 -70, 25 -125))')) +ORDER BY name; +name +Chicago +Houston +LA +NYC +Phoenix +# +# TEST 4: UPDATE geometry and verify search +# +UPDATE places SET loc = ST_GeomFromText('POINT(40.0 -74.5)') WHERE id = 1; +SELECT name FROM places +WHERE MBRIntersects(loc, +ST_GeomFromText('POLYGON((39 -76, 43 -76, 43 -72, 39 -72, 39 -76))')) +ORDER BY name; +name +NYC +# +# TEST 5: DELETE and verify search +# +DELETE FROM places WHERE id = 1; +SELECT name FROM places +WHERE MBRIntersects(loc, +ST_GeomFromText('POLYGON((39 -76, 43 -76, 43 -72, 39 -72, 39 
-76))')) +ORDER BY name; +name +# +# TEST 6: Simple point-in-box +# +DROP TABLE places; +CREATE TABLE pts (id INT PRIMARY KEY, g GEOMETRY NOT NULL, SPATIAL INDEX(g)) ENGINE=TidesDB; +INSERT INTO pts VALUES (1, ST_GeomFromText('POINT(10 20)')); +INSERT INTO pts VALUES (2, ST_GeomFromText('POINT(30 40)')); +INSERT INTO pts VALUES (3, ST_GeomFromText('POINT(50 60)')); +SELECT id FROM pts +WHERE MBRWithin(g, ST_GeomFromText('POLYGON((5 15, 35 15, 35 45, 5 45, 5 15))')) +ORDER BY id; +id +1 +2 +# +# Cleanup +# +DROP TABLE pts; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_sql.result b/mysql-test/suite/tidesdb/r/tidesdb_sql.result new file mode 100644 index 0000000000000..906b0ef57cffa --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_sql.result @@ -0,0 +1,813 @@ +# +# ============================================ +# SETUP: Create and populate test tables +# ============================================ +# +CREATE TABLE departments ( +dept_id INT PRIMARY KEY, +dept_name VARCHAR(50) NOT NULL +) ENGINE=TIDESDB; +CREATE TABLE employees ( +emp_id INT PRIMARY KEY, +name VARCHAR(100) NOT NULL, +dept_id INT NOT NULL, +salary DECIMAL(10,2) NOT NULL, +hire_date DATE NOT NULL, +KEY idx_dept (dept_id), +KEY idx_salary (salary) +) ENGINE=TIDESDB; +CREATE TABLE projects ( +proj_id INT PRIMARY KEY, +proj_name VARCHAR(100) NOT NULL, +dept_id INT NOT NULL, +budget DECIMAL(12,2) NOT NULL, +KEY idx_proj_dept (dept_id) +) ENGINE=TIDESDB; +CREATE TABLE emp_projects ( +emp_id INT NOT NULL, +proj_id INT NOT NULL, +hours INT NOT NULL, +PRIMARY KEY (emp_id, proj_id) +) ENGINE=TIDESDB; +INSERT INTO departments VALUES +(1, 'Engineering'), +(2, 'Marketing'), +(3, 'Finance'), +(4, 'HR'); +INSERT INTO employees VALUES +(1, 'Alice', 1, 95000.00, '2020-01-15'), +(2, 'Bob', 1, 88000.00, '2019-06-01'), +(3, 'Carol', 2, 72000.00, '2021-03-10'), +(4, 'Dave', 2, 68000.00, '2022-07-20'), +(5, 'Eve', 3, 105000.00, '2018-11-05'), +(6, 'Frank', 3, 92000.00, '2020-09-12'), +(7, 'Grace', 1, 
78000.00, '2023-01-08'), +(8, 'Hank', 4, 65000.00, '2021-05-25'), +(9, 'Ivy', 2, 71000.00, '2020-12-01'), +(10, 'Jack', 3, 85000.00, '2022-02-14'); +INSERT INTO projects VALUES +(100, 'Project Alpha', 1, 500000.00), +(101, 'Project Beta', 1, 300000.00), +(102, 'Campaign X', 2, 150000.00), +(103, 'Audit 2024', 3, 200000.00), +(104, 'Onboarding', 4, 50000.00); +INSERT INTO emp_projects VALUES +(1, 100, 40), (1, 101, 20), +(2, 100, 35), (2, 101, 25), +(3, 102, 45), +(4, 102, 30), +(5, 103, 50), +(6, 103, 25), +(7, 100, 15), (7, 101, 30), +(8, 104, 40), +(9, 102, 20), +(10, 103, 35); +# +# ============================================ +# TEST 1: Basic aggregate functions +# ============================================ +# +SELECT COUNT(*) AS total_employees FROM employees; +total_employees +10 +SELECT SUM(salary) AS total_salary FROM employees; +total_salary +819000.00 +SELECT AVG(salary) AS avg_salary FROM employees; +avg_salary +81900.000000 +SELECT MIN(salary) AS min_salary, MAX(salary) AS max_salary FROM employees; +min_salary max_salary +65000.00 105000.00 +SELECT MIN(hire_date) AS earliest_hire, MAX(hire_date) AS latest_hire FROM employees; +earliest_hire latest_hire +2018-11-05 2023-01-08 +# +# ============================================ +# TEST 2: GROUP BY +# ============================================ +# +SELECT dept_id, COUNT(*) AS cnt, SUM(salary) AS total_sal +FROM employees +GROUP BY dept_id +ORDER BY dept_id; +dept_id cnt total_sal +1 3 261000.00 +2 3 211000.00 +3 3 282000.00 +4 1 65000.00 +SELECT dept_id, AVG(salary) AS avg_sal, MIN(salary) AS min_sal, MAX(salary) AS max_sal +FROM employees +GROUP BY dept_id +ORDER BY dept_id; +dept_id avg_sal min_sal max_sal +1 87000.000000 78000.00 95000.00 +2 70333.333333 68000.00 72000.00 +3 94000.000000 85000.00 105000.00 +4 65000.000000 65000.00 65000.00 +# +# ============================================ +# TEST 3: GROUP BY with HAVING +# ============================================ +# +SELECT dept_id, COUNT(*) AS 
cnt +FROM employees +GROUP BY dept_id +HAVING cnt >= 3 +ORDER BY dept_id; +dept_id cnt +1 3 +2 3 +3 3 +SELECT dept_id, AVG(salary) AS avg_sal +FROM employees +GROUP BY dept_id +HAVING avg_sal > 80000 +ORDER BY dept_id; +dept_id avg_sal +1 87000.000000 +3 94000.000000 +# +# ============================================ +# TEST 4: INNER JOIN +# ============================================ +# +SELECT e.name, d.dept_name, e.salary +FROM employees e +INNER JOIN departments d ON e.dept_id = d.dept_id +ORDER BY e.emp_id; +name dept_name salary +Alice Engineering 95000.00 +Bob Engineering 88000.00 +Carol Marketing 72000.00 +Dave Marketing 68000.00 +Eve Finance 105000.00 +Frank Finance 92000.00 +Grace Engineering 78000.00 +Hank HR 65000.00 +Ivy Marketing 71000.00 +Jack Finance 85000.00 +# +# ============================================ +# TEST 5: LEFT JOIN +# ============================================ +# +SELECT d.dept_name, e.name +FROM departments d +LEFT JOIN employees e ON d.dept_id = e.dept_id AND e.salary > 90000 +ORDER BY d.dept_id, e.emp_id; +dept_name name +Engineering Alice +Marketing NULL +Finance Eve +Finance Frank +HR NULL +# +# ============================================ +# TEST 6: RIGHT JOIN +# ============================================ +# +SELECT e.name, d.dept_name +FROM departments d +RIGHT JOIN employees e ON d.dept_id = e.dept_id +ORDER BY e.emp_id; +name dept_name +Alice Engineering +Bob Engineering +Carol Marketing +Dave Marketing +Eve Finance +Frank Finance +Grace Engineering +Hank HR +Ivy Marketing +Jack Finance +# +# ============================================ +# TEST 7: CROSS JOIN +# ============================================ +# +SELECT d.dept_name, p.proj_name +FROM departments d +CROSS JOIN projects p +WHERE d.dept_id = p.dept_id +ORDER BY d.dept_id, p.proj_id; +dept_name proj_name +Engineering Project Alpha +Engineering Project Beta +Marketing Campaign X +Finance Audit 2024 +HR Onboarding +# +# ============================================ 
+# TEST 8: Multi-table JOIN (3 tables) +# ============================================ +# +SELECT e.name, d.dept_name, p.proj_name, ep.hours +FROM employees e +JOIN departments d ON e.dept_id = d.dept_id +JOIN emp_projects ep ON e.emp_id = ep.emp_id +JOIN projects p ON ep.proj_id = p.proj_id +ORDER BY e.emp_id, p.proj_id; +name dept_name proj_name hours +Alice Engineering Project Alpha 40 +Alice Engineering Project Beta 20 +Bob Engineering Project Alpha 35 +Bob Engineering Project Beta 25 +Carol Marketing Campaign X 45 +Dave Marketing Campaign X 30 +Eve Finance Audit 2024 50 +Frank Finance Audit 2024 25 +Grace Engineering Project Alpha 15 +Grace Engineering Project Beta 30 +Hank HR Onboarding 40 +Ivy Marketing Campaign X 20 +Jack Finance Audit 2024 35 +# +# ============================================ +# TEST 9: JOIN with aggregation +# ============================================ +# +SELECT d.dept_name, COUNT(e.emp_id) AS headcount, SUM(e.salary) AS total_sal +FROM departments d +LEFT JOIN employees e ON d.dept_id = e.dept_id +GROUP BY d.dept_id, d.dept_name +ORDER BY d.dept_id; +dept_name headcount total_sal +Engineering 3 261000.00 +Marketing 3 211000.00 +Finance 3 282000.00 +HR 1 65000.00 +# +# ============================================ +# TEST 10: Scalar subquery +# ============================================ +# +SELECT name, salary, +salary - (SELECT AVG(salary) FROM employees) AS diff_from_avg +FROM employees +ORDER BY emp_id; +name salary diff_from_avg +Alice 95000.00 13100.000000 +Bob 88000.00 6100.000000 +Carol 72000.00 -9900.000000 +Dave 68000.00 -13900.000000 +Eve 105000.00 23100.000000 +Frank 92000.00 10100.000000 +Grace 78000.00 -3900.000000 +Hank 65000.00 -16900.000000 +Ivy 71000.00 -10900.000000 +Jack 85000.00 3100.000000 +# +# ============================================ +# TEST 11: IN subquery +# ============================================ +# +SELECT name, salary +FROM employees +WHERE dept_id IN (SELECT dept_id FROM departments WHERE 
dept_name IN ('Engineering', 'Finance')) +ORDER BY emp_id; +name salary +Alice 95000.00 +Bob 88000.00 +Eve 105000.00 +Frank 92000.00 +Grace 78000.00 +Jack 85000.00 +# +# ============================================ +# TEST 12: EXISTS subquery +# ============================================ +# +SELECT d.dept_name +FROM departments d +WHERE EXISTS (SELECT 1 FROM employees e WHERE e.dept_id = d.dept_id AND e.salary > 90000) +ORDER BY d.dept_id; +dept_name +Engineering +Finance +# +# ============================================ +# TEST 13: NOT EXISTS subquery +# ============================================ +# +SELECT d.dept_name +FROM departments d +WHERE NOT EXISTS (SELECT 1 FROM projects p WHERE p.dept_id = d.dept_id AND p.budget > 400000) +ORDER BY d.dept_id; +dept_name +Marketing +Finance +HR +# +# ============================================ +# TEST 14: Correlated subquery +# ============================================ +# +SELECT e.name, e.salary, e.dept_id +FROM employees e +WHERE e.salary = (SELECT MAX(e2.salary) FROM employees e2 WHERE e2.dept_id = e.dept_id) +ORDER BY e.dept_id; +name salary dept_id +Alice 95000.00 1 +Carol 72000.00 2 +Eve 105000.00 3 +Hank 65000.00 4 +# +# ============================================ +# TEST 15: Derived table (subquery in FROM) +# ============================================ +# +SELECT dept_id, avg_sal +FROM ( +SELECT dept_id, AVG(salary) AS avg_sal +FROM employees +GROUP BY dept_id +) AS dept_avg +WHERE avg_sal > 80000 +ORDER BY dept_id; +dept_id avg_sal +1 87000.000000 +3 94000.000000 +# +# ============================================ +# TEST 16: UNION / UNION ALL +# ============================================ +# +SELECT name, 'high' AS tier FROM employees WHERE salary >= 90000 +UNION ALL +SELECT name, 'low' AS tier FROM employees WHERE salary < 70000 +ORDER BY name; +name tier +Alice high +Dave low +Eve high +Frank high +Hank low +SELECT dept_id FROM employees +UNION +SELECT dept_id FROM projects +ORDER BY dept_id; 
+dept_id +1 +2 +3 +4 +# +# ============================================ +# TEST 17: DISTINCT +# ============================================ +# +SELECT DISTINCT dept_id FROM employees ORDER BY dept_id; +dept_id +1 +2 +3 +4 +SELECT COUNT(DISTINCT dept_id) AS unique_depts FROM employees; +unique_depts +4 +# +# ============================================ +# TEST 18: ORDER BY with LIMIT / OFFSET +# ============================================ +# +SELECT name, salary FROM employees ORDER BY salary DESC LIMIT 3; +name salary +Eve 105000.00 +Alice 95000.00 +Frank 92000.00 +SELECT name, salary FROM employees ORDER BY salary DESC LIMIT 3 OFFSET 3; +name salary +Bob 88000.00 +Jack 85000.00 +Grace 78000.00 +# +# ============================================ +# TEST 19: CASE expression +# ============================================ +# +SELECT name, salary, +CASE +WHEN salary >= 100000 THEN 'Senior' + WHEN salary >= 80000 THEN 'Mid' + ELSE 'Junior' + END AS level +FROM employees +ORDER BY emp_id; +name salary level +Alice 95000.00 Mid +Bob 88000.00 Mid +Carol 72000.00 Junior +Dave 68000.00 Junior +Eve 105000.00 Senior +Frank 92000.00 Mid +Grace 78000.00 Junior +Hank 65000.00 Junior +Ivy 71000.00 Junior +Jack 85000.00 Mid +# +# ============================================ +# TEST 20: INSERT ... 
SELECT +# ============================================ +# +CREATE TABLE high_earners ( +emp_id INT PRIMARY KEY, +name VARCHAR(100), +salary DECIMAL(10,2) +) ENGINE=TIDESDB; +INSERT INTO high_earners +SELECT emp_id, name, salary FROM employees WHERE salary >= 90000; +SELECT * FROM high_earners ORDER BY emp_id; +emp_id name salary +1 Alice 95000.00 +5 Eve 105000.00 +6 Frank 92000.00 +DROP TABLE high_earners; +# +# ============================================ +# TEST 21: UPDATE with subquery +# ============================================ +# +CREATE TABLE emp_copy AS SELECT * FROM employees; +ALTER TABLE emp_copy ENGINE=TIDESDB; +UPDATE emp_copy SET salary = salary * 1.10 +WHERE dept_id = (SELECT dept_id FROM departments WHERE dept_name = 'Marketing'); +SELECT emp_id, name, salary FROM emp_copy WHERE dept_id = 2 ORDER BY emp_id; +emp_id name salary +3 Carol 79200.00 +4 Dave 74800.00 +9 Ivy 78100.00 +DROP TABLE emp_copy; +# +# ============================================ +# TEST 22: DELETE with subquery +# ============================================ +# +CREATE TABLE emp_copy2 AS SELECT * FROM employees; +ALTER TABLE emp_copy2 ENGINE=TIDESDB; +DELETE FROM emp_copy2 +WHERE dept_id NOT IN (SELECT dept_id FROM departments WHERE dept_name IN ('Engineering', 'Finance')); +SELECT emp_id, name FROM emp_copy2 ORDER BY emp_id; +emp_id name +1 Alice +2 Bob +5 Eve +6 Frank +7 Grace +10 Jack +DROP TABLE emp_copy2; +# +# ============================================ +# TEST 23: REPLACE INTO +# ============================================ +# +CREATE TABLE kv_store ( +k VARCHAR(50) PRIMARY KEY, +v VARCHAR(200) +) ENGINE=TIDESDB; +INSERT INTO kv_store VALUES ('key1', 'original'); +REPLACE INTO kv_store VALUES ('key1', 'replaced'); +REPLACE INTO kv_store VALUES ('key2', 'new'); +SELECT * FROM kv_store ORDER BY k; +k v +key1 replaced +key2 new +DROP TABLE kv_store; +# +# ============================================ +# TEST 24: Multi-column ORDER BY +# 
============================================ +# +SELECT dept_id, name, salary +FROM employees +ORDER BY dept_id ASC, salary DESC; +dept_id name salary +1 Alice 95000.00 +1 Bob 88000.00 +1 Grace 78000.00 +2 Carol 72000.00 +2 Ivy 71000.00 +2 Dave 68000.00 +3 Eve 105000.00 +3 Frank 92000.00 +3 Jack 85000.00 +4 Hank 65000.00 +# +# ============================================ +# TEST 25: GROUP_CONCAT +# ============================================ +# +SELECT dept_id, GROUP_CONCAT(name ORDER BY name SEPARATOR ', ') AS members +FROM employees +GROUP BY dept_id +ORDER BY dept_id; +dept_id members +1 Alice, Bob, Grace +2 Carol, Dave, Ivy +3 Eve, Frank, Jack +4 Hank +# +# ============================================ +# TEST 26: BETWEEN / IN / LIKE +# ============================================ +# +SELECT name, salary FROM employees WHERE salary BETWEEN 70000 AND 90000 ORDER BY emp_id; +name salary +Bob 88000.00 +Carol 72000.00 +Grace 78000.00 +Ivy 71000.00 +Jack 85000.00 +SELECT name FROM employees WHERE name LIKE '%a%' ORDER BY emp_id; +name +Alice +Carol +Dave +Frank +Grace +Hank +Jack +SELECT name FROM employees WHERE emp_id IN (1, 3, 5, 7, 9) ORDER BY emp_id; +name +Alice +Carol +Eve +Grace +Ivy +# +# ============================================ +# TEST 27: NULL handling +# ============================================ +# +CREATE TABLE nullable_test ( +id INT PRIMARY KEY, +val VARCHAR(50), +num INT +) ENGINE=TIDESDB; +INSERT INTO nullable_test VALUES (1, 'hello', 10), (2, NULL, 20), (3, 'world', NULL), (4, NULL, NULL); +SELECT * FROM nullable_test ORDER BY id; +id val num +1 hello 10 +2 NULL 20 +3 world NULL +4 NULL NULL +SELECT * FROM nullable_test WHERE val IS NULL ORDER BY id; +id val num +2 NULL 20 +4 NULL NULL +SELECT * FROM nullable_test WHERE num IS NOT NULL ORDER BY id; +id val num +1 hello 10 +2 NULL 20 +SELECT COUNT(*) AS total, COUNT(val) AS non_null_val, COUNT(num) AS non_null_num FROM nullable_test; +total non_null_val non_null_num +4 2 2 +SELECT 
COALESCE(val, 'N/A') AS val_or_na, COALESCE(num, 0) AS num_or_zero FROM nullable_test ORDER BY id; +val_or_na num_or_zero +hello 10 +N/A 20 +world 0 +N/A 0 +DROP TABLE nullable_test; +# +# ============================================ +# TEST 28: Self-join +# ============================================ +# +SELECT e1.name AS employee, e2.name AS colleague +FROM employees e1 +JOIN employees e2 ON e1.dept_id = e2.dept_id AND e1.emp_id < e2.emp_id +WHERE e1.dept_id = 1 +ORDER BY e1.emp_id, e2.emp_id; +employee colleague +Alice Bob +Alice Grace +Bob Grace +# +# ============================================ +# TEST 29: Aggregate with JOIN and GROUP BY +# ============================================ +# +SELECT p.proj_name, COUNT(ep.emp_id) AS team_size, SUM(ep.hours) AS total_hours +FROM projects p +LEFT JOIN emp_projects ep ON p.proj_id = ep.proj_id +GROUP BY p.proj_id, p.proj_name +ORDER BY p.proj_id; +proj_name team_size total_hours +Project Alpha 3 90 +Project Beta 3 75 +Campaign X 3 95 +Audit 2024 3 110 +Onboarding 1 40 +# +# ============================================ +# TEST 30: Nested aggregation (max of avg) +# ============================================ +# +SELECT dept_id, avg_sal FROM ( +SELECT dept_id, AVG(salary) AS avg_sal +FROM employees +GROUP BY dept_id +) t +WHERE avg_sal = ( +SELECT MAX(avg_sal) FROM ( +SELECT AVG(salary) AS avg_sal FROM employees GROUP BY dept_id +) t2 +); +dept_id avg_sal +3 94000.000000 +# +# ============================================ +# TEST 31: UNION with ORDER BY and LIMIT +# ============================================ +# +(SELECT name, salary FROM employees WHERE dept_id = 1 ORDER BY salary DESC LIMIT 2) +UNION ALL +(SELECT name, salary FROM employees WHERE dept_id = 3 ORDER BY salary DESC LIMIT 2) +ORDER BY salary DESC; +name salary +Eve 105000.00 +Alice 95000.00 +Frank 92000.00 +Bob 88000.00 +# +# ============================================ +# TEST 32: Multi-statement transaction +# 
============================================ +# +BEGIN; +INSERT INTO employees VALUES (11, 'Kim', 1, 99000.00, '2024-01-01'); +UPDATE employees SET salary = salary + 1000 WHERE emp_id = 11; +SELECT emp_id, name, salary FROM employees WHERE emp_id = 11; +emp_id name salary +11 Kim 100000.00 +COMMIT; +SELECT emp_id, name, salary FROM employees WHERE emp_id = 11; +emp_id name salary +11 Kim 100000.00 +DELETE FROM employees WHERE emp_id = 11; +# +# ============================================ +# TEST 33: Transaction ROLLBACK +# ============================================ +# +BEGIN; +INSERT INTO employees VALUES (12, 'Leo', 2, 77000.00, '2024-02-01'); +SELECT COUNT(*) AS cnt_with_leo FROM employees WHERE emp_id = 12; +cnt_with_leo +1 +ROLLBACK; +SELECT COUNT(*) AS cnt_after_rollback FROM employees WHERE emp_id = 12; +cnt_after_rollback +0 +# +# ============================================ +# TEST 34: IF / IFNULL / NULLIF functions +# ============================================ +# +SELECT name, +IF(salary > 90000, 'Y', 'N') AS high_earner, +NULLIF(dept_id, 4) AS dept_or_null +FROM employees +ORDER BY emp_id; +name high_earner dept_or_null +Alice Y 1 +Bob N 1 +Carol N 2 +Dave N 2 +Eve Y 3 +Frank Y 3 +Grace N 1 +Hank N NULL +Ivy N 2 +Jack N 3 +# +# ============================================ +# TEST 35: String functions +# ============================================ +# +SELECT name, +UPPER(name) AS upper_name, +LENGTH(name) AS name_len, +CONCAT(name, ' (', dept_id, ')') AS name_dept +FROM employees +ORDER BY emp_id +LIMIT 5; +name upper_name name_len name_dept +Alice ALICE 5 Alice (1) +Bob BOB 3 Bob (1) +Carol CAROL 5 Carol (2) +Dave DAVE 4 Dave (2) +Eve EVE 3 Eve (3) +# +# ============================================ +# TEST 36: Date functions +# ============================================ +# +SELECT name, hire_date, +YEAR(hire_date) AS hire_year, +MONTH(hire_date) AS hire_month +FROM employees +ORDER BY emp_id +LIMIT 5; +name hire_date hire_year hire_month +Alice 
2020-01-15 2020 1 +Bob 2019-06-01 2019 6 +Carol 2021-03-10 2021 3 +Dave 2022-07-20 2022 7 +Eve 2018-11-05 2018 11 +SELECT YEAR(hire_date) AS yr, COUNT(*) AS hired +FROM employees +GROUP BY yr +ORDER BY yr; +yr hired +2018 1 +2019 1 +2020 3 +2021 2 +2022 2 +2023 1 +# +# ============================================ +# TEST 37: Arithmetic expressions +# ============================================ +# +SELECT name, salary, +salary * 12 AS annual, +ROUND(salary / 160, 2) AS hourly_rate +FROM employees +ORDER BY emp_id +LIMIT 5; +name salary annual hourly_rate +Alice 95000.00 1140000.00 593.75 +Bob 88000.00 1056000.00 550.00 +Carol 72000.00 864000.00 450.00 +Dave 68000.00 816000.00 425.00 +Eve 105000.00 1260000.00 656.25 +# +# ============================================ +# TEST 38: HAVING with complex condition +# ============================================ +# +SELECT d.dept_name, COUNT(*) AS cnt, AVG(e.salary) AS avg_sal +FROM employees e +JOIN departments d ON e.dept_id = d.dept_id +GROUP BY d.dept_id, d.dept_name +HAVING cnt >= 2 AND avg_sal > 75000 +ORDER BY d.dept_id; +dept_name cnt avg_sal +Engineering 3 87000.000000 +Finance 3 94000.000000 +# +# ============================================ +# TEST 39: ALL / ANY subquery +# ============================================ +# +SELECT name, salary +FROM employees +WHERE salary > ALL (SELECT salary FROM employees WHERE dept_id = 2) +ORDER BY emp_id; +name salary +Alice 95000.00 +Bob 88000.00 +Eve 105000.00 +Frank 92000.00 +Grace 78000.00 +Jack 85000.00 +SELECT name, salary +FROM employees +WHERE salary > ANY (SELECT salary FROM employees WHERE dept_id = 1) +ORDER BY emp_id; +name salary +Alice 95000.00 +Bob 88000.00 +Eve 105000.00 +Frank 92000.00 +Jack 85000.00 +# +# ============================================ +# TEST 40: CREATE TABLE ... 
AS SELECT +# ============================================ +# +CREATE TABLE dept_summary ENGINE=TIDESDB AS +SELECT d.dept_id, d.dept_name, COUNT(e.emp_id) AS headcount, SUM(e.salary) AS total_sal +FROM departments d +LEFT JOIN employees e ON d.dept_id = e.dept_id +GROUP BY d.dept_id, d.dept_name; +SELECT * FROM dept_summary ORDER BY dept_id; +dept_id dept_name headcount total_sal +1 Engineering 3 261000.00 +2 Marketing 3 211000.00 +3 Finance 3 282000.00 +4 HR 1 65000.00 +DROP TABLE dept_summary; +# +# ============================================ +# CLEANUP +# ============================================ +# +DROP TABLE emp_projects; +DROP TABLE projects; +DROP TABLE employees; +DROP TABLE departments; diff --git a/mysql-test/suite/tidesdb/r/tidesdb_status_vars.result b/mysql-test/suite/tidesdb/r/tidesdb_status_vars.result new file mode 100644 index 0000000000000..1666324df09fa --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_status_vars.result @@ -0,0 +1,75 @@ +# +# TEST 1: Status variables exist +# +SELECT COUNT(*) >= 19 AS has_all_vars FROM information_schema.GLOBAL_STATUS +WHERE VARIABLE_NAME LIKE 'TIDESDB%'; +has_all_vars +1 +# +# TEST 2: Variables have reasonable values after table operations +# +CREATE TABLE t_stat (id INT PRIMARY KEY, v VARCHAR(200)) ENGINE=TidesDB; +INSERT INTO t_stat VALUES (1, REPEAT('A', 100)), (2, REPEAT('B', 100)); +SELECT * FROM t_stat ORDER BY id; +id v +1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +2 BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +SHOW ENGINE TIDESDB STATUS; +SELECT VARIABLE_VALUE > 0 AS cf_positive FROM information_schema.GLOBAL_STATUS +WHERE VARIABLE_NAME = 'TIDESDB_COLUMN_FAMILIES'; +cf_positive +1 +SELECT VARIABLE_VALUE > 0 AS mem_positive FROM information_schema.GLOBAL_STATUS +WHERE VARIABLE_NAME = 'TIDESDB_MEMORY_LIMIT'; +mem_positive +1 +SELECT VARIABLE_VALUE > 0 AS parts_positive FROM 
information_schema.GLOBAL_STATUS +WHERE VARIABLE_NAME = 'TIDESDB_CACHE_PARTITIONS'; +parts_positive +1 +# +# TEST 3: All variable names are correct +# +SELECT VARIABLE_NAME FROM information_schema.GLOBAL_STATUS +WHERE VARIABLE_NAME LIKE 'TIDESDB%' ORDER BY VARIABLE_NAME; +VARIABLE_NAME +TIDESDB_BACKPRESSURE_WAITS +TIDESDB_BACKPRESSURE_WAIT_US +TIDESDB_CACHE_BYTES +TIDESDB_CACHE_ENTRIES +TIDESDB_CACHE_HITS +TIDESDB_CACHE_HIT_RATE +TIDESDB_CACHE_MISSES +TIDESDB_CACHE_PARTITIONS +TIDESDB_COLUMN_FAMILIES +TIDESDB_COMPACTION_QUEUE +TIDESDB_DATA_SIZE_BYTES +TIDESDB_FLUSH_PENDING +TIDESDB_FLUSH_QUEUE +TIDESDB_GLOBAL_SEQUENCE +TIDESDB_IMMUTABLE_MEMTABLES +TIDESDB_LOCK_CHAIN_MAX +TIDESDB_LOCK_DEADLOCKS +TIDESDB_LOCK_ENTRIES +TIDESDB_LOCK_ENTRY_RECYCLES +TIDESDB_LOCK_HELD +TIDESDB_LOCK_TIMEOUTS +TIDESDB_LOCK_WAITS +TIDESDB_LOCK_WAIT_US +TIDESDB_MAX_SST_TOMBSTONE_DENSITY +TIDESDB_MAX_SST_TOMBSTONE_DENSITY_LEVEL +TIDESDB_MEMORY_LIMIT +TIDESDB_MEMORY_PRESSURE +TIDESDB_MEMTABLE_BYTES +TIDESDB_OPEN_SSTABLES +TIDESDB_TOMBSTONE_RATIO +TIDESDB_TOTAL_SSTABLES +TIDESDB_TOTAL_TOMBSTONES +TIDESDB_TXN_MEMORY_BYTES +TIDESDB_VERSION +TIDESDB_VERSION_HEX +# +# Cleanup +# +DROP TABLE t_stat; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_stress.result b/mysql-test/suite/tidesdb/r/tidesdb_stress.result new file mode 100644 index 0000000000000..711b68e32fcd3 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_stress.result @@ -0,0 +1,490 @@ +# +# === Setup === +# +CREATE TABLE stress_main ( +id INT PRIMARY KEY, +val VARCHAR(200), +score INT, +KEY idx_score (score) +) ENGINE=TIDESDB; +CREATE TABLE stress_nopk ( +a INT, +b VARCHAR(100) +) ENGINE=TIDESDB; +CREATE TABLE stress_wide ( +id INT PRIMARY KEY, +c1 VARCHAR(100), +c2 VARCHAR(100), +c3 INT, +c4 BIGINT, +c5 DECIMAL(10,2), +c6 DATE, +KEY idx_c3 (c3), +KEY idx_c4 (c4) +) ENGINE=TIDESDB; +# +# ============================================ +# TEST 1: Multi-statement transaction -- deferred commit path +# Exercises: tidesdb_commit(all=false) returning early, +# iterator reuse across statements, single commit at END. +# ============================================ +# +BEGIN; +INSERT INTO stress_main VALUES (1, 'txn_row_1', 10); +INSERT INTO stress_main VALUES (2, 'txn_row_2', 20); +INSERT INTO stress_main VALUES (3, 'txn_row_3', 30); +UPDATE stress_main SET val = 'updated_in_txn' WHERE id = 2; +DELETE FROM stress_main WHERE id = 3; +SELECT COUNT(*) AS cnt FROM stress_main; +cnt +2 +COMMIT; +SELECT * FROM stress_main ORDER BY id; +id val score +1 txn_row_1 10 +2 updated_in_txn 20 +# +# ============================================ +# TEST 2: Autocommit path -- each statement commits immediately +# Exercises: tidesdb_commit(all=false) with autocommit (real commit). 
+# ============================================ +# +INSERT INTO stress_main VALUES (3, 'autocommit_3', 30); +INSERT INTO stress_main VALUES (4, 'autocommit_4', 40); +UPDATE stress_main SET score = score + 100; +SELECT * FROM stress_main ORDER BY id; +id val score +1 txn_row_1 110 +2 updated_in_txn 120 +3 autocommit_3 130 +4 autocommit_4 140 +# +# ============================================ +# TEST 3: Explicit ROLLBACK -- transaction-level rollback +# Exercises: tidesdb_rollback(all=true), txn_reset after rollback. +# ============================================ +# +BEGIN; +INSERT INTO stress_main VALUES (99, 'will_rollback', 999); +UPDATE stress_main SET val = 'dirty' WHERE id = 1; +SELECT COUNT(*) AS cnt FROM stress_main; +cnt +5 +ROLLBACK; +SELECT * FROM stress_main ORDER BY id; +id val score +1 txn_row_1 110 +2 updated_in_txn 120 +3 autocommit_3 130 +4 autocommit_4 140 +# +# ============================================ +# TEST 4: Mixed reads and writes in one transaction +# Exercises: iterator reuse across read+write statements, +# scan_iter surviving F_UNLCK when txn is deferred. +# ============================================ +# +BEGIN; +SELECT COUNT(*) AS before_cnt FROM stress_main; +before_cnt +4 +INSERT INTO stress_main VALUES (5, 'mixed_5', 50); +SELECT COUNT(*) AS mid_cnt FROM stress_main; +mid_cnt +5 +UPDATE stress_main SET score = 0 WHERE id = 5; +SELECT * FROM stress_main WHERE id = 5; +id val score +5 mixed_5 0 +DELETE FROM stress_main WHERE id = 4; +SELECT COUNT(*) AS after_cnt FROM stress_main; +after_cnt +4 +COMMIT; +SELECT * FROM stress_main ORDER BY id; +id val score +1 txn_row_1 110 +2 updated_in_txn 120 +3 autocommit_3 130 +5 mixed_5 0 +# +# ============================================ +# TEST 5: Secondary index scan under transaction +# Exercises: index_read_map, sec_idx_key, iterator on index CF. 
+# ============================================ +# +BEGIN; +INSERT INTO stress_main VALUES (6, 'idx_6', 60); +INSERT INTO stress_main VALUES (7, 'idx_7', 70); +INSERT INTO stress_main VALUES (8, 'idx_8', 60); +COMMIT; +SELECT id, val, score FROM stress_main WHERE score = 60 ORDER BY id; +id val score +6 idx_6 60 +8 idx_8 60 +SELECT id, val, score FROM stress_main WHERE score >= 100 ORDER BY id; +id val score +1 txn_row_1 110 +2 updated_in_txn 120 +3 autocommit_3 130 +SELECT id, val, score FROM stress_main WHERE score BETWEEN 50 AND 120 ORDER BY id; +id val score +1 txn_row_1 110 +2 updated_in_txn 120 +6 idx_6 60 +7 idx_7 70 +8 idx_8 60 +# +# ============================================ +# TEST 6: Hidden PK table -- exercises next_row_id generation +# ============================================ +# +BEGIN; +INSERT INTO stress_nopk VALUES (1, 'nopk_a'); +INSERT INTO stress_nopk VALUES (2, 'nopk_b'); +INSERT INTO stress_nopk VALUES (3, 'nopk_c'); +COMMIT; +SELECT * FROM stress_nopk ORDER BY a; +a b +1 nopk_a +2 nopk_b +3 nopk_c +UPDATE stress_nopk SET b = 'updated' WHERE a = 2; +SELECT * FROM stress_nopk ORDER BY a; +a b +1 nopk_a +2 updated +3 nopk_c +DELETE FROM stress_nopk WHERE a = 1; +SELECT COUNT(*) AS cnt FROM stress_nopk; +cnt +2 +# +# ============================================ +# TEST 7: Large batch insert -- memtable pressure +# Exercises: write_buffer flush, iterator over many keys. +# ============================================ +# +SELECT COUNT(*) AS cnt FROM stress_main; +cnt +507 +SELECT COUNT(*) AS high_score FROM stress_main WHERE score >= 40; +high_score +106 +# +# ============================================ +# TEST 8: Large batch in single transaction +# Exercises: many writes buffered in one txn, single commit. 
+# ============================================ +# +BEGIN; +COMMIT; +SELECT COUNT(*) AS cnt FROM stress_wide; +cnt +500 +SELECT COUNT(*) AS idx_match FROM stress_wide WHERE c3 = 50; +idx_match +5 +SELECT COUNT(*) AS idx_range FROM stress_wide WHERE c4 BETWEEN 10000 AND 10100; +idx_range +11 +# +# ============================================ +# TEST 9: Bulk UPDATE + DELETE in transaction +# Exercises: update_row and delete_row across many rows, +# secondary index maintenance (old key delete + new key insert). +# ============================================ +# +BEGIN; +UPDATE stress_wide SET c3 = c3 + 200 WHERE c3 < 10; +DELETE FROM stress_wide WHERE c4 > 14000; +COMMIT; +SELECT COUNT(*) AS cnt FROM stress_wide; +cnt +401 +SELECT MIN(c3) AS min_c3, MAX(c3) AS max_c3 FROM stress_wide; +min_c3 max_c3 +10 209 +# +# ============================================ +# TEST 10: TRUNCATE -- exercises delete_all_rows +# Exercises: txn rollback+free before CF drop, CF recreate, +# share->cf pointer update. +# ============================================ +# +SELECT COUNT(*) AS before_trunc FROM stress_wide; +before_trunc +401 +TRUNCATE TABLE stress_wide; +SELECT COUNT(*) AS after_trunc FROM stress_wide; +after_trunc +0 +INSERT INTO stress_wide VALUES (1, 'post_trunc', 'ok', 1, 1, 1.00, '2025-06-01'); +SELECT * FROM stress_wide; +id c1 c2 c3 c4 c5 c6 +1 post_trunc ok 1 1 1.00 2025-06-01 +# +# ============================================ +# TEST 11: Concurrent readers and writers +# Exercises: multiple connections with overlapping transactions, +# lock-free MVCC concurrency, separate per-connection txns. 
+# ============================================ +# +DELETE FROM stress_main WHERE id >= 100; +SELECT COUNT(*) AS base_cnt FROM stress_main; +base_cnt +7 +connect writer1, localhost, root,,; +connect writer2, localhost, root,,; +connect reader1, localhost, root,,; +connection writer1; +BEGIN; +INSERT INTO stress_main VALUES (1001, 'w1_a', 11); +connection writer2; +INSERT INTO stress_main VALUES (2001, 'w2_a', 22); +connection writer1; +INSERT INTO stress_main VALUES (1002, 'w1_b', 12); +connection writer2; +INSERT INTO stress_main VALUES (2002, 'w2_b', 23); +connection writer1; +connection writer2; +connection reader1; +SELECT COUNT(*) AS reader_sees FROM stress_main; +reader_sees +9 +connection writer1; +COMMIT; +connection writer2; +INSERT INTO stress_main VALUES (2003, 'w2_c', 24); +connection default; +SELECT COUNT(*) AS final_cnt FROM stress_main WHERE id >= 1000; +final_cnt +5 +disconnect writer1; +disconnect writer2; +disconnect reader1; +# +# ============================================ +# TEST 12: Concurrent transactions with rollback +# Exercises: one connection commits, another rolls back. 
+# ============================================ +# +connect conn_commit, localhost, root,,; +connect conn_rollback, localhost, root,,; +connection conn_commit; +BEGIN; +INSERT INTO stress_main VALUES (3001, 'will_commit', 31); +connection conn_rollback; +BEGIN; +INSERT INTO stress_main VALUES (4001, 'will_rollback', 41); +connection conn_commit; +INSERT INTO stress_main VALUES (3002, 'will_commit_2', 32); +connection conn_rollback; +INSERT INTO stress_main VALUES (4002, 'will_rollback_2', 42); +connection conn_commit; +COMMIT; +connection conn_rollback; +ROLLBACK; +connection default; +SELECT id, val FROM stress_main WHERE id IN (3001, 3002, 4001, 4002) ORDER BY id; +id val +3001 will_commit +3002 will_commit_2 +disconnect conn_commit; +disconnect conn_rollback; +# +# ============================================ +# TEST 13: Rapid open/close cycle -- exercises close() cleanup +# Multiple short-lived connections each doing a quick operation. +# ============================================ +# +connect rapid1, localhost, root,,; +connection rapid1; +SELECT COUNT(*) > 0 AS has_rows FROM stress_main; +has_rows +1 +disconnect rapid1; +connect rapid2, localhost, root,,; +connection rapid2; +INSERT INTO stress_main VALUES (5001, 'rapid', 50); +disconnect rapid2; +connect rapid3, localhost, root,,; +connection rapid3; +BEGIN; +INSERT INTO stress_main VALUES (5002, 'rapid_txn', 51); +COMMIT; +disconnect rapid3; +connection default; +SELECT COUNT(*) AS rapid_cnt FROM stress_main WHERE id IN (5001, 5002); +rapid_cnt +2 +# +# ============================================ +# TEST 14: INSERT...SELECT across TidesDB tables in transaction +# Exercises: read from one CF + write to another in same txn. 
+# ============================================ +# +TRUNCATE TABLE stress_wide; +BEGIN; +INSERT INTO stress_wide (id, c1, c2, c3, c4, c5, c6) +SELECT id, val, val, score, score * 10, score + 0.50, '2025-01-01' + FROM stress_main +WHERE id <= 8; +COMMIT; +SELECT COUNT(*) AS copied FROM stress_wide; +copied +7 +SELECT * FROM stress_wide ORDER BY id; +id c1 c2 c3 c4 c5 c6 +1 txn_row_1 txn_row_1 110 1100 110.50 2025-01-01 +2 updated_in_txn updated_in_txn 120 1200 120.50 2025-01-01 +3 autocommit_3 autocommit_3 130 1300 130.50 2025-01-01 +5 mixed_5 mixed_5 0 0 0.50 2025-01-01 +6 idx_6 idx_6 60 600 60.50 2025-01-01 +7 idx_7 idx_7 70 700 70.50 2025-01-01 +8 idx_8 idx_8 60 600 60.50 2025-01-01 +# +# ============================================ +# TEST 15: UPDATE that changes secondary index key +# Exercises: sec index delete(old) + insert(new) in update_row. +# ============================================ +# +SELECT id, score FROM stress_main WHERE id <= 5 ORDER BY id; +id score +1 110 +2 120 +3 130 +5 0 +BEGIN; +UPDATE stress_main SET score = score + 1000 WHERE id <= 5; +COMMIT; +SELECT id, score FROM stress_main WHERE score >= 1000 ORDER BY id; +id score +1 1110 +2 1120 +3 1130 +5 1000 +BEGIN; +UPDATE stress_main SET score = score - 1000 WHERE id <= 5; +COMMIT; +SELECT id, score FROM stress_main WHERE id <= 5 ORDER BY id; +id score +1 110 +2 120 +3 130 +5 0 +# +# ============================================ +# TEST 16: Concurrent bulk writers + reader +# Exercises: heavy concurrent write pressure from multiple +# connections, verifies no data corruption. 
+# ============================================ +# +CREATE TABLE stress_bulk (id INT PRIMARY KEY, val VARCHAR(50)) ENGINE=TIDESDB; +connect bulk1, localhost, root,,; +connect bulk2, localhost, root,,; +connect bulk3, localhost, root,,; +connection bulk1; +BEGIN; +connection bulk2; +BEGIN; +connection bulk1; +connection bulk2; +connection bulk1; +COMMIT; +connection bulk2; +COMMIT; +connection bulk1; +connection bulk2; +connection bulk3; +SELECT COUNT(*) AS bulk_total FROM stress_bulk; +bulk_total +200 +SELECT COUNT(DISTINCT id) AS unique_ids FROM stress_bulk; +unique_ids +200 +connection default; +disconnect bulk1; +disconnect bulk2; +disconnect bulk3; +DROP TABLE stress_bulk; +# +# ============================================ +# TEST 17: Repeated TRUNCATE + re-insert cycle +# Exercises: repeated CF drop/recreate, share->cf pointer +# update, txn discard before drop. +# ============================================ +# +CREATE TABLE stress_trunc (id INT PRIMARY KEY, val VARCHAR(50)) ENGINE=TIDESDB; +TRUNCATE TABLE stress_trunc; +TRUNCATE TABLE stress_trunc; +TRUNCATE TABLE stress_trunc; +TRUNCATE TABLE stress_trunc; +TRUNCATE TABLE stress_trunc; +SELECT COUNT(*) AS after_cycles FROM stress_trunc; +after_cycles +0 +INSERT INTO stress_trunc VALUES (1, 'final'); +SELECT * FROM stress_trunc; +id val +1 final +DROP TABLE stress_trunc; +# +# ============================================ +# TEST 18: Transaction with only reads (read-only txn path) +# Exercises: tidesdb_commit with dirty=false, rollback+reset path. +# ============================================ +# +BEGIN; +SELECT COUNT(*) AS ro_cnt FROM stress_main; +ro_cnt +16 +SELECT * FROM stress_main WHERE id = 1; +id val score +1 txn_row_1 110 +SELECT MIN(score) AS min_s, MAX(score) AS max_s FROM stress_main; +min_s max_s +0 130 +COMMIT; +# +# ============================================ +# TEST 19: PK uniqueness enforcement and REPLACE INTO +# Duplicate PK INSERT must return an error. 
+# REPLACE INTO overwrites the existing row. +# ============================================ +# +CREATE TABLE stress_uniq (id INT PRIMARY KEY, val VARCHAR(50)) ENGINE=TIDESDB; +INSERT INTO stress_uniq VALUES (1, 'first'); +INSERT INTO stress_uniq VALUES (1, 'should_fail'); +ERROR 23000: Duplicate entry '1' for key 'PRIMARY' +REPLACE INTO stress_uniq VALUES (1, 'replaced'); +BEGIN; +INSERT INTO stress_uniq VALUES (2, 'second'); +REPLACE INTO stress_uniq VALUES (1, 'overwritten'); +INSERT INTO stress_uniq VALUES (3, 'third'); +COMMIT; +SELECT * FROM stress_uniq ORDER BY id; +id val +1 overwritten +2 second +3 third +DROP TABLE stress_uniq; +# +# ============================================ +# TEST 20: Verify data integrity after all stress +# Final consistency check on the main table. +# ============================================ +# +SELECT COUNT(*) AS total FROM stress_main; +total +16 +SELECT COUNT(*) AS idx_total FROM stress_main WHERE score >= 0 OR score < 0 OR score IS NULL; +idx_total +16 +# +# === Cleanup === +# +DROP TABLE stress_main; +DROP TABLE stress_nopk; +DROP TABLE stress_wide; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_tombstone_density.result b/mysql-test/suite/tidesdb/r/tidesdb_tombstone_density.result new file mode 100644 index 0000000000000..716a026a7a5db --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_tombstone_density.result @@ -0,0 +1,143 @@ +# +# === Table-level tombstone density options accept and persist === +# +CREATE TABLE t_td ( +pk BIGINT PRIMARY KEY, +c0 INT, +KEY (c0) +) ENGINE=TIDESDB TOMBSTONE_DENSITY_TRIGGER=5000 TOMBSTONE_DENSITY_MIN_ENTRIES=512; +SELECT LOCATE('TOMBSTONE_DENSITY_TRIGGER', CREATE_OPTIONS) > 0 AS has_trigger +FROM information_schema.TABLES +WHERE TABLE_SCHEMA='test' AND TABLE_NAME='t_td'; +has_trigger +1 +SELECT LOCATE('=5000', CREATE_OPTIONS) > 0 AS trigger_value +FROM information_schema.TABLES +WHERE TABLE_SCHEMA='test' AND TABLE_NAME='t_td'; +trigger_value +1 +SELECT LOCATE('=512', CREATE_OPTIONS) > 0 AS min_entries_value +FROM information_schema.TABLES +WHERE TABLE_SCHEMA='test' AND TABLE_NAME='t_td'; +min_entries_value +1 +ALTER TABLE t_td TOMBSTONE_DENSITY_TRIGGER=2000; +SELECT LOCATE('=2000', CREATE_OPTIONS) > 0 AS new_value +FROM information_schema.TABLES +WHERE TABLE_SCHEMA='test' AND TABLE_NAME='t_td'; +new_value +1 +DROP TABLE t_td; +# +# === Session-default inheritance === +# +SET SESSION tidesdb_default_tombstone_density_trigger = 4000; +SET SESSION tidesdb_default_tombstone_density_min_entries = 256; +CREATE TABLE t_default_td (pk BIGINT PRIMARY KEY, c0 INT) ENGINE=TIDESDB; +SELECT LOCATE('=4000', CREATE_OPTIONS) > 0 AS inherits_trigger +FROM information_schema.TABLES +WHERE TABLE_SCHEMA='test' AND TABLE_NAME='t_default_td'; +inherits_trigger +1 +SELECT LOCATE('=256', CREATE_OPTIONS) > 0 AS inherits_min +FROM information_schema.TABLES +WHERE TABLE_SCHEMA='test' AND TABLE_NAME='t_default_td'; +inherits_min +1 +DROP TABLE t_default_td; +SET SESSION tidesdb_default_tombstone_density_trigger = DEFAULT; +SET SESSION tidesdb_default_tombstone_density_min_entries = DEFAULT; +# +# === 
Auto compact-after-range-delete session variable === +# +SHOW VARIABLES LIKE 'tidesdb_compact_after_range_delete_min_rows'; +Variable_name Value +tidesdb_compact_after_range_delete_min_rows 0 +CREATE TABLE t_auto ( +pk BIGINT PRIMARY KEY, +c0 INT, +c1 INT, +KEY (c0), +KEY (c1) +) ENGINE=TIDESDB; +INSERT INTO t_auto (pk,c0,c1) VALUES +(1,0,2),(2,1,4),(3,2,6),(4,3,8),(5,4,10), +(6,5,12),(7,6,14),(8,7,16),(9,8,18),(10,9,20), +(11,0,22),(12,1,24),(13,2,26),(14,3,28),(15,4,30), +(16,5,32),(17,6,34),(18,7,36),(19,8,38),(20,9,40), +(21,0,42),(22,1,44),(23,2,46),(24,3,48),(25,4,50), +(26,5,52),(27,6,54),(28,7,56),(29,8,58),(30,9,60), +(31,0,62),(32,1,64),(33,2,66),(34,3,68),(35,4,70), +(36,5,72),(37,6,74),(38,7,76),(39,8,78),(40,9,80), +(41,0,82),(42,1,84),(43,2,86),(44,3,88),(45,4,90), +(46,5,92),(47,6,94),(48,7,96),(49,8,98),(50,9,100); +INSERT INTO t_auto (pk,c0,c1) VALUES +(51,0,102),(52,1,104),(53,2,106),(54,3,108),(55,4,110), +(56,5,112),(57,6,114),(58,7,116),(59,8,118),(60,9,120), +(61,0,122),(62,1,124),(63,2,126),(64,3,128),(65,4,130), +(66,5,132),(67,6,134),(68,7,136),(69,8,138),(70,9,140), +(71,0,142),(72,1,144),(73,2,146),(74,3,148),(75,4,150), +(76,5,152),(77,6,154),(78,7,156),(79,8,158),(80,9,160), +(81,0,162),(82,1,164),(83,2,166),(84,3,168),(85,4,170), +(86,5,172),(87,6,174),(88,7,176),(89,8,178),(90,9,180), +(91,0,182),(92,1,184),(93,2,186),(94,3,188),(95,4,190), +(96,5,192),(97,6,194),(98,7,196),(99,8,198),(100,9,200); +SELECT COUNT(*) FROM t_auto; +COUNT(*) +100 +# threshold below the deleted-row count, auto compact fires silently. +# We assert reads remain correct after the synchronous compaction. 
+SET SESSION tidesdb_compact_after_range_delete_min_rows = 20; +DELETE FROM t_auto WHERE pk BETWEEN 30 AND 70; +SELECT COUNT(*) FROM t_auto; +COUNT(*) +59 +SELECT pk FROM t_auto WHERE pk BETWEEN 28 AND 32 ORDER BY pk; +pk +28 +29 +SELECT pk FROM t_auto WHERE pk BETWEEN 68 AND 72 ORDER BY pk; +pk +71 +72 +SELECT pk FROM t_auto WHERE c0 = 5 AND pk < 70 ORDER BY pk; +pk +6 +16 +26 +SELECT pk FROM t_auto WHERE c1 = 134; +pk +# threshold above the deleted-row count, auto compact does NOT fire. +SET SESSION tidesdb_compact_after_range_delete_min_rows = 1000000; +DELETE FROM t_auto WHERE pk BETWEEN 75 AND 79; +SELECT COUNT(*) FROM t_auto; +COUNT(*) +54 +SELECT pk FROM t_auto WHERE pk BETWEEN 73 AND 81 ORDER BY pk; +pk +73 +74 +80 +81 +SET SESSION tidesdb_compact_after_range_delete_min_rows = DEFAULT; +DROP TABLE t_auto; +# +# === Tombstone status variables exist and are non-negative === +# +SELECT IF(VARIABLE_VALUE >= 0, 'ok', 'bad') AS total +FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='TIDESDB_TOTAL_TOMBSTONES'; +total +ok +SELECT IF(VARIABLE_VALUE >= 0, 'ok', 'bad') AS ratio +FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='TIDESDB_TOMBSTONE_RATIO'; +ratio +ok +SELECT IF(VARIABLE_VALUE >= 0, 'ok', 'bad') AS density +FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='TIDESDB_MAX_SST_TOMBSTONE_DENSITY'; +density +ok +SELECT IF(VARIABLE_VALUE >= 0, 'ok', 'bad') AS density_level +FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='TIDESDB_MAX_SST_TOMBSTONE_DENSITY_LEVEL'; +density_level +ok +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_tpcc_contention.result b/mysql-test/suite/tidesdb/r/tidesdb_tpcc_contention.result new file mode 100644 index 0000000000000..bbb50d288f0d9 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_tpcc_contention.result @@ -0,0 +1,108 @@ +# +# === Setup: TPC-C district table (simplified) === +# +CREATE TABLE district ( +d_w_id INT NOT NULL, +d_id INT NOT NULL, +d_next_o_id INT NOT NULL, +d_tax DECIMAL(4,4), +PRIMARY KEY (d_w_id, d_id) +) ENGINE=TIDESDB; +INSERT INTO district VALUES (1, 1, 3001, 0.1000); +CREATE TABLE orders ( +o_id INT NOT NULL, +o_w_id INT NOT NULL, +o_d_id INT NOT NULL, +o_c_id INT NOT NULL, +PRIMARY KEY (o_w_id, o_d_id, o_id) +) ENGINE=TIDESDB; +CREATE TABLE new_order ( +no_w_id INT NOT NULL, +no_d_id INT NOT NULL, +no_o_id INT NOT NULL, +PRIMARY KEY (no_w_id, no_d_id, no_o_id) +) ENGINE=TIDESDB; +# +# === TEST 1: Single-session NEWORD (baseline) === +# +BEGIN; +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1 FOR UPDATE; +d_next_o_id +3001 +UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; +INSERT INTO orders VALUES (3001, 1, 1, 42); +INSERT INTO new_order VALUES (1, 1, 3001); +COMMIT; +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1; +d_next_o_id +3002 +# +# === TEST 2: Two concurrent UPDATEs on same district row === +# With pessimistic_locking=ON, the second UPDATE blocks on the +# row lock until the first commits. Both succeed, counter +# increments by 2 with no conflicts and no lost updates. 
+# +connect connA, localhost, root,,; +connect connB, localhost, root,,; +connection connA; +BEGIN; +UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; +connection connB; +UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; +connection connA; +COMMIT; +connection connB; +connection default; +# Both UPDATEs succeeded: 3002 + 2 = 3004 +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1; +d_next_o_id +3004 +# +# === TEST 3: Serial counter increment (10 iterations) === +# Verify the counter works correctly when serialized. +# +# Should be initial(3004) + 10 = 3014 +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1; +d_next_o_id +3014 +# +# === TEST 4: 4 concurrent autocommit UPDATEs on same row === +# With pessimistic_locking=ON, all 4 serialize through the row lock. +# Counter should advance by exactly 4. +# +UPDATE district SET d_next_o_id = 5001 WHERE d_w_id=1 AND d_id=1; +connect storm1, localhost, root,,; +connect storm2, localhost, root,,; +connect storm3, localhost, root,,; +connect storm4, localhost, root,,; +connection storm1; +UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; +connection storm2; +UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; +connection storm3; +UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; +connection storm4; +UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; +connection storm1; +connection storm2; +connection storm3; +connection storm4; +connection default; +# All 4 UPDATEs succeeded through serialized row locks: 5001 + 4 = 5005 +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1; +d_next_o_id +5005 +# +# === Cleanup === +# +disconnect connA; +disconnect connB; +disconnect storm1; +disconnect storm2; +disconnect storm3; +disconnect storm4; +connection default; +DROP TABLE district; +DROP TABLE orders; +DROP TABLE new_order; +# Done. 
diff --git a/mysql-test/suite/tidesdb/r/tidesdb_ttl.result b/mysql-test/suite/tidesdb/r/tidesdb_ttl.result new file mode 100644 index 0000000000000..dc297b43fa555 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_ttl.result @@ -0,0 +1,199 @@ +# +# ============================================ +# TEST 1: Table-level TTL (short expiration) +# ============================================ +# +CREATE TABLE t_ttl_table ( +id INT PRIMARY KEY, +val VARCHAR(50) +) ENGINE=TIDESDB TTL=8; +INSERT INTO t_ttl_table VALUES (1, 'alpha'), (2, 'beta'), (3, 'gamma'); +# Rows should be visible immediately +SELECT * FROM t_ttl_table ORDER BY id; +id val +1 alpha +2 beta +3 gamma +# Wait for TTL to expire (3 seconds > 2 second TTL) +# Rows should now be expired (empty result) +SELECT * FROM t_ttl_table ORDER BY id; +id val +DROP TABLE t_ttl_table; +# +# ============================================ +# TEST 2: Per-row TTL via TTL_COL field option +# ============================================ +# +CREATE TABLE t_ttl_col ( +id INT PRIMARY KEY, +val VARCHAR(50), +expire_secs INT `TTL`=1 +) ENGINE=TIDESDB; +INSERT INTO t_ttl_col VALUES (1, 'short', 8), (2, 'long', 86400), (3, 'forever', 0); +# All three rows visible immediately +SELECT id, val FROM t_ttl_col ORDER BY id; +id val +1 short +2 long +3 forever +# Wait for the short TTL to expire +# Row 1 should be expired; rows 2 and 3 remain +SELECT id, val FROM t_ttl_col ORDER BY id; +id val +2 long +3 forever +DROP TABLE t_ttl_col; +# +# ============================================ +# TEST 3: Per-row TTL overrides table default +# ============================================ +# +CREATE TABLE t_ttl_override ( +id INT PRIMARY KEY, +val VARCHAR(50), +ttl_val INT `TTL`=1 +) ENGINE=TIDESDB TTL=86400; +INSERT INTO t_ttl_override VALUES (1, 'short_override', 8), (2, 'uses_default', 0); +# Both rows visible immediately +SELECT id, val FROM t_ttl_override ORDER BY id; +id val +1 short_override +2 uses_default +# Row 1 expired (per-row TTL=2 
overrode default); row 2 still alive (table TTL=86400) +SELECT id, val FROM t_ttl_override ORDER BY id; +id val +2 uses_default +DROP TABLE t_ttl_override; +# +# ============================================ +# TEST 4: TTL=0 means no expiration (default) +# ============================================ +# +CREATE TABLE t_ttl_none ( +id INT PRIMARY KEY, +val VARCHAR(50) +) ENGINE=TIDESDB TTL=0; +INSERT INTO t_ttl_none VALUES (1, 'permanent'); +# Row should still be present (TTL=0 = no expiration) +SELECT * FROM t_ttl_none ORDER BY id; +id val +1 permanent +DROP TABLE t_ttl_none; +# +# ============================================ +# TEST 5: TTL with UPDATE refreshes expiration +# ============================================ +# +CREATE TABLE t_ttl_update ( +id INT PRIMARY KEY, +val VARCHAR(50), +ttl_s INT `TTL`=1 +) ENGINE=TIDESDB; +INSERT INTO t_ttl_update VALUES (1, 'original', 8); +# Row visible immediately +SELECT id, val FROM t_ttl_update ORDER BY id; +id val +1 original +# UPDATE resets TTL to 5 more seconds +UPDATE t_ttl_update SET val = 'refreshed', ttl_s = 30 WHERE id = 1; +# Row should still be alive (UPDATE refreshed TTL at ~1s, now at ~3s, TTL=5s) +SELECT id, val FROM t_ttl_update ORDER BY id; +id val +1 refreshed +DROP TABLE t_ttl_update; +# +# ============================================ +# TEST 6: SHOW CREATE TABLE shows TTL options +# ============================================ +# +CREATE TABLE t_ttl_show ( +id INT PRIMARY KEY, +val VARCHAR(50), +row_ttl INT `TTL`=1 +) ENGINE=TIDESDB TTL=3600; +SHOW CREATE TABLE t_ttl_show; +Table Create Table +t_ttl_show CREATE TABLE `t_ttl_show` ( + `id` int(11) NOT NULL, + `val` varchar(50) DEFAULT NULL, + `row_ttl` int(11) DEFAULT NULL `TTL`=1, + PRIMARY KEY (`id`) +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci `TTL`=3600 +DROP TABLE t_ttl_show; +# +# ============================================ +# TEST 7: Session TTL override (SET SESSION) +# Table has no TTL; session variable applies +# 
============================================ +# +CREATE TABLE t_ttl_sess ( +id INT PRIMARY KEY, +val VARCHAR(50) +) ENGINE=TIDESDB; +# Default session TTL is 0 (no override) +SELECT @@session.tidesdb_ttl; +@@session.tidesdb_ttl +0 +SET SESSION tidesdb_ttl = 8; +INSERT INTO t_ttl_sess VALUES (1, 'session_ttl'), (2, 'also_session'); +# Rows visible immediately +SELECT * FROM t_ttl_sess ORDER BY id; +id val +1 session_ttl +2 also_session +SET SESSION tidesdb_ttl = 0; +# Wait for session TTL to expire (3s > 2s) +# Rows should now be expired +SELECT * FROM t_ttl_sess ORDER BY id; +id val +DROP TABLE t_ttl_sess; +# +# ============================================ +# TEST 8: SET STATEMENT tidesdb_ttl=N FOR ... +# Only the single statement gets TTL +# ============================================ +# +CREATE TABLE t_ttl_stmt ( +id INT PRIMARY KEY, +val VARCHAR(50) +) ENGINE=TIDESDB; +SET STATEMENT tidesdb_ttl=8 FOR +INSERT INTO t_ttl_stmt VALUES (1, 'short_lived'); +INSERT INTO t_ttl_stmt VALUES (2, 'permanent'); +# Both rows visible immediately +SELECT * FROM t_ttl_stmt ORDER BY id; +id val +1 short_lived +2 permanent +# Row 1 expired (session TTL=2); row 2 still alive (no TTL) +SELECT * FROM t_ttl_stmt ORDER BY id; +id val +2 permanent +DROP TABLE t_ttl_stmt; +# +# ============================================ +# TEST 9: Session TTL does NOT override per-row TTL_COL +# ============================================ +# +CREATE TABLE t_ttl_priority ( +id INT PRIMARY KEY, +val VARCHAR(50), +row_ttl INT `TTL`=1 +) ENGINE=TIDESDB; +SET SESSION tidesdb_ttl = 86400; +INSERT INTO t_ttl_priority VALUES (1, 'per_row_wins', 8); +INSERT INTO t_ttl_priority VALUES (2, 'uses_session', 0); +SET SESSION tidesdb_ttl = 0; +# Both visible immediately +SELECT id, val FROM t_ttl_priority ORDER BY id; +id val +1 per_row_wins +2 uses_session +# Row 1 expired (per-row TTL=2 wins); row 2 still alive (session TTL=86400) +SELECT id, val FROM t_ttl_priority ORDER BY id; +id val +2 uses_session +DROP 
TABLE t_ttl_priority; +# +# +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_unified_memtable.result b/mysql-test/suite/tidesdb/r/tidesdb_unified_memtable.result new file mode 100644 index 0000000000000..bf887bda2c4ee --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_unified_memtable.result @@ -0,0 +1,91 @@ +# +# TEST 1: Verify unified memtable is ON +# +SELECT @@tidesdb_unified_memtable AS unified; +unified +1 +# +# TEST 2: Multiple tables sharing the unified memtable +# +CREATE TABLE t_um1 (id INT PRIMARY KEY, v VARCHAR(100)) ENGINE=TidesDB; +CREATE TABLE t_um2 (id INT PRIMARY KEY, v VARCHAR(100)) ENGINE=TidesDB; +CREATE TABLE t_um3 (id INT PRIMARY KEY, v VARCHAR(100)) ENGINE=TidesDB; +BEGIN; +INSERT INTO t_um1 VALUES (1, 'table1_row1'); +INSERT INTO t_um2 VALUES (1, 'table2_row1'); +INSERT INTO t_um3 VALUES (1, 'table3_row1'); +COMMIT; +SELECT * FROM t_um1; +id v +1 table1_row1 +SELECT * FROM t_um2; +id v +1 table2_row1 +SELECT * FROM t_um3; +id v +1 table3_row1 +# +# TEST 3: Cross-table transaction atomicity +# +BEGIN; +INSERT INTO t_um1 VALUES (2, 'committed'); +INSERT INTO t_um2 VALUES (2, 'committed'); +INSERT INTO t_um3 VALUES (2, 'committed'); +COMMIT; +BEGIN; +INSERT INTO t_um1 VALUES (3, 'rolled_back'); +INSERT INTO t_um2 VALUES (3, 'rolled_back'); +ROLLBACK; +SELECT COUNT(*) AS t1_rows FROM t_um1; +t1_rows +2 +SELECT COUNT(*) AS t2_rows FROM t_um2; +t2_rows +2 +SELECT COUNT(*) AS t3_rows FROM t_um3; +t3_rows +2 +# +# TEST 4: Bulk write across tables (stresses unified WAL) +# +SELECT COUNT(*) AS t1_total FROM t_um1; +t1_total +43 +SELECT COUNT(*) AS t2_total FROM t_um2; +t2_total +43 +# +# TEST 5: OPTIMIZE TABLE with unified memtable +# +OPTIMIZE TABLE t_um1; +Table Op Msg_type Msg_text +test.t_um1 optimize status OK +OPTIMIZE TABLE t_um2; +Table Op Msg_type Msg_text +test.t_um2 optimize status OK +SELECT COUNT(*) AS after_optimize FROM t_um1; +after_optimize +43 +# +# TEST 6: Secondary indexes across multiple CFs in unified mode +# +CREATE 
TABLE t_um_idx ( +id INT PRIMARY KEY, +a INT, +b INT, +KEY(a), +KEY(b) +) ENGINE=TidesDB; +INSERT INTO t_um_idx VALUES (1, 10, 100), (2, 20, 200), (3, 10, 300); +SELECT id FROM t_um_idx WHERE a = 10 ORDER BY id; +id +1 +3 +SELECT id FROM t_um_idx WHERE b = 200; +id +2 +# +# Cleanup +# +DROP TABLE t_um1, t_um2, t_um3, t_um_idx; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_update_unique.result b/mysql-test/suite/tidesdb/r/tidesdb_update_unique.result new file mode 100644 index 0000000000000..fdb1d446f06b3 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_update_unique.result @@ -0,0 +1,57 @@ +# --- PRIMARY KEY collision --- +CREATE TABLE t1 (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +INSERT INTO t1 VALUES (1,10),(2,20); +UPDATE t1 SET id=2 WHERE id=1; +ERROR 23000: Duplicate entry '2' for key 'PRIMARY' +# Both rows must survive the rejected UPDATE +SELECT * FROM t1 ORDER BY id; +id v +1 10 +2 20 +# A non-colliding move still succeeds +UPDATE t1 SET id=3 WHERE id=1; +SELECT * FROM t1 ORDER BY id; +id v +2 20 +3 10 +DROP TABLE t1; +# --- UNIQUE secondary collision --- +CREATE TABLE t2 (id INT PRIMARY KEY, e VARCHAR(20), v INT, UNIQUE KEY(e)) ENGINE=TidesDB; +INSERT INTO t2 VALUES (1,'a',10),(2,'b',20); +UPDATE t2 SET e='b' WHERE id=1; +ERROR 23000: Duplicate entry 'b' for key 'e' +# No duplicate 'b' may exist after the rejected UPDATE +SELECT * FROM t2 ORDER BY id; +id e v +1 a 10 +2 b 20 +# Updating the unique column to a fresh value succeeds +UPDATE t2 SET e='c' WHERE id=1; +SELECT * FROM t2 ORDER BY id; +id e v +1 c 10 +2 b 20 +# Updating a non-indexed column leaves the unique value in place +UPDATE t2 SET v=99 WHERE id=1; +SELECT * FROM t2 ORDER BY id; +id e v +1 c 99 +2 b 20 +DROP TABLE t2; +# --- changing only the PK keeps a stable unique value valid --- +CREATE TABLE t3 (id INT PRIMARY KEY, e VARCHAR(20), UNIQUE KEY(e)) ENGINE=TidesDB; +INSERT INTO t3 VALUES (1,'x'),(2,'y'); +# moving id 1 to 3 keeps e='x' unique to that row, must succeed 
+UPDATE t3 SET id=3 WHERE id=1; +SELECT * FROM t3 ORDER BY id; +id e +2 y +3 x +DROP TABLE t3; +# --- tidesdb_skip_unique_check bypasses enforcement by contract --- +CREATE TABLE t4 (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +INSERT INTO t4 VALUES (1,10),(2,20); +SET SESSION tidesdb_skip_unique_check=1; +UPDATE t4 SET id=2 WHERE id=1; +SET SESSION tidesdb_skip_unique_check=DEFAULT; +DROP TABLE t4; diff --git a/mysql-test/suite/tidesdb/r/tidesdb_vcol.result b/mysql-test/suite/tidesdb/r/tidesdb_vcol.result new file mode 100644 index 0000000000000..fe19bbeec3f4f --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_vcol.result @@ -0,0 +1,197 @@ +# +# ============================================ +# TEST 1: VIRTUAL generated column +# ============================================ +# +CREATE TABLE t_vcol ( +id INT PRIMARY KEY, +price DECIMAL(10,2), +qty INT, +total DECIMAL(10,2) AS (price * qty) VIRTUAL +) ENGINE=TIDESDB; +INSERT INTO t_vcol (id, price, qty) VALUES (1, 10.50, 3); +INSERT INTO t_vcol (id, price, qty) VALUES (2, 25.00, 2); +INSERT INTO t_vcol (id, price, qty) VALUES (3, 5.75, 10); +# Virtual column 'total' should be computed on read +SELECT * FROM t_vcol ORDER BY id; +id price qty total +1 10.50 3 31.50 +2 25.00 2 50.00 +3 5.75 10 57.50 +# Update base column and verify virtual column recalculates +UPDATE t_vcol SET qty = 5 WHERE id = 1; +SELECT id, price, qty, total FROM t_vcol WHERE id = 1; +id price qty total +1 10.50 5 52.50 +DROP TABLE t_vcol; +# +# ============================================ +# TEST 2: STORED (PERSISTENT) generated column +# ============================================ +# +CREATE TABLE t_scol ( +id INT PRIMARY KEY, +first_name VARCHAR(50), +last_name VARCHAR(50), +full_name VARCHAR(101) AS (CONCAT(first_name, ' ', last_name)) PERSISTENT +) ENGINE=TIDESDB; +INSERT INTO t_scol (id, first_name, last_name) VALUES (1, 'John', 'Doe'); +INSERT INTO t_scol (id, first_name, last_name) VALUES (2, 'Jane', 'Smith'); +SELECT * FROM t_scol ORDER 
BY id; +id first_name last_name full_name +1 John Doe John Doe +2 Jane Smith Jane Smith +# Update base column and verify stored column updates +UPDATE t_scol SET last_name = 'Johnson' WHERE id = 1; +SELECT * FROM t_scol WHERE id = 1; +id first_name last_name full_name +1 John Johnson John Johnson +DROP TABLE t_scol; +# +# ============================================ +# TEST 3: Multiple virtual columns +# ============================================ +# +CREATE TABLE t_multi_vcol ( +id INT PRIMARY KEY, +radius DOUBLE, +area DOUBLE AS (PI() * radius * radius) VIRTUAL, +circumference DOUBLE AS (2 * PI() * radius) VIRTUAL, +diameter DOUBLE AS (2 * radius) VIRTUAL +) ENGINE=TIDESDB; +INSERT INTO t_multi_vcol (id, radius) VALUES (1, 5.0); +INSERT INTO t_multi_vcol (id, radius) VALUES (2, 10.0); +SELECT id, radius, ROUND(area, 2) AS area, ROUND(circumference, 2) AS circ, diameter +FROM t_multi_vcol ORDER BY id; +id radius area circ diameter +1 5 78.54 31.42 10 +2 10 314.16 62.83 20 +DROP TABLE t_multi_vcol; +# +# ============================================ +# TEST 4: Virtual column with conditional expression +# ============================================ +# +CREATE TABLE t_vcol_cond ( +id INT PRIMARY KEY, +score INT, +grade VARCHAR(10) AS ( +CASE +WHEN score >= 90 THEN 'A' + WHEN score >= 80 THEN 'B' + WHEN score >= 70 THEN 'C' + WHEN score >= 60 THEN 'D' + ELSE 'F' + END +) VIRTUAL +) ENGINE=TIDESDB; +INSERT INTO t_vcol_cond (id, score) VALUES (1, 95), (2, 82), (3, 71), (4, 55); +SELECT * FROM t_vcol_cond ORDER BY id; +id score grade +1 95 A +2 82 B +3 71 C +4 55 F +# Update score and verify grade recalculates +UPDATE t_vcol_cond SET score = 91 WHERE id = 4; +SELECT * FROM t_vcol_cond WHERE id = 4; +id score grade +4 91 A +DROP TABLE t_vcol_cond; +# +# ============================================ +# TEST 5: Mixed virtual and stored columns +# ============================================ +# +CREATE TABLE t_mixed ( +id INT PRIMARY KEY, +a INT, +b INT, +sum_ab INT AS (a + 
b) PERSISTENT, +product_ab INT AS (a * b) VIRTUAL, +diff_ab INT AS (a - b) VIRTUAL +) ENGINE=TIDESDB; +INSERT INTO t_mixed (id, a, b) VALUES (1, 10, 3), (2, 7, 4), (3, 15, 8); +SELECT * FROM t_mixed ORDER BY id; +id a b sum_ab product_ab diff_ab +1 10 3 13 30 7 +2 7 4 11 28 3 +3 15 8 23 120 7 +UPDATE t_mixed SET a = 20 WHERE id = 2; +SELECT * FROM t_mixed WHERE id = 2; +id a b sum_ab product_ab diff_ab +2 20 4 24 80 16 +DROP TABLE t_mixed; +# +# ============================================ +# TEST 6: Virtual column with string functions +# ============================================ +# +CREATE TABLE t_vcol_str ( +id INT PRIMARY KEY, +email VARCHAR(100), +domain VARCHAR(100) AS (SUBSTRING_INDEX(email, '@', -1)) VIRTUAL, +username VARCHAR(100) AS (SUBSTRING_INDEX(email, '@', 1)) VIRTUAL +) ENGINE=TIDESDB; +INSERT INTO t_vcol_str (id, email) VALUES +(1, 'alice@example.com'), +(2, 'bob@gmail.com'), +(3, 'charlie@company.org'); +SELECT * FROM t_vcol_str ORDER BY id; +id email domain username +1 alice@example.com example.com alice +2 bob@gmail.com gmail.com bob +3 charlie@company.org company.org charlie +# Verify WHERE clause on virtual column works +SELECT id, email FROM t_vcol_str WHERE domain = 'gmail.com'; +id email +2 bob@gmail.com +DROP TABLE t_vcol_str; +# +# ============================================ +# TEST 7: Virtual column with DELETE +# ============================================ +# +CREATE TABLE t_vcol_del ( +id INT PRIMARY KEY, +val INT, +doubled INT AS (val * 2) VIRTUAL +) ENGINE=TIDESDB; +INSERT INTO t_vcol_del (id, val) VALUES (1, 10), (2, 20), (3, 30); +SELECT * FROM t_vcol_del ORDER BY id; +id val doubled +1 10 20 +2 20 40 +3 30 60 +DELETE FROM t_vcol_del WHERE id = 2; +SELECT * FROM t_vcol_del ORDER BY id; +id val doubled +1 10 20 +3 30 60 +DROP TABLE t_vcol_del; +# +# ============================================ +# TEST 8: SHOW CREATE TABLE with virtual columns +# ============================================ +# +CREATE TABLE t_vcol_show ( +id INT 
PRIMARY KEY, +a INT, +b INT, +v_sum INT AS (a + b) VIRTUAL, +s_prod INT AS (a * b) PERSISTENT +) ENGINE=TIDESDB; +SHOW CREATE TABLE t_vcol_show; +Table Create Table +t_vcol_show CREATE TABLE `t_vcol_show` ( + `id` int(11) NOT NULL, + `a` int(11) DEFAULT NULL, + `b` int(11) DEFAULT NULL, + `v_sum` int(11) GENERATED ALWAYS AS (`a` + `b`) VIRTUAL, + `s_prod` int(11) GENERATED ALWAYS AS (`a` * `b`) STORED, + PRIMARY KEY (`id`) +) ENGINE=TidesDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci +DROP TABLE t_vcol_show; +# +# +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_vector.result b/mysql-test/suite/tidesdb/r/tidesdb_vector.result new file mode 100644 index 0000000000000..6ac008c611131 --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_vector.result @@ -0,0 +1,89 @@ +# +# Setup +# +CREATE TABLE docs ( +id INT NOT NULL PRIMARY KEY, +title VARCHAR(100), +v VECTOR(4) NOT NULL, +VECTOR INDEX (v) +) ENGINE=TidesDB; +INSERT INTO docs VALUES (1, 'origin-x', Vec_FromText('[1.0, 0.0, 0.0, 0.0]')); +INSERT INTO docs VALUES (2, 'origin-y', Vec_FromText('[0.0, 1.0, 0.0, 0.0]')); +INSERT INTO docs VALUES (3, 'origin-z', Vec_FromText('[0.0, 0.0, 1.0, 0.0]')); +INSERT INTO docs VALUES (4, 'near-x', Vec_FromText('[0.9, 0.1, 0.0, 0.0]')); +INSERT INTO docs VALUES (5, 'center', Vec_FromText('[0.5, 0.5, 0.5, 0.5]')); +# +# TEST 1: Euclidean ANN search +# +SELECT id, title FROM docs +ORDER BY VEC_DISTANCE_EUCLIDEAN(v, Vec_FromText('[1.0, 0.0, 0.0, 0.0]')) +LIMIT 3; +id title +1 origin-x +4 near-x +5 center +# +# TEST 2: Cosine ANN search +# +SELECT id, title FROM docs +ORDER BY VEC_DISTANCE_COSINE(v, Vec_FromText('[1.0, 0.0, 0.0, 0.0]')) +LIMIT 3; +id title +1 origin-x +4 near-x +5 center +# +# TEST 3: UPDATE vector column +# +UPDATE docs SET v = Vec_FromText('[0.95, 0.05, 0.0, 0.0]') WHERE id = 4; +SELECT id, title FROM docs +ORDER BY VEC_DISTANCE_EUCLIDEAN(v, Vec_FromText('[1.0, 0.0, 0.0, 0.0]')) +LIMIT 3; +id title +1 origin-x +4 near-x +5 center +# +# TEST 4: DELETE 
vector row +# +DELETE FROM docs WHERE id = 1; +SELECT id, title FROM docs +ORDER BY VEC_DISTANCE_EUCLIDEAN(v, Vec_FromText('[1.0, 0.0, 0.0, 0.0]')) +LIMIT 3; +id title +4 near-x +5 center +2 origin-y +# +# TEST 5: UPDATE non-vector column +# +UPDATE docs SET title = 'renamed-near-x' WHERE id = 4; +SELECT id, title FROM docs +ORDER BY VEC_DISTANCE_EUCLIDEAN(v, Vec_FromText('[1.0, 0.0, 0.0, 0.0]')) +LIMIT 2; +id title +4 renamed-near-x +5 center +# +# TEST 6: Different dimensionality +# +DROP TABLE docs; +CREATE TABLE docs ( +id INT NOT NULL PRIMARY KEY, +v VECTOR(3) NOT NULL, +VECTOR INDEX (v) +) ENGINE=TidesDB; +INSERT INTO docs VALUES (1, Vec_FromText('[1.0, 0.0, 0.0]')); +INSERT INTO docs VALUES (2, Vec_FromText('[0.0, 1.0, 0.0]')); +INSERT INTO docs VALUES (3, Vec_FromText('[0.0, 0.0, 1.0]')); +SELECT id FROM docs +ORDER BY VEC_DISTANCE_EUCLIDEAN(v, Vec_FromText('[0.9, 0.1, 0.0]')) +LIMIT 2; +id +1 +2 +# +# Cleanup +# +DROP TABLE docs; +# Done. diff --git a/mysql-test/suite/tidesdb/r/tidesdb_write_pressure.result b/mysql-test/suite/tidesdb/r/tidesdb_write_pressure.result new file mode 100644 index 0000000000000..d8dcc46f1105b --- /dev/null +++ b/mysql-test/suite/tidesdb/r/tidesdb_write_pressure.result @@ -0,0 +1,85 @@ +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_LOCKED"); +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_MEMORY_LIMIT"); +call mtr.add_suppression("\\[TIDESDB\\].*unexpected TidesDB error"); +# +# === Setup: sysbench-like schema with SYNC_MODE=NONE === +# +CREATE TABLE sbtest1 ( +id INT NOT NULL AUTO_INCREMENT, +k INT NOT NULL DEFAULT 0, +c CHAR(120) NOT NULL DEFAULT '', +pad CHAR(60) NOT NULL DEFAULT '', +PRIMARY KEY (id), +KEY k_1 (k) +) ENGINE=TIDESDB SYNC_MODE='NONE'; +CREATE TABLE sbtest2 ( +id INT NOT NULL AUTO_INCREMENT, +k INT NOT NULL DEFAULT 0, +c CHAR(120) NOT NULL DEFAULT '', +pad CHAR(60) NOT NULL DEFAULT '', +PRIMARY KEY (id), +KEY k_1 (k) +) ENGINE=TIDESDB 
SYNC_MODE='NONE'; +# +# === Populate: 5000 rows per table === +# +SELECT COUNT(*) AS sbtest1_rows FROM sbtest1; +sbtest1_rows +5000 +SELECT COUNT(*) AS sbtest2_rows FROM sbtest2; +sbtest2_rows +5000 +# +# ============================================ +# TEST 1: Single-connection write-only storm +# 1000 write-only transactions on one connection. +# Exercises rapid txn_begin/commit/free cycling. +# ============================================ +# +SELECT COUNT(*) AS after_single FROM sbtest1; +after_single +5000 +# +# ============================================ +# TEST 2: Concurrent write-only storm (4 connections) +# Each connection runs 500 write-only transactions +# hitting both tables. Conflicts are expected. +# ============================================ +# +connect wr1, localhost, root,,; +connect wr2, localhost, root,,; +connect wr3, localhost, root,,; +connect wr4, localhost, root,,; +connection default; +# +# === Verify data integrity after concurrent writes === +# +PK/index consistency: OK +# +# ============================================ +# TEST 3: Rapid txn churn (commit + immediate new txn) +# 1000 tiny autocommit writes per connection x 4 connections +# Tests rapid txn_begin/txn_free cycling without BEGIN/COMMIT +# ============================================ +# +# +# ============================================ +# TEST 4: Conflict storm -- all 4 connections hit same rows +# Maximizes TDB_ERR_CONFLICT / ERROR 1180 rate. +# Exercises the failed-commit -> txn_free -> new txn_begin path. +# ============================================ +# +connection default; +SELECT COUNT(*) FROM sbtest1 WHERE id IN (1, 2, 3); +Conflict storm: OK +# +# === Cleanup === +# +disconnect wr1; +disconnect wr2; +disconnect wr3; +disconnect wr4; +DROP TABLE sbtest1; +DROP TABLE sbtest2; +# Done. 
diff --git a/mysql-test/suite/tidesdb/suite.opt b/mysql-test/suite/tidesdb/suite.opt new file mode 100644 index 0000000000000..3027c6b021de6 --- /dev/null +++ b/mysql-test/suite/tidesdb/suite.opt @@ -0,0 +1,2 @@ +--plugin-load-add=$HA_TIDESDB_SO +--plugin-maturity=unknown \ No newline at end of file diff --git a/mysql-test/suite/tidesdb/t/tidesdb_alter_large_table.test b/mysql-test/suite/tidesdb/t/tidesdb_alter_large_table.test new file mode 100644 index 0000000000000..0f7f1dfa2ab5c --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_alter_large_table.test @@ -0,0 +1,54 @@ +--source include/have_tidesdb.inc +--echo # +--echo # Large-table ALTER under REPEATABLE_READ. +--echo # +--echo # Copy-phase ALTER scans every row of the source table into the +--echo # rebuilt table while a single REPEATABLE_READ transaction is open +--echo # (autocommit=0 forces this), so the engine must keep the read-set +--echo # bookkeeping bounded as the scan grows. Unbounded growth here +--echo # used to crash the server inside tidesdb_txn_add_to_read_set. +--echo # The test asserts that the scan completes, the rebuild commits, +--echo # and the row count is preserved. +--echo # + +CREATE TABLE t_alter_big ( + a INT AUTO_INCREMENT PRIMARY KEY, + b INT +) ENGINE=TidesDB; + +INSERT INTO t_alter_big (a, b) VALUES (DEFAULT, 10), (DEFAULT, 20), (DEFAULT, 30); + +--echo # Double the rows repeatedly to get ~100K rows +let $i = 15; +while ($i) +{ + INSERT INTO t_alter_big (b) SELECT b FROM t_alter_big; + dec $i; +} + +SELECT COUNT(*) FROM t_alter_big; + +--echo # autocommit=0 makes the surrounding session use REPEATABLE_READ, +--echo # which is the isolation that loaded the read-set during ALTER. +SET autocommit=0; + +--echo # Sanity-check ALTER's error reporting on contradictory key DDL. 
+--error ER_MULTIPLE_PRI_KEY +ALTER TABLE t_alter_big ADD PRIMARY KEY (a); + +--error ER_WRONG_AUTO_KEY +ALTER TABLE t_alter_big DROP PRIMARY KEY; + +--echo # Copy-based ALTER over ~100K rows under REPEATABLE_READ. Must +--echo # complete cleanly without exhausting memory or crashing the +--echo # server in the read-set machinery. +ALTER TABLE t_alter_big DROP PRIMARY KEY, CHANGE a a INT; + +SELECT COUNT(*) FROM t_alter_big; + +SET autocommit=1; +DROP TABLE t_alter_big; + +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_analyze.opt b/mysql-test/suite/tidesdb/t/tidesdb_analyze.opt new file mode 100644 index 0000000000000..83434125bd516 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_analyze.opt @@ -0,0 +1 @@ +--loose-tidesdb-online-ddl-test=1 diff --git a/mysql-test/suite/tidesdb/t/tidesdb_analyze.test b/mysql-test/suite/tidesdb/t/tidesdb_analyze.test new file mode 100644 index 0000000000000..e855fd5a35db9 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_analyze.test @@ -0,0 +1,41 @@ +--source include/have_tidesdb.inc +--echo # +--echo # ANALYZE TABLE for TidesDB -- verifies CF stats output +--echo # + +CREATE TABLE t1 ( + id INT PRIMARY KEY, + val VARCHAR(40), + KEY idx_val (val) +) ENGINE=TidesDB; + +INSERT INTO t1 VALUES (1, 'alpha'), (2, 'bravo'), (3, 'charlie'), + (4, 'delta'), (5, 'echo'), (6, 'foxtrot'); + +--echo # ANALYZE TABLE should return status OK and emit CF stats as notes. +--echo # Mask volatile numeric values (memtable size, avg sizes, etc.) 
+--replace_regex /total_keys=[0-9]+/total_keys=N/ /data_size=[0-9]+/data_size=N/ /memtable=[0-9]+/memtable=N/ /read_amp=[0-9.]+/read_amp=N/ /cache_hit=[0-9.]+/cache_hit=N/ /avg_key=[0-9.]+/avg_key=N/ /avg_value=[0-9.]+/avg_value=N/ /sstables=[0-9]+/sstables=N/ /size=[0-9]+/size=N/ /keys=[0-9]+/keys=N/ +ANALYZE TABLE t1; + +--echo # ANALYZE a table without secondary indexes +CREATE TABLE t2 ( + id INT PRIMARY KEY, + data VARCHAR(200) +) ENGINE=TidesDB; + +INSERT INTO t2 VALUES (1, REPEAT('x', 100)), (2, REPEAT('y', 100)); + +--replace_regex /total_keys=[0-9]+/total_keys=N/ /data_size=[0-9]+/data_size=N/ /memtable=[0-9]+/memtable=N/ /read_amp=[0-9.]+/read_amp=N/ /cache_hit=[0-9.]+/cache_hit=N/ /avg_key=[0-9.]+/avg_key=N/ /avg_value=[0-9.]+/avg_value=N/ /sstables=[0-9]+/sstables=N/ /size=[0-9]+/size=N/ /keys=[0-9]+/keys=N/ +ANALYZE TABLE t2; + +--echo # ANALYZE an empty table +CREATE TABLE t3 ( + id INT PRIMARY KEY +) ENGINE=TidesDB; + +--replace_regex /total_keys=[0-9]+/total_keys=N/ /data_size=[0-9]+/data_size=N/ /memtable=[0-9]+/memtable=N/ /read_amp=[0-9.]+/read_amp=N/ /cache_hit=[0-9.]+/cache_hit=N/ /avg_key=[0-9.]+/avg_key=N/ /avg_value=[0-9.]+/avg_value=N/ /sstables=[0-9]+/sstables=N/ /size=[0-9]+/size=N/ /keys=[0-9]+/keys=N/ +ANALYZE TABLE t3; + +--echo # Cleanup +DROP TABLE t1, t2, t3; +--source suite/tidesdb/include/cleanup_tidesdb.inc diff --git a/mysql-test/suite/tidesdb/t/tidesdb_auto_increment.test b/mysql-test/suite/tidesdb/t/tidesdb_auto_increment.test new file mode 100644 index 0000000000000..37c8191b3d0f5 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_auto_increment.test @@ -0,0 +1,102 @@ +--source include/have_tidesdb.inc +# +# Test: AUTO_INCREMENT edge cases +# + +--echo # +--echo # TEST 1: Basic auto-increment +--echo # + +CREATE TABLE t_ai (id INT AUTO_INCREMENT PRIMARY KEY, v VARCHAR(50)) ENGINE=TidesDB; +INSERT INTO t_ai (v) VALUES ('a'), ('b'), ('c'); +SELECT * FROM t_ai ORDER BY id; + +--echo # +--echo # TEST 2: Explicit value larger 
than counter +--echo # + +INSERT INTO t_ai VALUES (100, 'explicit'); +INSERT INTO t_ai (v) VALUES ('after_explicit'); +SELECT * FROM t_ai ORDER BY id; + +--echo # +--echo # TEST 3: Gap after rollback +--echo # + +BEGIN; +INSERT INTO t_ai (v) VALUES ('will_rollback'); +SELECT MAX(id) FROM t_ai; +ROLLBACK; + +INSERT INTO t_ai (v) VALUES ('after_rollback'); +SELECT id, v FROM t_ai WHERE v IN ('after_rollback', 'after_explicit') ORDER BY id; + +--echo # +--echo # TEST 4: LAST_INSERT_ID +--echo # + +INSERT INTO t_ai (v) VALUES ('last_id_test'); +SELECT LAST_INSERT_ID() > 0 AS has_last_id; + +--echo # +--echo # TEST 5: Auto-increment with REPLACE INTO +--echo # + +CREATE TABLE t_ai_replace ( + id INT AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(50) UNIQUE +) ENGINE=TidesDB; + +INSERT INTO t_ai_replace (name) VALUES ('x'), ('y'), ('z'); +REPLACE INTO t_ai_replace (name) VALUES ('y'); +SELECT * FROM t_ai_replace ORDER BY name; + +--echo # +--echo # TEST 5b: an auto-increment PK must not bypass the UNIQUE secondary check +--echo # + +--error ER_DUP_ENTRY +INSERT INTO t_ai_replace (name) VALUES ('z'); +INSERT INTO t_ai_replace (name) VALUES ('x') + ON DUPLICATE KEY UPDATE name = 'x2'; +SELECT * FROM t_ai_replace ORDER BY name; +--echo # no value may appear twice in the UNIQUE column +SELECT name, COUNT(*) AS c FROM t_ai_replace GROUP BY name HAVING c > 1; + +--echo # +--echo # TEST 6: BIGINT auto-increment +--echo # + +CREATE TABLE t_ai_big (id BIGINT AUTO_INCREMENT PRIMARY KEY, v INT) ENGINE=TidesDB; +INSERT INTO t_ai_big (v) VALUES (1), (2), (3); +INSERT INTO t_ai_big VALUES (9999999999, 4); +INSERT INTO t_ai_big (v) VALUES (5); +SELECT * FROM t_ai_big ORDER BY id; + +--echo # +--echo # TEST 7: Auto-increment after TRUNCATE resets counter +--echo # + +TRUNCATE TABLE t_ai; +INSERT INTO t_ai (v) VALUES ('fresh_start'); +SELECT * FROM t_ai; + +--echo # +--echo # TEST 8: ALTER TABLE ... 
AUTO_INCREMENT=N takes effect +--echo # + +CREATE TABLE t_ai_alter (id INT AUTO_INCREMENT PRIMARY KEY, v VARCHAR(10)) ENGINE=TidesDB; +INSERT INTO t_ai_alter (v) VALUES ('a'), ('b'); +ALTER TABLE t_ai_alter AUTO_INCREMENT=1000; +INSERT INTO t_ai_alter (v) VALUES ('jumped'); +SELECT * FROM t_ai_alter ORDER BY id; +DROP TABLE t_ai_alter; + +--echo # +--echo # Cleanup +--echo # + +DROP TABLE t_ai, t_ai_replace, t_ai_big; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_backup.test b/mysql-test/suite/tidesdb/t/tidesdb_backup.test new file mode 100644 index 0000000000000..003143f3e1cca --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_backup.test @@ -0,0 +1,117 @@ +--source include/have_tidesdb.inc +--source include/not_embedded.inc + + +# Suppress expected error from Test 2 (backup to non-empty dir) +CALL mtr.add_suppression("\\[TIDESDB\\] Backup to .* failed"); + +--echo # +--echo # ============================================ +--echo # TEST 1: Online backup creates a valid copy +--echo # ============================================ +--echo # + +CREATE TABLE t_backup ( + id INT PRIMARY KEY, + val VARCHAR(100) +) ENGINE=TIDESDB; + +INSERT INTO t_backup VALUES (1, 'alpha'), (2, 'beta'), (3, 'gamma'); + +# Verify data is present +SELECT * FROM t_backup ORDER BY id; + +# Determine backup directory (inside the test's tmp dir) +--let $backup_dir= $MYSQLTEST_VARDIR/tmp/tidesdb_backup_test +--exec rm -rf $backup_dir + +--echo # Triggering online backup +--disable_query_log +eval SET GLOBAL tidesdb_backup_dir = '$backup_dir'; +--enable_query_log + +--echo # Backup should have created the directory +--exec test -d $backup_dir && echo "Backup directory exists: YES" || echo "Backup directory exists: NO" + +--echo # Check that SHOW VARIABLES reflects the backup path +--replace_result $MYSQLTEST_VARDIR MYSQLTEST_VARDIR +SELECT @@GLOBAL.tidesdb_backup_dir IS NOT NULL AS backup_dir_set; + +--echo # Insert more 
data after backup (should NOT appear in backup) +INSERT INTO t_backup VALUES (4, 'delta'), (5, 'epsilon'); +SELECT COUNT(*) AS rows_after FROM t_backup; + +DROP TABLE t_backup; + +--echo # +--echo # ============================================ +--echo # TEST 2: Backup to existing non-empty dir fails +--echo # ============================================ +--echo # + +--echo # Re-running backup to same directory should fail (not empty) +--replace_result $MYSQLTEST_VARDIR MYSQLTEST_VARDIR +--error ER_UNKNOWN_ERROR +eval SET GLOBAL tidesdb_backup_dir = '$backup_dir'; + +--echo # +--echo # ============================================ +--echo # TEST 3: Clear backup_dir variable +--echo # ============================================ +--echo # + +SET GLOBAL tidesdb_backup_dir = ''; +SELECT @@GLOBAL.tidesdb_backup_dir IS NULL AS backup_dir_cleared; + +--echo # +--echo # ============================================ +--echo # TEST 4: Concurrent reads/writes during backup +--echo # ============================================ +--echo # + +CREATE TABLE t_concurrent ( + id INT PRIMARY KEY, + data VARCHAR(200) +) ENGINE=TIDESDB; + +# Load some data +--disable_query_log +--let $i= 1 +while ($i <= 100) +{ + eval INSERT INTO t_concurrent VALUES ($i, REPEAT('x', 100)); + --inc $i +} +--enable_query_log + +--echo # Inserted 100 rows +SELECT COUNT(*) AS before_backup FROM t_concurrent; + +--let $backup_dir2= $MYSQLTEST_VARDIR/tmp/tidesdb_backup_concurrent +--exec rm -rf $backup_dir2 + +--disable_query_log +eval SET GLOBAL tidesdb_backup_dir = '$backup_dir2'; +--enable_query_log + +--echo # Backup completed while table was loaded + +# Verify the table is still fully readable after backup +SELECT COUNT(*) AS after_backup FROM t_concurrent; + +# Verify writes still work after backup +INSERT INTO t_concurrent VALUES (101, 'post-backup'); +SELECT COUNT(*) AS with_post_backup FROM t_concurrent; + +DROP TABLE t_concurrent; + +--echo # +--echo # === Cleanup === +--echo # + +SET GLOBAL 
tidesdb_backup_dir = ''; +--exec rm -rf $backup_dir +--exec rm -rf $backup_dir2 + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_bulk_commit_durability.test b/mysql-test/suite/tidesdb/t/tidesdb_bulk_commit_durability.test new file mode 100644 index 0000000000000..43636822b22b2 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_bulk_commit_durability.test @@ -0,0 +1,86 @@ +--source include/have_tidesdb.inc +# +# Bulk-statement commit durability contract. +# +# write_row / update_row / delete_row buffer row writes in the engine txn +# and call maybe_bulk_commit() once the per-statement op counter crosses +# TIDESDB_BULK_INSERT_BATCH_OPS. If the inner tidesdb_txn_commit() fails +# (e.g. a transient unified-memtable rotation race returning TDB_ERR_UNKNOWN) +# the buffered ops are gone -- so the failure MUST propagate up so the SQL +# layer rolls the statement back. Returning success while ops vanish would +# silently drop up to TIDESDB_BULK_INSERT_BATCH_OPS rows per failed commit. +# +# The contract this test asserts: either every row touched by a bulk +# statement is durable on success, or the statement fails loudly with an +# engine error. No silent losses. +# +# Workload: 50 INSERT ... SELECT statements of 1000 rows each, each large +# enough to cross the bulk-commit threshold multiple times. Total row +# count after the run must equal the sum of every batch. +# + +--disable_warnings +DROP TABLE IF EXISTS bulk_src; +DROP TABLE IF EXISTS bulk_dst; +--enable_warnings + +CREATE TABLE bulk_src ( + id INT PRIMARY KEY, + payload VARCHAR(200) +) ENGINE=TIDESDB; + +CREATE TABLE bulk_dst ( + id INT PRIMARY KEY, + payload VARCHAR(200) +) ENGINE=TIDESDB; + +# Seed source with 1000 rows. Each INSERT INTO bulk_dst SELECT ... +# below moves all 1000 across in a single statement -- well above the +# 500-op bulk-commit threshold, so maybe_bulk_commit() fires at least +# once per statement. 
+--disable_query_log +let $i = 1; +while ($i <= 1000) +{ + eval INSERT INTO bulk_src VALUES ($i, REPEAT('X', 180)); + inc $i; +} +--enable_query_log + +SELECT COUNT(*) AS src_rows FROM bulk_src; + +--echo # +--echo # Run 50 bulk INSERT ... SELECT statements (50,000 rows total). +--echo # Each statement crosses the bulk-commit threshold, exercising +--echo # the maybe_bulk_commit() path that previously swallowed errors. +--echo # + +--disable_query_log +let $batch = 1; +while ($batch <= 50) +{ + eval INSERT INTO bulk_dst + SELECT id + ($batch - 1) * 1000, payload FROM bulk_src; + inc $batch; +} +--enable_query_log + +--echo # +--echo # Assertion: every row from every batch must be present. If +--echo # maybe_bulk_commit() ever swallows an inner commit failure again, +--echo # this verdict line will read "LOST rows" instead of "OK". +--echo # + +SELECT IF(COUNT(*) = 50000, + 'OK', + CONCAT('LOST ', 50000 - COUNT(*), ' rows of 50000')) + AS verdict +FROM bulk_dst; + +SELECT COUNT(*) AS dst_rows, MIN(id) AS min_id, MAX(id) AS max_id FROM bulk_dst; + +DROP TABLE bulk_src; +DROP TABLE bulk_dst; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_checkpoint.test b/mysql-test/suite/tidesdb/t/tidesdb_checkpoint.test new file mode 100644 index 0000000000000..236c886a5eba5 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_checkpoint.test @@ -0,0 +1,42 @@ +--source include/have_tidesdb.inc +# +# Test: Hard-link checkpoint via tidesdb_checkpoint_dir +# + +--echo # +--echo # TEST 1: Create checkpoint +--echo # + +CREATE TABLE t_ckpt (id INT PRIMARY KEY, val VARCHAR(100)) ENGINE=TidesDB; +INSERT INTO t_ckpt VALUES (1, 'before_checkpoint'), (2, 'data_two'), (3, 'data_three'); + +--let $ckpt_dir=$MYSQLTEST_VARDIR/tmp/tidesdb_checkpoint_test +--error 0 +--exec rm -rf $MYSQLTEST_VARDIR/tmp/tidesdb_checkpoint_test + +--disable_query_log +--eval SET GLOBAL tidesdb_checkpoint_dir = '$ckpt_dir' +--enable_query_log + +--echo # +--echo # TEST 3: Data survives after checkpoint +--echo # + +INSERT INTO t_ckpt VALUES (4, 'after_checkpoint'); +SELECT * FROM t_ckpt ORDER BY id; + +--echo # +--echo # TEST 4: Clear checkpoint dir variable +--echo # + +SET GLOBAL tidesdb_checkpoint_dir = ''; + +--echo # +--echo # Cleanup +--echo # + +DROP TABLE t_ckpt; +--exec rm -rf $MYSQLTEST_VARDIR/tmp/tidesdb_checkpoint_test + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_concurrent_conflict.opt b/mysql-test/suite/tidesdb/t/tidesdb_concurrent_conflict.opt new file mode 100644 index 0000000000000..4fa69806a64ba --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_concurrent_conflict.opt @@ -0,0 +1 @@ +--tidesdb-pessimistic-locking=OFF diff --git a/mysql-test/suite/tidesdb/t/tidesdb_concurrent_conflict.test b/mysql-test/suite/tidesdb/t/tidesdb_concurrent_conflict.test new file mode 100644 index 0000000000000..27644238d4f5d --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_concurrent_conflict.test @@ -0,0 +1,74 @@ +--source include/have_tidesdb.inc +# +# Issue #77: Conflict detection between concurrent transactions. 
+# Verifies that the second committer fails with ER_LOCK_DEADLOCK (or +# ER_ERROR_DURING_COMMIT) when two transactions modify the same row. +# + +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); + +--echo # +--echo # Issue #77: Concurrent conflict detection +--echo # + +CREATE TABLE t ( + i INT NOT NULL PRIMARY KEY, + x INT +) ENGINE=TidesDB; + +INSERT INTO t VALUES (1,10),(2,20),(3,30),(4,40),(5,50); + +connect (con1, localhost, root,,); +connect (con2, localhost, root,,); + +--echo # ---- TEST 1: Two UPDATEs on same row ---- +connection con1; +START TRANSACTION; +UPDATE t SET x = 999 WHERE i = 1; + +connection con2; +START TRANSACTION; +UPDATE t SET x = 888 WHERE i = 1; +COMMIT; + +connection con1; +--error ER_LOCK_DEADLOCK,ER_ERROR_DURING_COMMIT +COMMIT; + +connection default; +--echo # con2 wins: x should be 888 +SELECT * FROM t WHERE i = 1; + +--echo # ---- TEST 2: UPDATE vs DELETE on same row ---- +connection con1; +START TRANSACTION; +UPDATE t SET x = 777 WHERE i = 2; + +connection con2; +START TRANSACTION; +DELETE FROM t WHERE i = 2; +COMMIT; + +connection con1; +--error ER_LOCK_DEADLOCK,ER_ERROR_DURING_COMMIT +COMMIT; + +connection default; +--echo # con2 wins: row 2 should be gone +SELECT * FROM t WHERE i = 2; + +--echo # Remaining rows intact +SELECT * FROM t ORDER BY i; + +--echo # Cleanup +connection con1; +disconnect con1; +connection con2; +disconnect con2; +connection default; + +DROP TABLE t; + +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done.
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_concurrent_errors.test b/mysql-test/suite/tidesdb/t/tidesdb_concurrent_errors.test new file mode 100644 index 0000000000000..657f62c9f824c --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_concurrent_errors.test @@ -0,0 +1,373 @@ +--source include/have_tidesdb.inc +# +# TidesDB concurrent error mapping test +# +# Validates that transient TidesDB library errors (TDB_ERR_CONFLICT, +# TDB_ERR_LOCKED, TDB_ERR_MEMORY_LIMIT) are mapped to retryable +# MariaDB errors (HA_ERR_LOCK_DEADLOCK / 1213) instead of the +# fatal HA_ERR_GENERIC / 1030 ("Unknown generic error from engine"). +# +# Before the fix, concurrent write workloads (sysbench oltp_read_write +# at 16 threads) would surface error 1030 which sysbench treats as +# FATAL. After the fix, these map to 1213 (deadlock) which +# applications can retry. +# +# The test uses 4 concurrent connections doing overlapping writes +# on the SAME rows inside explicit BEGIN...COMMIT transactions +# (matching the sysbench oltp_read_write pattern) and verifies: +# 1) No error 1030 (HA_ERR_GENERIC) is produced +# 2) Conflicts (1180/1213/1205) are absorbed by a CONTINUE HANDLER, not retried +# 3) Data integrity is maintained (PK scan == index scan) +# + +# Suppress expected warnings from the new tdb_rc_to_ha() error mapper +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_LOCKED"); +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_MEMORY_LIMIT"); +call mtr.add_suppression("\\[TIDESDB\\].*unexpected TidesDB error"); + +--echo # +--echo # === Setup: sysbench-like schema === +--echo # + +CREATE TABLE t1 ( + id INT NOT NULL AUTO_INCREMENT, + k INT NOT NULL DEFAULT 0, + c CHAR(120) NOT NULL DEFAULT '', + pad CHAR(60) NOT NULL DEFAULT '', + PRIMARY KEY (id), + KEY k_1 (k) +) ENGINE=TIDESDB SYNC_MODE='NONE'; + +--echo # +--echo # === Populate: 2000 rows === +--echo # + +--disable_query_log +--disable_result_log + +let $i= 1; +while ($i <= 2000)
+{ + eval INSERT INTO t1 (k, c, pad) VALUES ( + FLOOR(RAND() * 100000), + REPEAT('a', 120), + REPEAT('b', 60) + ); + inc $i; +} + +--enable_result_log +--enable_query_log + +SELECT COUNT(*) AS row_count FROM t1; + +--echo # +--echo # ============================================ +--echo # TEST 1: Concurrent oltp_read_write pattern +--echo # 4 connections doing BEGIN...COMMIT with +--echo # interleaved reads + writes on overlapping rows. +--echo # Before fix: error 1030 (HA_ERR_GENERIC) +--echo # After fix: error 1213 (deadlock, retryable) +--echo # ============================================ +--echo # + +connect (c1, localhost, root,,); +connect (c2, localhost, root,,); +connect (c3, localhost, root,,); +connect (c4, localhost, root,,); + +--disable_query_log +--disable_result_log + +# ---- Connection c1: read_write pattern on rows 1-500 ---- +connection c1; +delimiter |; +send + SET @i = 1; + SET @err_1030 = 0; + WHILE @i <= 300 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + DECLARE CONTINUE HANDLER FOR 1030 + SET @err_1030 = @err_1030 + 1; + START TRANSACTION; + SELECT k INTO @dummy FROM t1 WHERE id = 1 + (@i % 500) LIMIT 1; + UPDATE t1 SET k = k + 1 WHERE id = 1 + (@i % 500); + UPDATE t1 SET c = REPEAT(CHAR(65 + (@i % 26)), 120) WHERE id = 1 + ((@i + 100) % 500); + DELETE FROM t1 WHERE id = 1 + ((@i + 200) % 2000); + INSERT INTO t1 (k, c, pad) VALUES (FLOOR(RAND()*100000), REPEAT('w',120), REPEAT('z',60)); + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +# ---- Connection c2: overlapping writes on rows 1-500 ---- +connection c2; +delimiter |; +send + SET @i = 1; + SET @err_1030 = 0; + WHILE @i <= 300 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + DECLARE CONTINUE HANDLER FOR 1030 + SET @err_1030 = @err_1030 + 1; + START TRANSACTION; + SELECT k INTO @dummy FROM t1 WHERE id = 1 + ((@i + 50) % 500) LIMIT 1; + UPDATE t1 SET k = k + 1 WHERE id = 1 + ((@i + 50) % 500); + 
UPDATE t1 SET c = REPEAT(CHAR(65 + (@i % 26)), 120) WHERE id = 1 + ((@i + 150) % 500); + DELETE FROM t1 WHERE id = 1 + ((@i + 250) % 2000); + INSERT INTO t1 (k, c, pad) VALUES (FLOOR(RAND()*100000), REPEAT('x',120), REPEAT('y',60)); + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +# ---- Connection c3: writes on rows 500-1000 ---- +connection c3; +delimiter |; +send + SET @i = 1; + SET @err_1030 = 0; + WHILE @i <= 300 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + DECLARE CONTINUE HANDLER FOR 1030 + SET @err_1030 = @err_1030 + 1; + START TRANSACTION; + SELECT k INTO @dummy FROM t1 WHERE id = 500 + (@i % 500) LIMIT 1; + UPDATE t1 SET k = k + 1 WHERE id = 500 + (@i % 500); + UPDATE t1 SET c = REPEAT(CHAR(65 + (@i % 26)), 120) WHERE id = 500 + ((@i + 100) % 500); + DELETE FROM t1 WHERE id = 500 + ((@i + 200) % 1500); + INSERT INTO t1 (k, c, pad) VALUES (FLOOR(RAND()*100000), REPEAT('v',120), REPEAT('u',60)); + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +# ---- Connection c4: all autocommit UPDATEs (rapid txn churn) ---- +connection c4; +delimiter |; +send + SET @i = 1; + SET @err_1030 = 0; + WHILE @i <= 300 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + DECLARE CONTINUE HANDLER FOR 1030 + SET @err_1030 = @err_1030 + 1; + UPDATE t1 SET k = k + 1 WHERE id = 1 + (@i % 2000); + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +# ---- Reap all ---- +connection c1; +reap; +connection c2; +reap; +connection c3; +reap; +connection c4; +reap; + +--enable_result_log +--enable_query_log + +--echo # +--echo # === Verify: no error 1030 (HA_ERR_GENERIC) was produced === +--echo # + +connection c1; +--echo # c1 error_1030 count: +SELECT @err_1030 AS err_1030_c1; + +connection c2; +--echo # c2 error_1030 count: +SELECT @err_1030 AS err_1030_c2; + +connection c3; +--echo # c3 error_1030 count: +SELECT @err_1030 AS err_1030_c3; + +connection c4; +--echo # c4 
error_1030 count: +SELECT @err_1030 AS err_1030_c4; + +connection default; + +--echo # +--echo # === Verify data integrity (PK count == index count) === +--echo # + +let $pk_cnt = `SELECT COUNT(*) FROM t1`; +let $idx_cnt = `SELECT COUNT(*) FROM t1 WHERE k >= 0 OR k < 0`; + +--disable_query_log +if ($pk_cnt != $idx_cnt) +{ + --die FAIL: PK count ($pk_cnt) != index count ($idx_cnt) +} +--enable_query_log +--echo Data integrity: OK + +--echo # +--echo # ============================================ +--echo # TEST 2: Conflict storm -- all connections hit SAME 3 rows +--echo # Maximizes conflict rate. Before fix these would be +--echo # error 1030; after fix they are error 1213 (retryable). +--echo # ============================================ +--echo # + +--disable_query_log +--disable_result_log + +connection c1; +delimiter |; +send + SET @i = 1; + SET @err_1030 = 0; + WHILE @i <= 200 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + DECLARE CONTINUE HANDLER FOR 1030 + SET @err_1030 = @err_1030 + 1; + START TRANSACTION; + UPDATE t1 SET k = @i WHERE id = 1; + UPDATE t1 SET k = @i WHERE id = 2; + UPDATE t1 SET k = @i WHERE id = 3; + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +connection c2; +delimiter |; +send + SET @i = 1; + SET @err_1030 = 0; + WHILE @i <= 200 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + DECLARE CONTINUE HANDLER FOR 1030 + SET @err_1030 = @err_1030 + 1; + START TRANSACTION; + UPDATE t1 SET k = @i + 10000 WHERE id = 1; + UPDATE t1 SET k = @i + 10000 WHERE id = 2; + UPDATE t1 SET k = @i + 10000 WHERE id = 3; + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +connection c3; +delimiter |; +send + SET @i = 1; + SET @err_1030 = 0; + WHILE @i <= 200 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + DECLARE CONTINUE HANDLER FOR 1030 + SET @err_1030 = @err_1030 + 1; + START TRANSACTION; + UPDATE t1 SET k = @i +
20000 WHERE id = 1; + UPDATE t1 SET k = @i + 20000 WHERE id = 2; + UPDATE t1 SET k = @i + 20000 WHERE id = 3; + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +connection c4; +delimiter |; +send + SET @i = 1; + SET @err_1030 = 0; + WHILE @i <= 200 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + DECLARE CONTINUE HANDLER FOR 1030 + SET @err_1030 = @err_1030 + 1; + START TRANSACTION; + UPDATE t1 SET k = @i + 30000 WHERE id = 1; + UPDATE t1 SET k = @i + 30000 WHERE id = 2; + UPDATE t1 SET k = @i + 30000 WHERE id = 3; + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +connection c1; +reap; +connection c2; +reap; +connection c3; +reap; +connection c4; +reap; + +--enable_result_log +--enable_query_log + +--echo # +--echo # === Verify: no error 1030 in conflict storm === +--echo # + +connection c1; +--echo # c1 error_1030 count: +SELECT @err_1030 AS err_1030_c1; + +connection c2; +--echo # c2 error_1030 count: +SELECT @err_1030 AS err_1030_c2; + +connection c3; +--echo # c3 error_1030 count: +SELECT @err_1030 AS err_1030_c3; + +connection c4; +--echo # c4 error_1030 count: +SELECT @err_1030 AS err_1030_c4; + +connection default; +--echo Conflict storm: OK + +--echo # +--echo # === Cleanup === +--echo # + +disconnect c1; +disconnect c2; +disconnect c3; +disconnect c4; + +DROP TABLE t1; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_consistent_snapshot.test b/mysql-test/suite/tidesdb/t/tidesdb_consistent_snapshot.test new file mode 100644 index 0000000000000..69ccbc63688bb --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_consistent_snapshot.test @@ -0,0 +1,77 @@ +--source include/have_tidesdb.inc +--echo # +--echo # Issue #64: WITH CONSISTENT SNAPSHOT doesn't work +--echo # + +CREATE TABLE t_snap64 ( + a INT, + b INT +) ENGINE=TidesDB; + +--echo # Seed some data so global_seq > 0 +INSERT INTO t_snap64 VALUES (100, 100); +DELETE FROM t_snap64 WHERE a = 100; + +--echo # ---- TEST 1: START TRANSACTION WITH CONSISTENT SNAPSHOT ---- + +connect (con2, localhost, root,,); +connection default; + +SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; +START TRANSACTION WITH CONSISTENT SNAPSHOT; + +--echo # Insert from connection 2 AFTER snapshot +connection con2; +INSERT INTO t_snap64 (a, b) VALUES (1, 10); +SELECT * FROM t_snap64 ORDER BY a; + +--echo # Connection 1 should NOT see the row (snapshot was before insert) +connection default; +SELECT * FROM t_snap64 ORDER BY a; + +COMMIT; + +--echo # After COMMIT, a new snapshot should see the row +SELECT * FROM t_snap64 ORDER BY a; + +--echo # ---- TEST 2: Multiple inserts after snapshot ---- + +SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; +START TRANSACTION WITH CONSISTENT SNAPSHOT; + +connection con2; +INSERT INTO t_snap64 (a, b) VALUES (2, 20); +INSERT INTO t_snap64 (a, b) VALUES (3, 30); + +connection default; +--echo # Should still only see row (1,10) from before the snapshot +SELECT * FROM t_snap64 ORDER BY a; + +COMMIT; + +--echo # After COMMIT, should see all 3 rows +SELECT * FROM t_snap64 ORDER BY a; + +--echo # ---- TEST 3: Without CONSISTENT SNAPSHOT, new data IS visible ---- + +BEGIN; + +connection con2; +INSERT INTO t_snap64 (a, b) VALUES (4, 40); + +connection default; +--echo # Without CONSISTENT SNAPSHOT, should see all 4 rows +SELECT * FROM t_snap64 ORDER BY a; + +COMMIT; + +--echo # 
Cleanup +connection con2; +disconnect con2; +connection default; + +DROP TABLE t_snap64; + +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_crud.opt b/mysql-test/suite/tidesdb/t/tidesdb_crud.opt new file mode 100644 index 0000000000000..468f32587c637 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_crud.opt @@ -0,0 +1 @@ +--loose-tidesdb-crud-test=1 diff --git a/mysql-test/suite/tidesdb/t/tidesdb_crud.test b/mysql-test/suite/tidesdb/t/tidesdb_crud.test new file mode 100644 index 0000000000000..a2a9289cf17dd --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_crud.test @@ -0,0 +1,342 @@ +--source include/have_tidesdb.inc +# +# Test suite for the TIDESDB storage engine. +# Exercises every CRUD capability and edge case. +# + +--echo # +--echo # === Setup: install the TIDESDB engine plugin === +--echo # +--replace_regex /\.dll/.so/ + +--echo # +--echo # ============================================ +--echo # TEST 1: CREATE TABLE / SHOW CREATE TABLE +--echo # ============================================ +--echo # + +CREATE TABLE t1 ( + id INT, + name VARCHAR(100), + score DECIMAL(10,2), + bio TEXT, + born DATE +) ENGINE=TIDESDB; + +SHOW CREATE TABLE t1; + +--echo # +--echo # ============================================ +--echo # TEST 2: INSERT - single row +--echo # ============================================ +--echo # + +INSERT INTO t1 VALUES (1, 'Alice', 95.50, 'First student', '2000-01-15'); + +SELECT * FROM t1; +SELECT COUNT(*) AS cnt FROM t1; + +--echo # +--echo # ============================================ +--echo # TEST 3: INSERT - multiple rows at once +--echo # ============================================ +--echo # + +INSERT INTO t1 VALUES + (2, 'Bob', 88.00, 'Second student', '1999-06-20'), + (3, 'Charlie', 72.25, 'Third student', '2001-11-03'), + (4, 'Diana', 91.10, 'Fourth student', '1998-03-30'), + (5, 'Eve', 67.80, 'Fifth student', '2002-08-12'); + +SELECT * FROM t1; 
+SELECT COUNT(*) AS cnt FROM t1; + +--echo # +--echo # ============================================ +--echo # TEST 4: SELECT with WHERE (full scan + filter) +--echo # ============================================ +--echo # + +SELECT * FROM t1 WHERE id = 3; +SELECT * FROM t1 WHERE score > 90; +SELECT * FROM t1 WHERE name LIKE '%li%'; +SELECT id, name FROM t1 WHERE id >= 2 AND id <= 4; + +--echo # +--echo # ============================================ +--echo # TEST 5: SELECT with ORDER BY +--echo # (exercises position() and rnd_pos()) +--echo # ============================================ +--echo # + +SELECT * FROM t1 ORDER BY score ASC; +SELECT * FROM t1 ORDER BY name DESC; + +--echo # +--echo # ============================================ +--echo # TEST 6: SELECT aggregate functions +--echo # ============================================ +--echo # + +SELECT MIN(score) AS min_s, MAX(score) AS max_s, AVG(score) AS avg_s FROM t1; +SELECT SUM(id) AS sum_id FROM t1; + +--echo # +--echo # ============================================ +--echo # TEST 7: UPDATE - single row via WHERE +--echo # ============================================ +--echo # + +UPDATE t1 SET score = 99.99 WHERE id = 1; +SELECT * FROM t1 WHERE id = 1; + +--echo # +--echo # ============================================ +--echo # TEST 8: UPDATE - multiple rows +--echo # ============================================ +--echo # + +UPDATE t1 SET bio = 'Updated bio' WHERE id IN (2, 4); +SELECT id, bio FROM t1 WHERE id IN (2, 4); + +--echo # +--echo # ============================================ +--echo # TEST 9: UPDATE - all rows (no WHERE) +--echo # ============================================ +--echo # + +UPDATE t1 SET name = CONCAT(name, '!'); +SELECT id, name FROM t1; + +--echo # +--echo # ============================================ +--echo # TEST 10: DELETE - single row +--echo # ============================================ +--echo # + +DELETE FROM t1 WHERE id = 3; +SELECT COUNT(*) AS cnt FROM t1; +SELECT * 
FROM t1; + +--echo # +--echo # ============================================ +--echo # TEST 11: DELETE - multiple rows via WHERE +--echo # ============================================ +--echo # + +DELETE FROM t1 WHERE score < 90; +SELECT COUNT(*) AS cnt FROM t1; +SELECT * FROM t1; + +--echo # +--echo # ============================================ +--echo # TEST 12: SELECT from empty result set +--echo # ============================================ +--echo # + +SELECT * FROM t1 WHERE id = 999; + +--echo # +--echo # ============================================ +--echo # TEST 13: DELETE - all remaining rows via DELETE +--echo # ============================================ +--echo # + +DELETE FROM t1; +SELECT COUNT(*) AS cnt FROM t1; +SELECT * FROM t1; + +--echo # +--echo # ============================================ +--echo # TEST 14: Re-insert after full delete +--echo # ============================================ +--echo # + +INSERT INTO t1 VALUES (10, 'Zara', 100.00, 'Re-inserted', '2005-05-05'); +SELECT * FROM t1; + +--echo # +--echo # ============================================ +--echo # TEST 15: TRUNCATE TABLE (delete_all_rows) +--echo # ============================================ +--echo # + +INSERT INTO t1 VALUES (11, 'Yuki', 55.00, 'Will be truncated', '2006-06-06'); +SELECT COUNT(*) AS cnt FROM t1; + +TRUNCATE TABLE t1; +SELECT COUNT(*) AS cnt FROM t1; + +--echo # +--echo # ============================================ +--echo # TEST 16: NULL handling +--echo # ============================================ +--echo # + +INSERT INTO t1 VALUES (20, NULL, NULL, NULL, NULL); +INSERT INTO t1 VALUES (21, 'NotNull', 50.00, 'has data', '2010-01-01'); +SELECT * FROM t1; +SELECT * FROM t1 WHERE name IS NULL; +SELECT * FROM t1 WHERE name IS NOT NULL; + +--echo # +--echo # ============================================ +--echo # TEST 17: Multiple data types stress +--echo # ============================================ +--echo # + +DROP TABLE t1; + +CREATE TABLE t2 ( + 
tiny_col TINYINT, + small_col SMALLINT, + med_col MEDIUMINT, + int_col INT, + big_col BIGINT, + float_col FLOAT, + double_col DOUBLE, + dec_col DECIMAL(20,5), + char_col CHAR(50), + vchar_col VARCHAR(200), + text_col TEXT, + date_col DATE, + dt_col DATETIME, + ts_col TIMESTAMP NULL +) ENGINE=TIDESDB; + +INSERT INTO t2 VALUES ( + 127, 32767, 8388607, 2147483647, 9223372036854775807, + 3.14, 2.718281828, 12345.67890, + 'fixed', 'variable length', 'long text here', + '2025-12-31', '2025-12-31 23:59:59', '2025-06-15 12:00:00' +); + +SELECT * FROM t2; + +UPDATE t2 SET char_col = 'UPDATED', int_col = 42; +SELECT char_col, int_col FROM t2; + +DELETE FROM t2; +SELECT COUNT(*) AS cnt FROM t2; + +DROP TABLE t2; + +--echo # +--echo # ============================================ +--echo # TEST 18: Multiple independent tables +--echo # ============================================ +--echo # + +CREATE TABLE ta (a INT, val VARCHAR(20)) ENGINE=TIDESDB; +CREATE TABLE tb (b INT, val VARCHAR(20)) ENGINE=TIDESDB; + +INSERT INTO ta VALUES (1, 'ta_one'), (2, 'ta_two'); +INSERT INTO tb VALUES (1, 'tb_one'), (3, 'tb_three'); + +SELECT * FROM ta; +SELECT * FROM tb; + +# Cross-table query (nested loop join - both do full scans) +SELECT ta.a, ta.val, tb.b, tb.val FROM ta, tb WHERE ta.a = tb.b; + +DROP TABLE ta, tb; + +--echo # +--echo # ============================================ +--echo # TEST 19: Empty table scan (no rows ever inserted) +--echo # ============================================ +--echo # + +CREATE TABLE t_empty (x INT) ENGINE=TIDESDB; +SELECT * FROM t_empty; +SELECT COUNT(*) AS cnt FROM t_empty; +DROP TABLE t_empty; + +--echo # +--echo # ============================================ +--echo # TEST 20: REPLACE (DELETE + INSERT internally) +--echo # ============================================ +--echo # + +CREATE TABLE t3 (id INT, val VARCHAR(50)) ENGINE=TIDESDB; +INSERT INTO t3 VALUES (1, 'original'); +SELECT * FROM t3; +DROP TABLE t3; + +--echo # +--echo # 
============================================ +--echo # TEST 21: INSERT ... SELECT +--echo # ============================================ +--echo # + +CREATE TABLE t_src (id INT, val VARCHAR(50)) ENGINE=TIDESDB; +CREATE TABLE t_dst (id INT, val VARCHAR(50)) ENGINE=TIDESDB; + +INSERT INTO t_src VALUES (1, 'aaa'), (2, 'bbb'), (3, 'ccc'); +INSERT INTO t_dst SELECT * FROM t_src; +SELECT * FROM t_dst; + +DROP TABLE t_src, t_dst; + +--echo # +--echo # ============================================ +--echo # TEST 22: UPDATE with expression +--echo # ============================================ +--echo # + +CREATE TABLE t4 (id INT, counter INT) ENGINE=TIDESDB; +INSERT INTO t4 VALUES (1, 0), (2, 10), (3, 20); +UPDATE t4 SET counter = counter + 5; +SELECT * FROM t4; +UPDATE t4 SET counter = counter * 2 WHERE id > 1; +SELECT * FROM t4; +DROP TABLE t4; + +--echo # +--echo # ============================================ +--echo # TEST 23: Large-ish batch insert + delete +--echo # ============================================ +--echo # + +CREATE TABLE t_batch (id INT, padding VARCHAR(100)) ENGINE=TIDESDB; + +--disable_query_log +let $i= 1; +while ($i <= 100) +{ + eval INSERT INTO t_batch VALUES ($i, REPEAT('x', 50)); + inc $i; +} +--enable_query_log + +SELECT COUNT(*) AS cnt FROM t_batch; + +DELETE FROM t_batch WHERE id > 50; +SELECT COUNT(*) AS cnt FROM t_batch; + +DELETE FROM t_batch WHERE id <= 25; +SELECT COUNT(*) AS cnt FROM t_batch; + +TRUNCATE TABLE t_batch; +SELECT COUNT(*) AS cnt FROM t_batch; + +DROP TABLE t_batch; + +--echo # +--echo # ============================================ +--echo # TEST 24: DROP TABLE (delete_table) +--echo # ============================================ +--echo # + +CREATE TABLE t_drop (a INT) ENGINE=TIDESDB; +INSERT INTO t_drop VALUES (1), (2), (3); +DROP TABLE t_drop; +--error ER_NO_SUCH_TABLE +SELECT * FROM t_drop; + +--echo # +--echo # + + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_data_home_dir.test b/mysql-test/suite/tidesdb/t/tidesdb_data_home_dir.test new file mode 100644 index 0000000000000..c78a4789a71c6 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_data_home_dir.test @@ -0,0 +1,17 @@ +--source include/have_tidesdb.inc +# +# Issue #76: tidesdb_data_home_dir system variable +# + +--echo # +--echo # Verify tidesdb_data_home_dir is visible and read-only +--echo # + +SHOW VARIABLES LIKE 'tidesdb_data_home_dir'; + +--error ER_INCORRECT_GLOBAL_LOCAL_VAR +SET GLOBAL tidesdb_data_home_dir = '/tmp/test'; + +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_defaults_alignment.test b/mysql-test/suite/tidesdb/t/tidesdb_defaults_alignment.test new file mode 100644 index 0000000000000..ecdb742437ce6 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_defaults_alignment.test @@ -0,0 +1,39 @@ +--source include/have_tidesdb.inc +# +# Pin the per-table-option defaults so any future drift from the TidesDB +# library's tidesdb_default_column_family_config (or from the deliberate +# SQL-side deviations called out in the README) is caught here. +# Library-aligned defaults are listed first; deliberate deviations from +# the library are at the bottom with the rationale recorded in the README. 
+# + +--echo # library-aligned column-family defaults +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_min_levels'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_dividing_level_offset'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_level_size_ratio'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_klog_value_threshold'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_bloom_filter'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_bloom_fpr'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_block_indexes'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_index_sample_ratio'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_block_index_prefix_len'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_skip_list_max_level'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_skip_list_probability'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_min_disk_space'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_l1_file_count_trigger'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_l0_queue_stall_threshold'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_tombstone_density_trigger'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_tombstone_density_min_entries'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_compression'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_use_btree'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_object_lazy_compaction'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_object_prefetch_compaction'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_sync_interval_us'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_write_buffer_size'; + +--echo # deliberate deviations from the library default, see README +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_sync_mode'; +SHOW GLOBAL VARIABLES LIKE 'tidesdb_default_isolation_level'; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_drop_create.test b/mysql-test/suite/tidesdb/t/tidesdb_drop_create.test new file mode 100644 index 0000000000000..9e3b96bd14f41 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_drop_create.test @@ -0,0 +1,76 @@ +--source include/have_tidesdb.inc +--echo # +--echo # Issue #57: Data survives DROP + CREATE +--echo # + +--echo # ---- TEST 1: DROP TABLE must destroy data ---- +CREATE TABLE t_drop57 (i INT NOT NULL PRIMARY KEY, v VARCHAR(50)) ENGINE=TidesDB; + +INSERT INTO t_drop57 VALUES (1, 'aaa'), (2, 'bbb'), (3, 'ccc'); +SELECT * FROM t_drop57 ORDER BY i; + +DROP TABLE t_drop57; + +CREATE TABLE t_drop57 (i INT NOT NULL PRIMARY KEY, v VARCHAR(50)) ENGINE=TidesDB; + +--echo # Must be empty after DROP + CREATE +SELECT COUNT(*) FROM t_drop57; +SELECT * FROM t_drop57 ORDER BY i; + +DROP TABLE t_drop57; + +--echo # ---- TEST 2: CREATE OR REPLACE must destroy data ---- +CREATE TABLE t_cor57 (i INT) ENGINE=TidesDB; + +INSERT INTO t_cor57 VALUES (10), (20), (30); +SELECT * FROM t_cor57 ORDER BY i; + +CREATE OR REPLACE TABLE t_cor57 (i INT) ENGINE=TidesDB; + +--echo # Must be empty after CREATE OR REPLACE +SELECT COUNT(*) FROM t_cor57; +SELECT * FROM t_cor57 ORDER BY i; + +DROP TABLE t_cor57; + +--echo # ---- TEST 3: Secondary indexes must also be cleaned ---- +CREATE TABLE t_idx57 ( + id INT NOT NULL PRIMARY KEY, + val INT NOT NULL, + KEY idx_val (val) +) ENGINE=TidesDB; + +INSERT INTO t_idx57 VALUES (1, 100), (2, 200), (3, 300); +SELECT * FROM t_idx57 ORDER BY id; +SELECT val FROM t_idx57 WHERE val = 200; + +DROP TABLE t_idx57; + +CREATE TABLE t_idx57 ( + id INT NOT NULL PRIMARY KEY, + val INT NOT NULL, + KEY idx_val (val) +) ENGINE=TidesDB; + +--echo # Must be empty after DROP + CREATE (including index) +SELECT COUNT(*) FROM t_idx57; +SELECT * FROM t_idx57 ORDER BY id; +SELECT val FROM t_idx57 WHERE val = 200; + +DROP TABLE t_idx57; + +--echo # ---- TEST 4: TRUNCATE TABLE still works ---- +CREATE TABLE t_trunc57 (i INT 
NOT NULL PRIMARY KEY) ENGINE=TidesDB; + +INSERT INTO t_trunc57 VALUES (1), (2), (3); +SELECT COUNT(*) FROM t_trunc57; + +TRUNCATE TABLE t_trunc57; + +SELECT COUNT(*) FROM t_trunc57; + +DROP TABLE t_trunc57; + +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_encryption.opt b/mysql-test/suite/tidesdb/t/tidesdb_encryption.opt new file mode 100644 index 0000000000000..5737dfcaaa1ef --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_encryption.opt @@ -0,0 +1,2 @@ +--plugin-load-add=file_key_management +--file-key-management-filename=$MYSQL_TEST_DIR/std_data/keys.txt diff --git a/mysql-test/suite/tidesdb/t/tidesdb_encryption.test b/mysql-test/suite/tidesdb/t/tidesdb_encryption.test new file mode 100644 index 0000000000000..91b150b9bd895 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_encryption.test @@ -0,0 +1,144 @@ +--source include/have_tidesdb.inc +--source include/not_embedded.inc +--source include/have_file_key_management.inc + +--echo # +--echo # ============================================ +--echo # TEST 1: Basic encrypted table - CRUD +--echo # ============================================ +--echo # + +CREATE TABLE t_enc1 ( + id INT NOT NULL PRIMARY KEY, + val VARCHAR(100) +) ENGINE=TIDESDB `ENCRYPTED`=YES; + +INSERT INTO t_enc1 VALUES (1, 'secret_one'); +INSERT INTO t_enc1 VALUES (2, 'secret_two'); +INSERT INTO t_enc1 VALUES (3, 'secret_three'); + +SELECT * FROM t_enc1 ORDER BY id; + +UPDATE t_enc1 SET val = 'updated_secret' WHERE id = 2; +SELECT * FROM t_enc1 WHERE id = 2; + +DELETE FROM t_enc1 WHERE id = 1; +SELECT * FROM t_enc1 ORDER BY id; + +DROP TABLE t_enc1; + +--echo # +--echo # ============================================ +--echo # TEST 2: SHOW CREATE TABLE shows ENCRYPTED option +--echo # ============================================ +--echo # + +CREATE TABLE t_enc2 ( + id INT NOT NULL PRIMARY KEY, + name VARCHAR(50), + amount INT +) ENGINE=TIDESDB `ENCRYPTED`=YES 
`ENCRYPTION_KEY_ID`=2; + +SHOW CREATE TABLE t_enc2; +INSERT INTO t_enc2 VALUES (1, 'alice', 100); +SELECT * FROM t_enc2; + +DROP TABLE t_enc2; + +--echo # +--echo # ============================================ +--echo # TEST 3: Non-encrypted table still works +--echo # ============================================ +--echo # + +CREATE TABLE t_noenc ( + id INT NOT NULL PRIMARY KEY, + val VARCHAR(50) +) ENGINE=TIDESDB; + +INSERT INTO t_noenc VALUES (1, 'plain_text'); +SELECT * FROM t_noenc; + +DROP TABLE t_noenc; + +--echo # +--echo # ============================================ +--echo # TEST 4: Encrypted table with secondary index +--echo # ============================================ +--echo # + +CREATE TABLE t_enc_idx ( + id INT NOT NULL PRIMARY KEY, + name VARCHAR(50), + age INT, + KEY idx_name (name) +) ENGINE=TIDESDB `ENCRYPTED`=YES; + +INSERT INTO t_enc_idx VALUES (1, 'alice', 30); +INSERT INTO t_enc_idx VALUES (2, 'bob', 25); +INSERT INTO t_enc_idx VALUES (3, 'charlie', 35); +INSERT INTO t_enc_idx VALUES (4, 'alice', 28); + +SELECT * FROM t_enc_idx WHERE name = 'alice' ORDER BY id; +SELECT * FROM t_enc_idx ORDER BY id; + +DROP TABLE t_enc_idx; + +--echo # +--echo # ============================================ +--echo # TEST 5: Encrypted table with AUTO_INCREMENT +--echo # ============================================ +--echo # + +CREATE TABLE t_enc_auto ( + id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, + data VARCHAR(100) +) ENGINE=TIDESDB `ENCRYPTED`=YES; + +INSERT INTO t_enc_auto (data) VALUES ('row_a'); +INSERT INTO t_enc_auto (data) VALUES ('row_b'); +INSERT INTO t_enc_auto (data) VALUES ('row_c'); + +SELECT * FROM t_enc_auto ORDER BY id; + +DROP TABLE t_enc_auto; + +--echo # +--echo # ============================================ +--echo # TEST 6: Encrypted table with BLOB data +--echo # ============================================ +--echo # + +CREATE TABLE t_enc_blob ( + id INT NOT NULL PRIMARY KEY, + payload BLOB +) ENGINE=TIDESDB `ENCRYPTED`=YES; + +INSERT 
INTO t_enc_blob VALUES (1, REPEAT('A', 500)); +INSERT INTO t_enc_blob VALUES (2, REPEAT('B', 1000)); + +SELECT id, LENGTH(payload) AS plen, LEFT(payload, 5) AS head FROM t_enc_blob ORDER BY id; + +DROP TABLE t_enc_blob; + +--echo # +--echo # ============================================ +--echo # TEST 7: Encrypted table with NULL values +--echo # ============================================ +--echo # + +CREATE TABLE t_enc_null ( + id INT NOT NULL PRIMARY KEY, + val VARCHAR(50) NULL +) ENGINE=TIDESDB `ENCRYPTED`=YES; + +INSERT INTO t_enc_null VALUES (1, NULL); +INSERT INTO t_enc_null VALUES (2, 'not_null'); +INSERT INTO t_enc_null VALUES (3, NULL); + +SELECT * FROM t_enc_null ORDER BY id; + +DROP TABLE t_enc_null; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_encryption_rotation.opt b/mysql-test/suite/tidesdb/t/tidesdb_encryption_rotation.opt new file mode 100644 index 0000000000000..3a550eb53a1e4 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_encryption_rotation.opt @@ -0,0 +1 @@ +--plugin-load-add=debug_key_management diff --git a/mysql-test/suite/tidesdb/t/tidesdb_encryption_rotation.test b/mysql-test/suite/tidesdb/t/tidesdb_encryption_rotation.test new file mode 100644 index 0000000000000..97907f2ba8d26 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_encryption_rotation.test @@ -0,0 +1,48 @@ +# Skip, before have_tidesdb.inc alters the test database, when the debug_key_management plugin requested by the .opt file is not active. +if (`SELECT COUNT(*) = 0 FROM information_schema.plugins WHERE plugin_name = 'debug_key_management' AND plugin_status = 'ACTIVE'`) +{ + --skip Needs debug_key_management plugin +} +--source include/have_tidesdb.inc +# +# Encrypted rows must remain readable after an encryption key rotation. +# Every encrypted row stores the key version it was written under, so a +# row encrypted before a rotation still decrypts with its original key +# instead of the current latest one. debug_key_management supplies a key +# whose version is advanced through SET GLOBAL debug_key_management_version. 
+# + +--echo # +--echo # rows encrypted under key version 1 +--echo # +CREATE TABLE enc (id INT PRIMARY KEY, payload VARCHAR(200)) ENGINE=TidesDB `ENCRYPTED`=YES; +INSERT INTO enc VALUES (1,'written under version one'),(2,'also version one'); +SELECT * FROM enc ORDER BY id; + +--echo # +--echo # rotate the key, then write rows under key version 2 +--echo # +SET GLOBAL debug_key_management_version = 2; +INSERT INTO enc VALUES (3,'written under version two'),(4,'also version two'); + +--echo # all four rows decrypt, the first two under v1 and the rest under v2 +SELECT * FROM enc ORDER BY id; + +--echo # +--echo # rotate again and confirm all three key vintages still read back +--echo # +SET GLOBAL debug_key_management_version = 3; +INSERT INTO enc VALUES (5,'written under version three'); +SELECT * FROM enc ORDER BY id; + +--echo # +--echo # a fresh open of the table still reads every version +--echo # +FLUSH TABLES; +SELECT * FROM enc ORDER BY id; + +DROP TABLE enc; +SET GLOBAL debug_key_management_version = DEFAULT; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_engine_convert.test b/mysql-test/suite/tidesdb/t/tidesdb_engine_convert.test new file mode 100644 index 0000000000000..e26f9c5b2dda4 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_engine_convert.test @@ -0,0 +1,107 @@ +--source include/have_tidesdb.inc +--source include/have_innodb.inc +# +# Test: ALTER TABLE ENGINE conversion (InnoDB <-> TidesDB) +# Including migration from InnoDB to TidesDB with data preservation +# + +--echo # +--echo # TEST 1: InnoDB -> TidesDB migration +--echo # + +CREATE TABLE t_innodb ( + id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(100), + val DECIMAL(10,2), + created DATETIME DEFAULT CURRENT_TIMESTAMP, + KEY idx_name (name) +) ENGINE=InnoDB; + +INSERT INTO t_innodb (name, val) VALUES ('alpha', 1.50), ('beta', 2.75), ('gamma', 3.00); +INSERT INTO t_innodb (name, val) VALUES ('delta', 4.25), ('epsilon', 5.50); + +SELECT id, name, val FROM t_innodb ORDER BY id; + +ALTER TABLE t_innodb ENGINE=TidesDB; + +SHOW CREATE TABLE t_innodb; +SELECT id, name, val FROM t_innodb ORDER BY id; +SELECT name FROM t_innodb WHERE name = 'gamma'; + +--echo # +--echo # TEST 2: TidesDB -> InnoDB migration +--echo # + +ALTER TABLE t_innodb ENGINE=InnoDB; + +SELECT id, name, val FROM t_innodb ORDER BY id; +SELECT name FROM t_innodb WHERE name = 'delta'; + +--echo # +--echo # TEST 3: Round-trip InnoDB -> TidesDB -> InnoDB +--echo # + +CREATE TABLE t_round (id INT PRIMARY KEY, data TEXT) ENGINE=InnoDB; +INSERT INTO t_round VALUES (1, REPEAT('X', 5000)), (2, REPEAT('Y', 5000)); + +ALTER TABLE t_round ENGINE=TidesDB; +SELECT id, LENGTH(data) FROM t_round ORDER BY id; + +ALTER TABLE t_round ENGINE=InnoDB; +SELECT id, LENGTH(data) FROM t_round ORDER BY id; + +--echo # +--echo # TEST 4: Migration with BLOB columns +--echo # + +CREATE TABLE t_blob_mig ( + id INT PRIMARY KEY, + img LONGBLOB, + descr TEXT +) ENGINE=InnoDB; + +INSERT INTO t_blob_mig VALUES (1, REPEAT('A', 100000), 'first image'); +INSERT 
INTO t_blob_mig VALUES (2, REPEAT('B', 100000), 'second image'); + +ALTER TABLE t_blob_mig ENGINE=TidesDB; +SELECT id, LENGTH(img), descr FROM t_blob_mig ORDER BY id; + +--echo # +--echo # TEST 5: Migration preserves auto-increment +--echo # + +CREATE TABLE t_ai (id INT AUTO_INCREMENT PRIMARY KEY, v INT) ENGINE=InnoDB; +INSERT INTO t_ai (v) VALUES (10), (20), (30); + +ALTER TABLE t_ai ENGINE=TidesDB; +INSERT INTO t_ai (v) VALUES (40); +SELECT * FROM t_ai ORDER BY id; + +--echo # +--echo # TEST 6: Migration with composite PK and multiple indexes +--echo # + +CREATE TABLE t_complex ( + a INT NOT NULL, + b INT NOT NULL, + c VARCHAR(50), + d INT, + PRIMARY KEY (a, b), + KEY idx_c (c), + KEY idx_d (d) +) ENGINE=InnoDB; + +INSERT INTO t_complex VALUES (1,1,'foo',100), (1,2,'bar',200), (2,1,'baz',100); + +ALTER TABLE t_complex ENGINE=TidesDB; +SELECT * FROM t_complex WHERE a = 1 ORDER BY b; +SELECT c FROM t_complex WHERE d = 100 ORDER BY c; + +--echo # +--echo # Cleanup +--echo # + +DROP TABLE t_innodb, t_round, t_blob_mig, t_ai, t_complex; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_engine_status.test b/mysql-test/suite/tidesdb/t/tidesdb_engine_status.test new file mode 100644 index 0000000000000..fe5fffec9adfb --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_engine_status.test @@ -0,0 +1,21 @@ +--source include/have_tidesdb.inc +# +# Issue #73: SHOW ENGINE TIDESDB STATUS +# + +--echo # +--echo # SHOW ENGINE TIDESDB STATUS should return output +--echo # + +CREATE TABLE t1 (id INT PRIMARY KEY, val INT) ENGINE=TidesDB; +INSERT INTO t1 VALUES (1,10),(2,20),(3,30); + +# Mask the data directory path (varies per build) and volatile numbers +--replace_regex /Data directory: [^\n]*/Data directory: TIDESDB_DATA_DIR/ /[0-9]+/N/ +SHOW ENGINE TIDESDB STATUS; + +DROP TABLE t1; + +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_fts_blend_chars.test b/mysql-test/suite/tidesdb/t/tidesdb_fts_blend_chars.test new file mode 100644 index 0000000000000..16623940188d9 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_fts_blend_chars.test @@ -0,0 +1,70 @@ +--source include/have_tidesdb.inc +--source include/force_restart.inc + +--echo # +--echo # TidesDB FTS blend_chars support for Romance language elision +--echo # + +SET GLOBAL tidesdb_fts_blend_chars = "'"; + +CREATE TABLE docs ( + id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, + body TEXT, + FULLTEXT KEY ft_body (body) +) ENGINE=TidesDB; + +INSERT INTO docs (body) VALUES + ("L'aria fresca della montagna"), + ("Dell'aria pura si respira bene"), + ("Un'aria di festa pervadeva la piazza"), + ("O'Malley went to the store"), + ("The cat sat on the mat"); + +--echo # Sub-part search: aria matches Italian elision docs +SELECT id FROM docs WHERE MATCH(body) AGAINST('aria') ORDER BY id; + +--echo # Blended form: l'aria ranks doc 1 highest +SELECT id FROM docs WHERE MATCH(body) AGAINST("l'aria") ORDER BY id; + +--echo # Sub-part: malley finds O'Malley +SELECT id FROM docs WHERE MATCH(body) AGAINST('malley') ORDER BY id; + +--echo # Blended form: o'malley +SELECT id FROM docs WHERE MATCH(body) AGAINST("o'malley") ORDER BY id; + +--echo # Blended form: dell'aria +SELECT id FROM docs WHERE MATCH(body) AGAINST("dell'aria") ORDER BY id; + +--echo # Non-blend word: cat (should still work) +SELECT id FROM docs WHERE MATCH(body) AGAINST('cat') ORDER BY id; + +--echo # Stop word through blend: the (still filtered) +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('the'); + +--echo # Boolean mode with blend chars +SELECT id FROM docs WHERE MATCH(body) AGAINST("+aria -malley" IN BOOLEAN MODE) ORDER BY id; + +--echo # Update with blended content +UPDATE docs SET body = "L'orchestra dell'opera suona bene" WHERE id = 5; +SELECT id FROM docs WHERE MATCH(body) AGAINST('orchestra') ORDER BY id; +SELECT id FROM docs 
WHERE MATCH(body) AGAINST("dell'opera") ORDER BY id; + +--echo # Insert more elision forms +INSERT INTO docs (body) VALUES + ("Nell'acqua limpida del lago"), + ("All'interno del castello medievale"); + +SELECT id FROM docs WHERE MATCH(body) AGAINST('acqua') ORDER BY id; +SELECT id FROM docs WHERE MATCH(body) AGAINST("nell'acqua") ORDER BY id; +SELECT id FROM docs WHERE MATCH(body) AGAINST('interno') ORDER BY id; + +--echo # Verify sysvar +SHOW GLOBAL VARIABLES LIKE 'tidesdb_fts_blend_chars'; + +--echo # Reset blend chars +SET GLOBAL tidesdb_fts_blend_chars = NULL; + +DROP TABLE docs; + +# Restore the test database defaults that have_tidesdb.inc changed (no output, so the recorded result is unaffected). +--source suite/tidesdb/include/cleanup_tidesdb.inc diff --git a/mysql-test/suite/tidesdb/t/tidesdb_fts_stopword_table.test b/mysql-test/suite/tidesdb/t/tidesdb_fts_stopword_table.test new file mode 100644 index 0000000000000..bbd4a4792d912 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_fts_stopword_table.test @@ -0,0 +1,31 @@ +--source include/have_tidesdb.inc +# +# A user-supplied full-text stop word table must resolve to its TidesDB +# column family. CF names join the database and table with CF_DB_TABLE_SEP, +# so the lookup has to use that separator and not the slash from the +# db/table spec. When the lookup failed the custom words were never +# loaded and the stop word set stayed at the built in default list. 
+# + +--echo # a TidesDB table holding one custom stop word per row +CREATE TABLE swords (value VARCHAR(50)) ENGINE=TidesDB; +INSERT INTO swords VALUES ('zebra'), ('quokka'); + +--echo # point the engine at the custom stop word table +SET GLOBAL tidesdb_ft_stopword_table = 'test/swords'; + +--echo # build a full-text document that contains a custom stop word +CREATE TABLE docs (id INT PRIMARY KEY, body TEXT, FULLTEXT (body)) ENGINE=TidesDB; +INSERT INTO docs VALUES (1, 'zebra crossing beside the apple tree'); + +--echo # zebra is now a stop word, so it is never indexed and matches nothing +SELECT id FROM docs WHERE MATCH(body) AGAINST('zebra' IN BOOLEAN MODE); +--echo # a normal word still matches +SELECT id FROM docs WHERE MATCH(body) AGAINST('apple' IN BOOLEAN MODE); + +DROP TABLE docs; +DROP TABLE swords; +SET GLOBAL tidesdb_ft_stopword_table = DEFAULT; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_fts_stopwords.test b/mysql-test/suite/tidesdb/t/tidesdb_fts_stopwords.test new file mode 100644 index 0000000000000..f7d895ff24a34 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_fts_stopwords.test @@ -0,0 +1,78 @@ +--source include/have_tidesdb.inc +--source include/force_restart.inc + +--echo # +--echo # TidesDB FTS stop word filtering +--echo # + +CREATE TABLE docs ( + id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, + body TEXT, + FULLTEXT KEY ft_body (body) +) ENGINE=TidesDB; + +INSERT INTO docs (body) VALUES + ('The quick brown fox jumps over the lazy dog'), + ('A man is walking in the park with his dog'), + ('How to build a house from scratch'), + ('This is a test of the emergency broadcast system'), + ('The cat sat on the mat by the door'); + +--echo # Stop words should return 0 rows +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('the'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('is'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('a'); +SELECT COUNT(*) FROM docs 
WHERE MATCH(body) AGAINST('of'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('in'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('on'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('by'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('with'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('for'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('this'); + +--echo # Real words should return matches +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('fox'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('dog'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('house'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('cat'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('emergency'); + +--echo # Boolean mode with stop words +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('+dog' IN BOOLEAN MODE); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('+the' IN BOOLEAN MODE); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('+dog -cat' IN BOOLEAN MODE); + +--echo # Multi-word query mixing stop words and real words +SELECT id FROM docs WHERE MATCH(body) AGAINST('quick brown') ORDER BY id; +SELECT id FROM docs WHERE MATCH(body) AGAINST('build house') ORDER BY id; + +--echo # Verify stop word sysvar exists and defaults +SHOW GLOBAL VARIABLES LIKE 'tidesdb_ft_stopword_table'; + +--echo # Insert more rows after initial index creation +INSERT INTO docs (body) VALUES + ('The world is a beautiful place to live in'), + ('Building bridges for the future of our community'); + +--echo # Stop words still filtered for new rows +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('the'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('is'); + +--echo # Real words from new rows work +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('beautiful'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('bridges'); + +--echo # UPDATE should maintain stop word filtering +UPDATE docs SET body = 'The revised 
document about the important topic' WHERE id = 1; +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('the'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('revised'); +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('important'); + +--echo # DELETE and verify +DELETE FROM docs WHERE id = 2; +SELECT COUNT(*) FROM docs WHERE MATCH(body) AGAINST('dog'); + +DROP TABLE docs; + +# Restore the test database defaults that have_tidesdb.inc changed (no output, so the recorded result is unaffected). +--source suite/tidesdb/include/cleanup_tidesdb.inc diff --git a/mysql-test/suite/tidesdb/t/tidesdb_fulltext.test b/mysql-test/suite/tidesdb/t/tidesdb_fulltext.test new file mode 100644 index 0000000000000..a657dd0b9362c --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_fulltext.test @@ -0,0 +1,141 @@ +--source include/have_tidesdb.inc +# +# Test: Full-text search (FULLTEXT indexes with BM25 ranking) +# +# Covers: +# 1. CREATE TABLE with FULLTEXT index +# 2. Natural language mode search with BM25 ranking +# 3. Boolean mode: required (+), excluded (-), optional terms +# 4. Boolean mode: prefix wildcard (term*) +# 5. Multi-column FULLTEXT index +# 6. UPDATE updates FTS index correctly +# 7. DELETE removes FTS entries correctly +# 8. 
No-match queries return empty result +# + +--echo # +--echo # Setup +--echo # + +CREATE TABLE articles ( + id INT NOT NULL PRIMARY KEY, + title VARCHAR(200), + body TEXT, + FULLTEXT ft_content (title, body) +) ENGINE=TidesDB; + +INSERT INTO articles VALUES (1, 'MySQL Tutorial', 'DBMS stands for DataBase Management System'); +INSERT INTO articles VALUES (2, 'How To Use MySQL', 'After you went through a tutorial you can start'); +INSERT INTO articles VALUES (3, 'Optimizing MySQL', 'In this tutorial we show optimization techniques'); +INSERT INTO articles VALUES (4, 'TidesDB Guide', 'TidesDB is an LSM tree storage engine'); +INSERT INTO articles VALUES (5, 'Database Systems', 'A database management system manages data efficiently'); + +--echo # +--echo # TEST 1: Natural language search +--echo # + +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('tutorial') +ORDER BY MATCH(title, body) AGAINST('tutorial') DESC; + +--echo # +--echo # TEST 2: Multi-term natural language search +--echo # + +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('database management') +ORDER BY MATCH(title, body) AGAINST('database management') DESC; + +--echo # +--echo # TEST 3: No match returns empty +--echo # + +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('nonexistent'); + +--echo # +--echo # TEST 4: Boolean mode - required term +--echo # + +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('+mysql +tutorial' IN BOOLEAN MODE) +ORDER BY id; + +--echo # +--echo # TEST 5: Boolean mode - excluded term +--echo # + +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('+mysql -tutorial' IN BOOLEAN MODE) +ORDER BY id; + +--echo # +--echo # TEST 6: Boolean mode - prefix wildcard +--echo # + +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('optim*' IN BOOLEAN MODE) +ORDER BY id; + +--echo # +--echo # TEST 7: UPDATE changes FTS results +--echo # + +UPDATE articles SET body = 'This tutorial covers advanced 
optimization and tuning' WHERE id = 4; + +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('tutorial') +ORDER BY MATCH(title, body) AGAINST('tutorial') DESC; + +--echo # +--echo # TEST 8: DELETE removes from FTS results +--echo # + +DELETE FROM articles WHERE id = 3; + +SELECT id, title FROM articles +WHERE MATCH(title, body) AGAINST('tutorial') +ORDER BY MATCH(title, body) AGAINST('tutorial') DESC; + +--echo # +--echo # TEST 9: Single-column FULLTEXT index +--echo # + +DROP TABLE articles; +CREATE TABLE articles ( + id INT NOT NULL PRIMARY KEY, + title VARCHAR(200), + FULLTEXT (title) +) ENGINE=TidesDB; + +INSERT INTO articles VALUES (1, 'Introduction to MySQL'); +INSERT INTO articles VALUES (2, 'Advanced PostgreSQL'); +INSERT INTO articles VALUES (3, 'MySQL Performance Tuning'); + +SELECT id, title FROM articles +WHERE MATCH(title) AGAINST('mysql') +ORDER BY MATCH(title) AGAINST('mysql') DESC; + +--echo # +--echo # TEST 10: Oversize query terms must not overflow the stack key buffer. +--echo # fts_build_key truncates inserted keys to 512 bytes, but a user can pass +--echo # a multi-byte search term whose byte length exceeds the on-disk cap. +--echo # The query must complete without crashing and return no match. +--echo # + +# 1024 ASCII characters -> 1024 bytes, double the FTS_MAX_TERM_BYTES cap. +SELECT id, title FROM articles +WHERE MATCH(title) AGAINST(REPEAT('a', 1024) IN BOOLEAN MODE); + +# Wildcard variant exercises the per-length seek path. +SELECT id, title FROM articles +WHERE MATCH(title) AGAINST(CONCAT(REPEAT('a', 1024), '*') IN BOOLEAN MODE); + +--echo # +--echo # Cleanup +--echo # + +DROP TABLE articles; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_fulltext_phrase.test b/mysql-test/suite/tidesdb/t/tidesdb_fulltext_phrase.test new file mode 100644 index 0000000000000..27d6dea622c7e --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_fulltext_phrase.test @@ -0,0 +1,98 @@ +--source include/have_tidesdb.inc +# +# Test: FTS phrase queries and wildcard edge cases +# + +--echo # +--echo # Setup +--echo # + +CREATE TABLE docs ( + id INT NOT NULL PRIMARY KEY, + body TEXT, + FULLTEXT (body) +) ENGINE=TidesDB; + +INSERT INTO docs VALUES (1, 'the quick brown fox jumps over the lazy dog'); +INSERT INTO docs VALUES (2, 'quick fox and lazy dog play together'); +INSERT INTO docs VALUES (3, 'the brown dog is not lazy at all'); +INSERT INTO docs VALUES (4, 'completely unrelated content here'); +INSERT INTO docs VALUES (5, 'the fox is quick and the dog is lazy'); + +--echo # +--echo # TEST 1: Exact phrase match +--echo # + +SELECT id FROM docs +WHERE MATCH(body) AGAINST('"quick brown fox"' IN BOOLEAN MODE) ORDER BY id; + +--echo # +--echo # TEST 2: Phrase appears in multiple rows +--echo # + +SELECT id FROM docs +WHERE MATCH(body) AGAINST('"lazy dog"' IN BOOLEAN MODE) ORDER BY id; + +--echo # +--echo # TEST 3: Phrase with wrong word order (no match) +--echo # + +SELECT id FROM docs +WHERE MATCH(body) AGAINST('"fox quick"' IN BOOLEAN MODE) ORDER BY id; + +--echo # +--echo # TEST 4: Phrase + required term +--echo # + +SELECT id FROM docs +WHERE MATCH(body) AGAINST('+"lazy dog" +fox' IN BOOLEAN MODE) ORDER BY id; + +--echo # +--echo # TEST 5: Phrase + excluded term +--echo # + +SELECT id FROM docs +WHERE MATCH(body) AGAINST('+"lazy dog" -quick' IN BOOLEAN MODE) ORDER BY id; + +--echo # +--echo # TEST 6: Wildcard with multiple matching lengths +--echo # + +DROP TABLE docs; +CREATE TABLE docs (id INT PRIMARY KEY, body TEXT, FULLTEXT(body)) ENGINE=TidesDB; +INSERT INTO docs VALUES (1, 'optimization techniques are important'); +INSERT INTO docs VALUES (2, 'optimizing queries is 
essential'); +INSERT INTO docs VALUES (3, 'the optimal solution exists'); +INSERT INTO docs VALUES (4, 'nothing related here'); + +SELECT id FROM docs +WHERE MATCH(body) AGAINST('optim*' IN BOOLEAN MODE) ORDER BY id; + +--echo # +--echo # TEST 7: Wildcard with short prefix +--echo # + +SELECT id FROM docs +WHERE MATCH(body) AGAINST('opt*' IN BOOLEAN MODE) ORDER BY id; + +--echo # +--echo # TEST 8: Two-word phrase +--echo # + +DROP TABLE docs; +CREATE TABLE docs (id INT PRIMARY KEY, body TEXT, FULLTEXT(body)) ENGINE=TidesDB; +INSERT INTO docs VALUES (1, 'database management system'); +INSERT INTO docs VALUES (2, 'management of databases'); +INSERT INTO docs VALUES (3, 'the database has good management'); + +SELECT id FROM docs +WHERE MATCH(body) AGAINST('"database management"' IN BOOLEAN MODE) ORDER BY id; + +--echo # +--echo # Cleanup +--echo # + +DROP TABLE docs; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_hidden_pk.test b/mysql-test/suite/tidesdb/t/tidesdb_hidden_pk.test new file mode 100644 index 0000000000000..f29885866b273 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_hidden_pk.test @@ -0,0 +1,65 @@ +--source include/have_tidesdb.inc +# +# Test: Tables without explicit PRIMARY KEY (hidden auto-generated row ID) +# + +--echo # +--echo # TEST 1: Basic CRUD without PK +--echo # + +CREATE TABLE t_nopk (a INT, b VARCHAR(100)) ENGINE=TidesDB; + +INSERT INTO t_nopk VALUES (1, 'one'), (2, 'two'), (3, 'three'); +INSERT INTO t_nopk VALUES (1, 'duplicate_a'); + +SELECT * FROM t_nopk ORDER BY a, b; + +--echo # +--echo # TEST 2: UPDATE and DELETE without PK +--echo # + +UPDATE t_nopk SET b = 'UPDATED' WHERE a = 2; +SELECT * FROM t_nopk WHERE a = 2; + +DELETE FROM t_nopk WHERE b = 'duplicate_a'; +SELECT * FROM t_nopk ORDER BY a; + +--echo # +--echo # TEST 3: Hidden PK with secondary index +--echo # + +CREATE TABLE t_nopk_idx (x INT, y INT, KEY(x)) ENGINE=TidesDB; +INSERT INTO t_nopk_idx 
VALUES (10, 100), (20, 200), (10, 300), (30, 400); + +SELECT y FROM t_nopk_idx WHERE x = 10 ORDER BY y; +SELECT COUNT(*) FROM t_nopk_idx; + +--echo # +--echo # TEST 4: Hidden PK with BLOB +--echo # + +CREATE TABLE t_nopk_blob (data LONGBLOB, tag VARCHAR(20)) ENGINE=TidesDB; +INSERT INTO t_nopk_blob VALUES (REPEAT('X', 50000), 'big'); +INSERT INTO t_nopk_blob VALUES (REPEAT('Y', 100), 'small'); + +SELECT tag, LENGTH(data) FROM t_nopk_blob ORDER BY tag; + +UPDATE t_nopk_blob SET data = REPEAT('Z', 60000) WHERE tag = 'big'; +SELECT tag, LENGTH(data) FROM t_nopk_blob WHERE tag = 'big'; + +--echo # +--echo # TEST 5: TRUNCATE hidden PK table +--echo # + +TRUNCATE TABLE t_nopk; +INSERT INTO t_nopk VALUES (10, 'after_truncate'); +SELECT * FROM t_nopk; + +--echo # +--echo # Cleanup +--echo # + +DROP TABLE t_nopk, t_nopk_idx, t_nopk_blob; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_index_stats.test b/mysql-test/suite/tidesdb/t/tidesdb_index_stats.test new file mode 100644 index 0000000000000..cfbdf750d90f2 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_index_stats.test @@ -0,0 +1,128 @@ +--source include/have_tidesdb.inc +# +# Tests for issue #78 (index_type reporting) and issue #74 (wrong statistics). 
+# + +--echo # +--echo # ============================================ +--echo # TEST 1: Index type reporting (issue #78) +--echo # LSM tables should show LSM, not BTREE +--echo # ============================================ +--echo # + +CREATE TABLE t_lsm ( + i INT NOT NULL PRIMARY KEY, + y INT, + KEY idx_y (y) +) ENGINE=TIDESDB USE_BTREE=0; + +SHOW KEYS FROM t_lsm; + +DROP TABLE t_lsm; + + +--echo # +--echo # ============================================ +--echo # TEST 2: BTREE tables should show BTREE +--echo # ============================================ +--echo # + +CREATE TABLE t_btree ( + i INT NOT NULL PRIMARY KEY, + y INT, + KEY idx_y (y) +) ENGINE=TIDESDB USE_BTREE=1; + +SHOW KEYS FROM t_btree; + +DROP TABLE t_btree; + + +--echo # +--echo # ============================================ +--echo # TEST 3: Default (USE_BTREE=0) shows LSM +--echo # ============================================ +--echo # + +CREATE TABLE t_default ( + i INT NOT NULL PRIMARY KEY, + y INT, + KEY idx_y (y) +) ENGINE=TIDESDB; + +SHOW KEYS FROM t_default; + +DROP TABLE t_default; + + +--echo # +--echo # ============================================ +--echo # TEST 4: ANALYZE TABLE updates rec_per_key +--echo # for non-unique secondary indexes (issue #74) +--echo # ============================================ +--echo # + +CREATE TABLE t_stats ( + id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, + k INT NOT NULL, + val VARCHAR(50), + KEY k_idx (k) +) ENGINE=TIDESDB; + +--echo # Insert 200 rows with only 2 distinct values for k +--disable_query_log +let $i = 1; +while ($i <= 200) +{ + eval INSERT INTO t_stats (k, val) VALUES ($i % 2, REPEAT('x', 20)); + inc $i; +} +--enable_query_log + +SELECT COUNT(*) AS total_rows FROM t_stats; + +--echo # Before ANALYZE, optimizer may not estimate well +EXPLAIN SELECT * FROM t_stats WHERE k = 0; + +--replace_regex /total_keys=[0-9]+/total_keys=N/ /data_size=[0-9]+/data_size=N/ /memtable=[0-9]+/memtable=N/ /read_amp=[0-9.]+/read_amp=N/ 
/cache_hit=[0-9.]+/cache_hit=N/ /avg_key=[0-9.]+/avg_key=N/ /avg_value=[0-9.]+/avg_value=N/ /sstables=[0-9]+/sstables=N/ /size=[0-9]+/size=N/ /keys=[0-9]+/keys=N/ /sampled=[0-9]+/sampled=N/ /distinct=[0-9]+/distinct=N/ /rec_per_key=[0-9]+/rec_per_key=N/ +ANALYZE TABLE t_stats; + +--echo # After ANALYZE, the optimizer should estimate ~100 rows for k=0 +EXPLAIN SELECT * FROM t_stats WHERE k = 0; + +DROP TABLE t_stats; + + +--echo # +--echo # ============================================ +--echo # TEST 5: ANALYZE with highly selective index +--echo # ============================================ +--echo # + +CREATE TABLE t_stats2 ( + id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, + code INT NOT NULL, + KEY code_idx (code) +) ENGINE=TIDESDB; + +--disable_query_log +let $i = 1; +while ($i <= 100) +{ + eval INSERT INTO t_stats2 (code) VALUES ($i); + inc $i; +} +--enable_query_log + +--replace_regex /total_keys=[0-9]+/total_keys=N/ /data_size=[0-9]+/data_size=N/ /memtable=[0-9]+/memtable=N/ /read_amp=[0-9.]+/read_amp=N/ /cache_hit=[0-9.]+/cache_hit=N/ /avg_key=[0-9.]+/avg_key=N/ /avg_value=[0-9.]+/avg_value=N/ /sstables=[0-9]+/sstables=N/ /size=[0-9]+/size=N/ /keys=[0-9]+/keys=N/ /sampled=[0-9]+/sampled=N/ /distinct=[0-9]+/distinct=N/ /rec_per_key=[0-9]+/rec_per_key=N/ +ANALYZE TABLE t_stats2; + +--echo # With 100 distinct values in 100 rows, rec_per_key should be ~1 +EXPLAIN SELECT * FROM t_stats2 WHERE code = 50; + +DROP TABLE t_stats2; + + +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_info_schema.test b/mysql-test/suite/tidesdb/t/tidesdb_info_schema.test new file mode 100644 index 0000000000000..0200636954591 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_info_schema.test @@ -0,0 +1,132 @@ +--source include/have_tidesdb.inc +# Suppress version-specific diagnostic warnings (e.g. 
4202 max_sort_length on +# newer servers when scanning information_schema) that are not part of what +# this test verifies, so the result is identical across MariaDB versions. +--disable_warnings +# +# TidesDB information_schema.TABLES size reporting +# Verify DATA_LENGTH and INDEX_LENGTH are non-zero after inserts +# + +--echo # ---- setup ---- +CREATE TABLE t_info_schema ( + id INT PRIMARY KEY, + val VARCHAR(200) +) ENGINE=TidesDB; + +INSERT INTO t_info_schema VALUES (1, REPEAT('a', 100)); +INSERT INTO t_info_schema VALUES (2, REPEAT('b', 100)); +INSERT INTO t_info_schema VALUES (3, REPEAT('c', 100)); + +--echo # ---- data_length must be non-zero ---- +let $data_len = `SELECT DATA_LENGTH FROM information_schema.TABLES + WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 't_info_schema'`; +if (!$data_len) +{ + --echo FAIL: DATA_LENGTH is 0 +} +if ($data_len) +{ + --echo OK: DATA_LENGTH > 0 +} + +--echo # ---- table_rows must reflect inserted rows ---- +let $rows = `SELECT TABLE_ROWS FROM information_schema.TABLES + WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 't_info_schema'`; +if ($rows < 3) +{ + --echo FAIL: TABLE_ROWS < 3 +} +if ($rows >= 3) +{ + --echo OK: TABLE_ROWS >= 3 +} + +--echo # ---- add secondary index and check index_length ---- +ALTER TABLE t_info_schema ADD INDEX idx_val (val); + +# force stats refresh (2s cache); real_sleep ignores the mysqltest --sleep override +--real_sleep 3 + +# touch the table so info() is called fresh +SELECT COUNT(*) FROM t_info_schema; + +let $idx_len = `SELECT INDEX_LENGTH FROM information_schema.TABLES + WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 't_info_schema'`; +if (!$idx_len) +{ + --echo FAIL: INDEX_LENGTH is 0 +} +if ($idx_len) +{ + --echo OK: INDEX_LENGTH > 0 +} + +--echo # ---- verify after bulk insert ---- +--disable_query_log +let $i = 4; +while ($i <= 200) +{ + eval INSERT INTO t_info_schema VALUES ($i, REPEAT('x', 100)); + inc $i; +} +--enable_query_log + +--real_sleep 3 +SELECT COUNT(*) FROM t_info_schema; + +let $data_len2 = `SELECT DATA_LENGTH FROM
information_schema.TABLES + WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 't_info_schema'`; +if (!$data_len2) +{ + --echo FAIL: DATA_LENGTH is 0 after bulk insert +} +if ($data_len2) +{ + --echo OK: DATA_LENGTH > 0 after bulk insert +} + +--echo # ---- create_time must be non-null ---- +let $ct = `SELECT CREATE_TIME FROM information_schema.TABLES + WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 't_info_schema'`; +if (!$ct) +{ + --echo FAIL: CREATE_TIME is NULL +} +if ($ct) +{ + --echo OK: CREATE_TIME is set +} + +--echo # ---- update_time must be non-null after DML ---- +let $ut = `SELECT UPDATE_TIME FROM information_schema.TABLES + WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 't_info_schema'`; +if (!$ut) +{ + --echo FAIL: UPDATE_TIME is NULL +} +if ($ut) +{ + --echo OK: UPDATE_TIME is set +} + +--echo # ---- update_time advances after more DML ---- +let $ut1 = `SELECT UNIX_TIMESTAMP(UPDATE_TIME) FROM information_schema.TABLES + WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 't_info_schema'`; +--real_sleep 3 +INSERT INTO t_info_schema VALUES (9999, 'timestamp_test'); +let $ut2 = `SELECT UNIX_TIMESTAMP(UPDATE_TIME) FROM information_schema.TABLES + WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 't_info_schema'`; +if ($ut2 > $ut1) +{ + --echo OK: UPDATE_TIME advanced after INSERT +} +if ($ut2 <= $ut1) +{ + --echo FAIL: UPDATE_TIME did not advance +} + +--enable_warnings +--echo # ---- cleanup ---- +DROP TABLE t_info_schema; +--source suite/tidesdb/include/cleanup_tidesdb.inc diff --git a/mysql-test/suite/tidesdb/t/tidesdb_insert_conflict.opt b/mysql-test/suite/tidesdb/t/tidesdb_insert_conflict.opt new file mode 100644 index 0000000000000..4fa69806a64ba --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_insert_conflict.opt @@ -0,0 +1 @@ +--tidesdb-pessimistic-locking=OFF diff --git a/mysql-test/suite/tidesdb/t/tidesdb_insert_conflict.test b/mysql-test/suite/tidesdb/t/tidesdb_insert_conflict.test new file mode 100644 index 0000000000000..934e06a47f822 ---
/dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_insert_conflict.test @@ -0,0 +1,55 @@ +--source include/have_tidesdb.inc +# +# Issue #83: INSERT vs INSERT conflict detection. +# Two concurrent transactions inserting the same PK should conflict. +# The second committer should get ER_LOCK_DEADLOCK (TDB_ERR_CONFLICT). +# +# NOTE: This test requires TidesDB library fix for INSERT-INSERT +# conflict detection. If it fails, the library may need updating. +# + +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); + +--echo # +--echo # Issue #83: INSERT vs INSERT conflict detection +--echo # + +CREATE TABLE t ( + a INT NOT NULL PRIMARY KEY, + b INT +) ENGINE=TidesDB; + +connect (con1, localhost, root,,); +connect (con2, localhost, root,,); + +--echo # ---- TEST: Two INSERTs with same PK ---- +connection con1; +START TRANSACTION; +INSERT INTO t VALUES (1, 10); + +connection con2; +START TRANSACTION; +INSERT INTO t VALUES (1, 500); +COMMIT; + +connection con1; +--echo # con1 should get conflict error -- con2 committed first +--error ER_LOCK_DEADLOCK,ER_ERROR_DURING_COMMIT +COMMIT; + +connection default; +--echo # con2 wins: b should be 500 +SELECT * FROM t; + +--echo # Cleanup +connection con1; +disconnect con1; +connection con2; +disconnect con2; +connection default; + +DROP TABLE t; + +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_isolation.test b/mysql-test/suite/tidesdb/t/tidesdb_isolation.test new file mode 100644 index 0000000000000..11fe611880235 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_isolation.test @@ -0,0 +1,126 @@ +--source include/have_tidesdb.inc +# +# Tests for session-level isolation level mapping. +# Verifies that SET TRANSACTION ISOLATION LEVEL is properly +# respected by the TidesDB engine (resolve_effective_isolation). 
+# + +--echo # +--echo # ============================================ +--echo # TEST 1: READ COMMITTED - sees committed data +--echo # ============================================ +--echo # + +CREATE TABLE t_iso ( + id INT NOT NULL PRIMARY KEY, + val INT +) ENGINE=TIDESDB; + +INSERT INTO t_iso VALUES (1, 10); + +connect (con1, localhost, root,,); +connection con1; +SET TRANSACTION ISOLATION LEVEL READ COMMITTED; +BEGIN; +SELECT * FROM t_iso ORDER BY id; + +connection default; +INSERT INTO t_iso VALUES (2, 20); + +--echo # con1 at READ COMMITTED should see newly committed row +connection con1; +SELECT * FROM t_iso ORDER BY id; +COMMIT; + +disconnect con1; +connection default; + + +--echo # +--echo # ============================================ +--echo # TEST 2: REPEATABLE READ - snapshot isolation +--echo # ============================================ +--echo # + +connect (con2, localhost, root,,); +connection con2; +SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; +BEGIN; +SELECT * FROM t_iso ORDER BY id; + +connection default; +INSERT INTO t_iso VALUES (3, 30); + +--echo # con2 at REPEATABLE READ should NOT see row 3 +connection con2; +SELECT * FROM t_iso ORDER BY id; +COMMIT; + +--echo # After COMMIT, new transaction should see row 3 +SELECT * FROM t_iso ORDER BY id; + +disconnect con2; +connection default; + + +--echo # +--echo # ============================================ +--echo # TEST 3: Basic DML at each isolation level +--echo # (verifies the mapping doesn't crash) +--echo # ============================================ +--echo # + +SET SESSION TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; +INSERT INTO t_iso VALUES (4, 40); +SELECT * FROM t_iso WHERE id = 4; + +SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED; +UPDATE t_iso SET val = 41 WHERE id = 4; +SELECT * FROM t_iso WHERE id = 4; + +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +UPDATE t_iso SET val = 42 WHERE id = 4; +SELECT * FROM t_iso WHERE id = 4; + +SET SESSION TRANSACTION 
ISOLATION LEVEL SERIALIZABLE; +DELETE FROM t_iso WHERE id = 4; +SELECT * FROM t_iso ORDER BY id; + +--echo # Reset to default +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; + +DROP TABLE t_iso; + + +--echo # +--echo # ============================================ +--echo # TEST 4: SNAPSHOT isolation via table option +--echo # (table uses ISOLATION_LEVEL=SNAPSHOT, session +--echo # at REPEATABLE READ should activate SNAPSHOT) +--echo # ============================================ +--echo # + +CREATE TABLE t_snap ( + id INT NOT NULL PRIMARY KEY, + val INT +) ENGINE=TIDESDB ISOLATION_LEVEL='SNAPSHOT'; + +INSERT INTO t_snap VALUES (1, 100); + +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; +BEGIN; +SELECT * FROM t_snap ORDER BY id; + +# Insert from same connection (different statement in same txn) +# The BEGIN already took a snapshot, so this tests +# that writes within the txn are visible to reads +INSERT INTO t_snap VALUES (2, 200); +SELECT * FROM t_snap ORDER BY id; +COMMIT; + +DROP TABLE t_snap; + + +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_isolation_table_option.opt b/mysql-test/suite/tidesdb/t/tidesdb_isolation_table_option.opt new file mode 100644 index 0000000000000..f983decab2ed3 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_isolation_table_option.opt @@ -0,0 +1,2 @@ +--loose-tidesdb-unified-memtable-sync-mode=NONE +--loose-tidesdb-default-sync-mode=NONE diff --git a/mysql-test/suite/tidesdb/t/tidesdb_isolation_table_option.test b/mysql-test/suite/tidesdb/t/tidesdb_isolation_table_option.test new file mode 100644 index 0000000000000..c7065b74bf081 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_isolation_table_option.test @@ -0,0 +1,51 @@ +--source include/have_tidesdb.inc +# +# A table-level ISOLATION_LEVEL option is honored when the session is at +# the SQL default of REPEATABLE READ. 
A table that leaves the option at +# the default resolves to TidesDB SNAPSHOT (InnoDB parity) and holds one +# stable snapshot for the whole transaction. A table created with +# ISOLATION_LEVEL=READ_COMMITTED sees rows committed by other sessions +# mid-transaction. +# + +CREATE TABLE t_snap (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +INSERT INTO t_snap VALUES (1, 10); + +CREATE TABLE t_rc (id INT PRIMARY KEY, v INT) + ENGINE=TidesDB `ISOLATION_LEVEL`=READ_COMMITTED; +INSERT INTO t_rc VALUES (1, 10); + +connect (con1, localhost, root,,test); +connection con1; +SET SESSION TRANSACTION ISOLATION LEVEL REPEATABLE READ; + +--echo # +--echo # default table -- the transaction holds a stable snapshot +--echo # +BEGIN; +SELECT id, v FROM t_snap ORDER BY id; +connection default; +INSERT INTO t_snap VALUES (2, 20); +connection con1; +--echo # the snapshot is stable, so the row committed afterwards is unseen +SELECT id, v FROM t_snap ORDER BY id; +COMMIT; + +--echo # +--echo # ISOLATION_LEVEL=READ_COMMITTED -- the transaction sees fresh commits +--echo # +BEGIN; +SELECT id, v FROM t_rc ORDER BY id; +connection default; +INSERT INTO t_rc VALUES (2, 20); +connection con1; +--echo # read committed sees the row committed after the transaction began +SELECT id, v FROM t_rc ORDER BY id; +COMMIT; + +connection default; +disconnect con1; +DROP TABLE t_snap, t_rc; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_json.opt b/mysql-test/suite/tidesdb/t/tidesdb_json.opt new file mode 100644 index 0000000000000..2082352df066f --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_json.opt @@ -0,0 +1 @@ +--loose-tidesdb-json-test=1 diff --git a/mysql-test/suite/tidesdb/t/tidesdb_json.test b/mysql-test/suite/tidesdb/t/tidesdb_json.test new file mode 100644 index 0000000000000..f1bc025fe4339 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_json.test @@ -0,0 +1,46 @@ +--source include/have_tidesdb.inc +--source include/not_embedded.inc + +--echo # +--echo # ============================================ +--echo # TEST: JSON querying + generated column indexing +--echo # ============================================ +--echo # + +CREATE TABLE t_json ( + id INT NOT NULL PRIMARY KEY, + data LONGTEXT, + name VARCHAR(50) AS (JSON_VALUE(data, '$.name')) PERSISTENT, + age INT AS (JSON_VALUE(data, '$.age')) PERSISTENT, + KEY idx_name (name), + KEY idx_age (age) +) ENGINE=TIDESDB; + +INSERT INTO t_json (id, data) VALUES + (1, '{"name":"Alice","age":30,"tags":["admin","dev"]}'), + (2, '{"name":"Bob","age":25,"tags":["dev"]}'), + (3, '{"name":"Carol","age":40,"tags":["finance"]}'); + +--echo # Basic JSON extraction +SELECT id, JSON_VALUE(data, '$.name') AS jname, JSON_VALUE(data, '$.age') AS jage +FROM t_json ORDER BY id; + +--echo # Generated columns reflect JSON paths +SELECT id, name, age FROM t_json ORDER BY id; + +--echo # Filter using generated columns (indexable JSON paths) +SELECT id, name, age FROM t_json WHERE name='Alice' ORDER BY id; +SELECT id, name, age FROM t_json WHERE age >= 30 ORDER BY id; + +--echo # Filter using JSON function (non-indexed expression) +SELECT id FROM t_json WHERE JSON_CONTAINS(data, '"admin"', '$.tags') ORDER BY id; + +--echo # Update JSON and verify generated columns update +UPDATE t_json SET data = JSON_SET(data, '$.age', 31) WHERE id = 1; +SELECT id, name, age FROM t_json WHERE id = 1; + +DROP TABLE 
t_json; + +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_large_blob.test b/mysql-test/suite/tidesdb/t/tidesdb_large_blob.test new file mode 100644 index 0000000000000..da3353d5773ed --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_large_blob.test @@ -0,0 +1,71 @@ +--source include/have_tidesdb.inc +# +# Test: Large BLOB/TEXT values (> 64KB, exercising klog_value_threshold) +# + +--echo # +--echo # TEST 1: Large TEXT insert and retrieval +--echo # + +CREATE TABLE t_blob (id INT PRIMARY KEY, data LONGTEXT) ENGINE=TidesDB; + +INSERT INTO t_blob VALUES (1, REPEAT('A', 1000)); +INSERT INTO t_blob VALUES (2, REPEAT('B', 65536)); +INSERT INTO t_blob VALUES (3, REPEAT('C', 262144)); + +SELECT id, LENGTH(data) FROM t_blob ORDER BY id; + +--echo # +--echo # TEST 2: Large BLOB with secondary index +--echo # + +CREATE TABLE t_blob_idx ( + id INT PRIMARY KEY, + cat INT, + payload LONGBLOB, + KEY(cat) +) ENGINE=TidesDB; + +INSERT INTO t_blob_idx VALUES (1, 10, REPEAT('X', 100000)); +INSERT INTO t_blob_idx VALUES (2, 20, REPEAT('Y', 100000)); +INSERT INTO t_blob_idx VALUES (3, 10, REPEAT('Z', 100000)); + +SELECT id, LENGTH(payload) FROM t_blob_idx WHERE cat = 10 ORDER BY id; + +--echo # +--echo # TEST 3: UPDATE large BLOB +--echo # + +UPDATE t_blob SET data = REPEAT('D', 500000) WHERE id = 2; +SELECT id, LENGTH(data) FROM t_blob WHERE id = 2; + +--echo # +--echo # TEST 4: DELETE and re-insert large BLOB +--echo # + +DELETE FROM t_blob WHERE id = 3; +INSERT INTO t_blob VALUES (3, REPEAT('E', 131072)); +SELECT id, LENGTH(data) FROM t_blob ORDER BY id; + +--echo # +--echo # TEST 5: Multiple BLOB columns +--echo # + +CREATE TABLE t_multi_blob ( + id INT PRIMARY KEY, + a LONGBLOB, + b LONGTEXT, + c MEDIUMBLOB +) ENGINE=TidesDB; + +INSERT INTO t_multi_blob VALUES (1, REPEAT('A', 80000), REPEAT('B', 80000), REPEAT('C', 40000)); +SELECT id, LENGTH(a), LENGTH(b), LENGTH(c) FROM t_multi_blob; + +--echo # 
+--echo # Cleanup +--echo # + +DROP TABLE t_blob, t_blob_idx, t_multi_blob; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_load_data.test b/mysql-test/suite/tidesdb/t/tidesdb_load_data.test new file mode 100644 index 0000000000000..f0519d89ec292 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_load_data.test @@ -0,0 +1,79 @@ +--source include/have_tidesdb.inc +# +# Test: Bulk insert path (start_bulk_insert / end_bulk_insert) +# and INSERT ... SELECT (which also uses the bulk insert hint) +# + +--echo # +--echo # TEST 1: Multi-row INSERT (triggers bulk insert path) +--echo # + +CREATE TABLE t_bulk (id INT PRIMARY KEY, name VARCHAR(100), val INT) ENGINE=TidesDB; + +INSERT INTO t_bulk VALUES + (1, 'alpha', 100), (2, 'beta', 200), (3, 'gamma', 300), + (4, 'delta', 400), (5, 'epsilon', 500); + +SELECT * FROM t_bulk ORDER BY id; + +--echo # +--echo # TEST 2: INSERT ... SELECT bulk load +--echo # + +CREATE TABLE t_source (id INT PRIMARY KEY, v VARCHAR(50)) ENGINE=TidesDB; +INSERT INTO t_source VALUES (1,'a'), (2,'b'), (3,'c'), (4,'d'), (5,'e'), + (6,'f'), (7,'g'), (8,'h'), (9,'i'), (10,'j'); + +CREATE TABLE t_dest (id INT PRIMARY KEY, v VARCHAR(50)) ENGINE=TidesDB; +INSERT INTO t_dest SELECT * FROM t_source; +SELECT COUNT(*) FROM t_dest; + +--echo # +--echo # TEST 3: Large bulk insert (200+ rows, triggers batch commit) +--echo # + +CREATE TABLE t_large (id INT PRIMARY KEY, payload VARCHAR(200)) ENGINE=TidesDB; + +--disable_query_log +let $i = 1; +while ($i <= 200) +{ + eval INSERT INTO t_large VALUES ($i, REPEAT('X', 100)); + inc $i; +} +--enable_query_log + +SELECT COUNT(*) AS total FROM t_large; +SELECT MIN(id), MAX(id) FROM t_large; + +--echo # +--echo # TEST 4: Bulk insert with secondary index +--echo # + +CREATE TABLE t_bulk_idx (id INT PRIMARY KEY, cat INT, KEY(cat)) ENGINE=TidesDB; +INSERT INTO t_bulk_idx VALUES + (1, 10), (2, 20), (3, 10), (4, 30), (5, 10), + (6, 20), (7, 10), (8, 30), 
(9, 10), (10, 20); + +SELECT COUNT(*) FROM t_bulk_idx WHERE cat = 10; +SELECT COUNT(*) FROM t_bulk_idx WHERE cat = 20; + +--echo # +--echo # TEST 5: INSERT ... SELECT between TidesDB tables +--echo # + +CREATE TABLE t_src2 (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +INSERT INTO t_src2 VALUES (1,10), (2,20), (3,30); + +CREATE TABLE t_dst2 (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +INSERT INTO t_dst2 SELECT * FROM t_src2; +SELECT * FROM t_dst2 ORDER BY id; + +--echo # +--echo # Cleanup +--echo # + +DROP TABLE t_bulk, t_source, t_dest, t_large, t_bulk_idx, t_src2, t_dst2; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_max_concurrent_flushes.opt b/mysql-test/suite/tidesdb/t/tidesdb_max_concurrent_flushes.opt new file mode 100644 index 0000000000000..2faa85f74711f --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_max_concurrent_flushes.opt @@ -0,0 +1,2 @@ +--tidesdb-flush-threads=4 +--tidesdb-max-concurrent-flushes=2 diff --git a/mysql-test/suite/tidesdb/t/tidesdb_max_concurrent_flushes.test b/mysql-test/suite/tidesdb/t/tidesdb_max_concurrent_flushes.test new file mode 100644 index 0000000000000..f12ffe3a3c9c6 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_max_concurrent_flushes.test @@ -0,0 +1,26 @@ +--source include/have_tidesdb.inc + +# This test deliberately starts the server with a misaligned configuration +# so it can assert on the resulting warning, so the MTR warning checker +# must be told to expect it. +call mtr.add_suppression("\\[TIDESDB\\] tidesdb_max_concurrent_flushes=.* is lower than tidesdb_flush_threads="); + +# +# tidesdb_max_concurrent_flushes caps in-flight memtable flushes. When 0 +# (default) the cap aligns with tidesdb_flush_threads so every configured +# worker can run. An explicit cap below the worker count is honoured but +# logs a startup warning since some flush workers will remain idle. 
+# The .opt for this test sets flush_threads=4 and max_concurrent_flushes=2 +# to exercise that warning path. +# + +SELECT @@global.tidesdb_flush_threads AS flush_threads, + @@global.tidesdb_max_concurrent_flushes AS max_concurrent_flushes; + +--echo # the server error log carries the misalignment warning +--let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err +--let SEARCH_PATTERN= tidesdb_max_concurrent_flushes=2 is lower than tidesdb_flush_threads=4 +--source include/search_pattern_in_file.inc + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_mixed_engine.test b/mysql-test/suite/tidesdb/t/tidesdb_mixed_engine.test new file mode 100644 index 0000000000000..55cf4f3014582 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_mixed_engine.test @@ -0,0 +1,76 @@ +--source include/have_tidesdb.inc +--source include/have_innodb.inc +# +# Test: Mixed-engine transactions (TidesDB + InnoDB in same transaction) +# + +--echo # +--echo # TEST 1: Cross-engine transaction commit +--echo # + +CREATE TABLE t_tdb (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +CREATE TABLE t_inn (id INT PRIMARY KEY, v INT) ENGINE=InnoDB; + +BEGIN; +INSERT INTO t_tdb VALUES (1, 100); +INSERT INTO t_inn VALUES (1, 100); +INSERT INTO t_tdb VALUES (2, 200); +INSERT INTO t_inn VALUES (2, 200); +COMMIT; + +SELECT * FROM t_tdb ORDER BY id; +SELECT * FROM t_inn ORDER BY id; + +--echo # +--echo # TEST 2: Cross-engine transaction rollback +--echo # + +BEGIN; +INSERT INTO t_tdb VALUES (3, 300); +INSERT INTO t_inn VALUES (3, 300); +ROLLBACK; + +SELECT COUNT(*) AS tdb_count FROM t_tdb; +SELECT COUNT(*) AS inn_count FROM t_inn; + +--echo # +--echo # TEST 3: Cross-engine JOIN query +--echo # + +INSERT INTO t_tdb VALUES (3, 300); +INSERT INTO t_inn VALUES (3, 999); + +SELECT a.id, a.v AS tdb_val, b.v AS inn_val +FROM t_tdb a JOIN t_inn b ON a.id = b.id +ORDER BY a.id; + +--echo # +--echo # TEST 4: INSERT ... 
SELECT across engines +--echo # + +CREATE TABLE t_tdb2 (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +INSERT INTO t_tdb2 SELECT * FROM t_inn; +SELECT * FROM t_tdb2 ORDER BY id; + +CREATE TABLE t_inn2 (id INT PRIMARY KEY, v INT) ENGINE=InnoDB; +INSERT INTO t_inn2 SELECT * FROM t_tdb; +SELECT * FROM t_inn2 ORDER BY id; + +--echo # +--echo # TEST 5: Multi-table UPDATE across engines +--echo # + +UPDATE t_tdb a JOIN t_inn b ON a.id = b.id +SET a.v = a.v + 1, b.v = b.v + 1 +WHERE a.id = 1; + +SELECT a.v AS tdb_v, b.v AS inn_v FROM t_tdb a, t_inn b WHERE a.id = 1 AND b.id = 1; + +--echo # +--echo # Cleanup +--echo # + +DROP TABLE t_tdb, t_inn, t_tdb2, t_inn2; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_mrr.test b/mysql-test/suite/tidesdb/t/tidesdb_mrr.test new file mode 100644 index 0000000000000..23d4f38626c74 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_mrr.test @@ -0,0 +1,89 @@ +--source include/have_tidesdb.inc +# +# Multi-Range Read (MRR) for TidesDB. +# +# Exercises the custom MRR path that batches and sorts point lookups from +# WHERE col IN (...) style queries. Verifies correctness with and without +# the optimizer_switch that forces MRR, across PK and secondary indexes, +# and on a large IN-list where the sort matters for locality. +# + +SET @saved_opt_switch = @@optimizer_switch; +SET optimizer_switch = 'mrr=on,mrr_sort_keys=on,mrr_cost_based=off'; + +--echo # +--echo # TEST 1: IN (...) on PK (clustered-style point lookups) +--echo # + +CREATE TABLE t_pk (id INT PRIMARY KEY, v VARCHAR(20)) ENGINE=TidesDB; +INSERT INTO t_pk VALUES (1,'a'),(2,'b'),(3,'c'),(4,'d'),(5,'e'), + (6,'f'),(7,'g'),(8,'h'),(9,'i'),(10,'j'); + +--echo # Confirm the optimizer actually picks Rowid-ordered scan (MRR). +--replace_column 6 # +EXPLAIN SELECT * FROM t_pk WHERE id IN (7, 2, 9, 3, 5); + +--echo # Unsorted IN-list; MRR must still return the right rows. 
+SELECT * FROM t_pk WHERE id IN (7, 2, 9, 3, 5) ORDER BY id; + +--echo # Mix of hits and misses -- missing IDs are silently skipped. +SELECT * FROM t_pk WHERE id IN (11, 4, 99, 1, 42) ORDER BY id; + +--echo # Single-element IN is still routed through MRR. +SELECT * FROM t_pk WHERE id IN (6); + +--echo # +--echo # TEST 2: IN (...) on a unique secondary index +--echo # + +CREATE TABLE t_uk ( + id INT PRIMARY KEY, + code INT, + v VARCHAR(20), + UNIQUE KEY u_code (code) +) ENGINE=TidesDB; +INSERT INTO t_uk VALUES (1,100,'a'),(2,200,'b'),(3,300,'c'),(4,400,'d'),(5,500,'e'); + +SELECT * FROM t_uk WHERE code IN (300, 100, 500) ORDER BY code; +SELECT * FROM t_uk WHERE code IN (999, 200, 111) ORDER BY code; + +--echo # +--echo # TEST 3: Large unsorted IN-list (sort-then-seek should still be correct) +--echo # + +CREATE TABLE t_big (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +--disable_query_log +let $i = 1; +while ($i <= 200) +{ + eval INSERT INTO t_big VALUES ($i, $i * 10); + inc $i; +} +--enable_query_log + +SELECT COUNT(*), MIN(id), MAX(id) FROM t_big + WHERE id IN (37, 199, 2, 88, 150, 1, 73, 112, 200, 5); + +--echo # EXPLAIN should mention MRR in Extra for a 10-value IN on a 200-row table. +--replace_column 6 # 9 # +EXPLAIN SELECT * FROM t_big + WHERE id IN (37, 199, 2, 88, 150, 1, 73, 112, 200, 5); + +--echo # +--echo # TEST 4: Result is consistent with / without MRR +--echo # + +SET optimizer_switch = 'mrr=off'; +SELECT * FROM t_pk WHERE id IN (7, 2, 9, 3, 5) ORDER BY id; +SET optimizer_switch = 'mrr=on,mrr_sort_keys=on,mrr_cost_based=off'; +SELECT * FROM t_pk WHERE id IN (7, 2, 9, 3, 5) ORDER BY id; + +--echo # +--echo # Cleanup +--echo # + +DROP TABLE t_pk, t_uk, t_big; +SET optimizer_switch = @saved_opt_switch; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_mvcc_concurrent_update.test b/mysql-test/suite/tidesdb/t/tidesdb_mvcc_concurrent_update.test new file mode 100644 index 0000000000000..358b98c8fd259 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_mvcc_concurrent_update.test @@ -0,0 +1,87 @@ +--source include/have_tidesdb.inc + +# +# Concurrent same-row update test for TidesDB optimistic MVCC. +# +# Drives 8 concurrent client processes against a single counter row, +# each doing 3000 iterations of a New-Order-shaped transaction: +# +# BEGIN; +# SELECT d_next_o_id ... FOR UPDATE; +# UPDATE district SET d_next_o_id = d_next_o_id + 1 ...; +# INSERT INTO txn_log (vu, ts) VALUES (vu, ts); +# COMMIT; +# +# The txn_log INSERT shares the transaction. Each transaction either +# commits both writes or rolls back both, so COUNT(*) FROM txn_log +# equals the number of committed transactions. The invariant the +# engine must preserve is: +# +# d_next_o_id - 3001 == COUNT(*) FROM txn_log +# +# If the invariant fails, the engine has acknowledged commits whose +# UPDATE writes collapsed onto the same target value -- two +# transactions read the same snapshot, both wrote read+1, both +# committed without TDB_ERR_CONFLICT firing. The test passes +# deterministically (OK) when MVCC conflict detection serialises the +# concurrent increments correctly.
+# +call mtr.add_suppression("\\[TIDESDB\\].*hton_commit: tidesdb_txn_commit returned"); +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); + +--disable_warnings +DROP TABLE IF EXISTS district; +DROP TABLE IF EXISTS txn_log; +--enable_warnings + +CREATE TABLE district ( + d_w_id INT NOT NULL, + d_id INT NOT NULL, + d_next_o_id INT NOT NULL, + PRIMARY KEY (d_w_id, d_id) +) ENGINE=TidesDB; + +CREATE TABLE txn_log ( + id BIGINT NOT NULL AUTO_INCREMENT, + vu INT NOT NULL, + ts BIGINT NOT NULL, + PRIMARY KEY (id) +) ENGINE=TidesDB; + +INSERT INTO district VALUES (1, 1, 3001); + +# 8 parallel client processes, each running 3000 increment transactions +# on the same row. Brace expansion in bash keeps MTR's $-substitution +# from rewriting the loop variables. Worker stderr is silenced because +# TDB_ERR_CONFLICT rollbacks are expected and would otherwise flood +# the test output. The vu number is interpolated into the txn_log +# INSERT so the row counts are attributable per-worker if needed. +--exec bash -c "for vu in {1..8}; do (for i in {1..3000}; do $MYSQL --no-defaults --socket=$MASTER_MYSOCK -uroot test -e \"BEGIN; SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1 FOR UPDATE; UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; INSERT INTO txn_log (vu, ts) VALUES (\$vu, UNIX_TIMESTAMP()); COMMIT;\" >/dev/null 2>&1; done) & done; wait" + +# Compute the invariant. With correct optimistic MVCC every committed +# transaction increments d_next_o_id by exactly 1 and lands one +# txn_log row. A lost update commits both writes but collapses the +# UPDATE result, so commits_logged ends up greater than counter_delta. +# +# The absolute counts vary by conflict rate so we mask them; only the +# verdict column is asserted by the .result file. OK means delta == +# commits_logged. LOST_UPDATE / PHANTOM_INCREMENT diff against OK +# and fail the test; the masked counts hide the size of the gap.
+--replace_column 1 # 2 # +SELECT + d_next_o_id - 3001 AS counter_delta, + (SELECT COUNT(*) FROM txn_log) AS commits_logged, + CASE + WHEN d_next_o_id - 3001 = (SELECT COUNT(*) FROM txn_log) + THEN 'OK' + WHEN d_next_o_id - 3001 < (SELECT COUNT(*) FROM txn_log) + THEN 'LOST_UPDATE' + ELSE 'PHANTOM_INCREMENT' + END AS verdict +FROM district WHERE d_w_id=1 AND d_id=1; + +DROP TABLE district; +DROP TABLE txn_log; + +--source suite/tidesdb/include/cleanup_tidesdb.inc diff --git a/mysql-test/suite/tidesdb/t/tidesdb_object_store.test b/mysql-test/suite/tidesdb/t/tidesdb_object_store.test new file mode 100644 index 0000000000000..b50b32d94d42b --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_object_store.test @@ -0,0 +1,111 @@ +--source include/have_tidesdb.inc +# +# Test: Object store mode (S3-compatible backend) +# +# This test is designed to run with MinIO in CI: +# --mysqld=--tidesdb_object_store_backend=S3 +# --mysqld=--tidesdb_s3_endpoint=localhost:9000 +# --mysqld=--tidesdb_s3_bucket=tidesql-test +# --mysqld=--tidesdb_s3_access_key=minioadmin +# --mysqld=--tidesdb_s3_secret_key=minioadmin +# --mysqld=--tidesdb_s3_use_ssl=OFF +# --mysqld=--tidesdb_s3_path_style=ON +# +# When run without S3 config, tests still pass using local storage. 
+# + +--echo # +--echo # TEST 1: Basic CRUD over object store +--echo # + +CREATE TABLE t_obj ( + id INT NOT NULL PRIMARY KEY, + name VARCHAR(100), + data TEXT +) ENGINE=TidesDB; + +INSERT INTO t_obj VALUES (1, 'alpha', REPEAT('A', 500)); +INSERT INTO t_obj VALUES (2, 'beta', REPEAT('B', 500)); +INSERT INTO t_obj VALUES (3, 'gamma', REPEAT('C', 500)); +INSERT INTO t_obj VALUES (4, 'delta', REPEAT('D', 500)); +INSERT INTO t_obj VALUES (5, 'epsilon', REPEAT('E', 500)); + +SELECT id, name, LENGTH(data) FROM t_obj ORDER BY id; + +--echo # +--echo # TEST 2: UPDATE and DELETE +--echo # + +UPDATE t_obj SET name = 'ALPHA', data = REPEAT('X', 1000) WHERE id = 1; +DELETE FROM t_obj WHERE id = 3; + +SELECT id, name, LENGTH(data) FROM t_obj ORDER BY id; + +--echo # +--echo # TEST 3: Secondary index over object store +--echo # + +CREATE TABLE t_idx ( + id INT NOT NULL PRIMARY KEY, + category INT NOT NULL, + val VARCHAR(200), + KEY idx_cat (category) +) ENGINE=TidesDB; + +INSERT INTO t_idx VALUES (1, 10, 'widget'), (2, 20, 'gadget'), (3, 10, 'sprocket'); +INSERT INTO t_idx VALUES (4, 30, 'gizmo'), (5, 10, 'doohickey'); + +SELECT id, val FROM t_idx WHERE category = 10 ORDER BY id; + +--echo # +--echo # TEST 4: Transaction commit and rollback +--echo # + +BEGIN; +INSERT INTO t_obj VALUES (10, 'txn_test', 'committed'); +COMMIT; + +BEGIN; +INSERT INTO t_obj VALUES (11, 'txn_rollback', 'should_not_exist'); +ROLLBACK; + +SELECT id, name FROM t_obj WHERE id >= 10 ORDER BY id; + +--echo # +--echo # TEST 5: Bulk insert (triggers flush to SSTables -> S3 upload) +--echo # + +CREATE TABLE t_bulk ( + id INT NOT NULL PRIMARY KEY, + payload VARCHAR(500) +) ENGINE=TidesDB; + +--disable_query_log +let $i = 1; +while ($i <= 200) +{ + eval INSERT INTO t_bulk VALUES ($i, REPEAT('Z', 200)); + inc $i; +} +--enable_query_log + +SELECT COUNT(*) AS bulk_count FROM t_bulk; + +--echo # +--echo # TEST 6: OPTIMIZE TABLE (triggers compaction -> S3 re-upload) +--echo # + +OPTIMIZE TABLE t_bulk; + +SELECT 
COUNT(*) AS after_optimize FROM t_bulk; + +--echo # +--echo # Cleanup +--echo # + +DROP TABLE t_obj; +DROP TABLE t_idx; +DROP TABLE t_bulk; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_online_ddl.opt b/mysql-test/suite/tidesdb/t/tidesdb_online_ddl.opt new file mode 100644 index 0000000000000..83434125bd516 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_online_ddl.opt @@ -0,0 +1 @@ +--loose-tidesdb-online-ddl-test=1 diff --git a/mysql-test/suite/tidesdb/t/tidesdb_online_ddl.test b/mysql-test/suite/tidesdb/t/tidesdb_online_ddl.test new file mode 100644 index 0000000000000..a621de0239a5c --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_online_ddl.test @@ -0,0 +1,156 @@ +--source include/have_tidesdb.inc +# +# TidesDB Online DDL tests +# Tests INSTANT, INPLACE (add/drop index), and COPY fallback +# + +--echo # ---- Setup ---- +CREATE TABLE t_ddl ( + id INT PRIMARY KEY, + a INT, + b VARCHAR(100), + c INT DEFAULT 0 +) ENGINE=TidesDB; + +INSERT INTO t_ddl VALUES (1, 10, 'alpha', 100); +INSERT INTO t_ddl VALUES (2, 20, 'beta', 200); +INSERT INTO t_ddl VALUES (3, 30, 'gamma', 300); +INSERT INTO t_ddl VALUES (4, 10, 'delta', 400); +INSERT INTO t_ddl VALUES (5, 50, 'epsilon', 500); + +--echo # ---- INSTANT: change column default ---- +ALTER TABLE t_ddl ALTER COLUMN c SET DEFAULT 999, ALGORITHM=INSTANT; +INSERT INTO t_ddl (id, a, b) VALUES (6, 60, 'zeta'); +SELECT id, c FROM t_ddl WHERE id = 6; + +--echo # ---- INSTANT: rename column ---- +ALTER TABLE t_ddl CHANGE b b_name VARCHAR(100), ALGORITHM=INSTANT; +SELECT id, b_name FROM t_ddl WHERE id = 1; + +--echo # ---- INSTANT: change table option (SYNC_MODE) ---- +ALTER TABLE t_ddl SYNC_MODE='NONE', ALGORITHM=INSTANT; +SHOW CREATE TABLE t_ddl; + +--echo # ---- INPLACE: add secondary index ---- +ALTER TABLE t_ddl ADD INDEX idx_a (a), ALGORITHM=INPLACE; +SHOW INDEX FROM t_ddl; + +--echo # Verify index is usable +SELECT id, a FROM t_ddl WHERE a = 
10 ORDER BY id; +SELECT id, a FROM t_ddl WHERE a >= 30 ORDER BY a; + +--echo # ---- INPLACE: add another index ---- +ALTER TABLE t_ddl ADD INDEX idx_c (c), ALGORITHM=INPLACE; +SHOW INDEX FROM t_ddl; +EXPLAIN SELECT id, c FROM t_ddl WHERE c = 200; +SELECT id, c FROM t_ddl WHERE c = 200; + +--echo # ---- INPLACE: drop index ---- +ALTER TABLE t_ddl DROP INDEX idx_a, ALGORITHM=INPLACE; +SHOW INDEX FROM t_ddl; + +--echo # Verify remaining index still works +SELECT id, c FROM t_ddl WHERE c = 300; + +--echo # ---- INPLACE: add + drop in one statement ---- +ALTER TABLE t_ddl ADD INDEX idx_a2 (a), DROP INDEX idx_c, ALGORITHM=INPLACE; +SHOW INDEX FROM t_ddl; +EXPLAIN SELECT id, a FROM t_ddl WHERE a = 20; +SELECT id, a FROM t_ddl WHERE a = 20; + +--echo # ---- INSTANT: add column (NOT NULL DEFAULT) ---- +ALTER TABLE t_ddl ADD COLUMN d INT NOT NULL DEFAULT 0, ALGORITHM=INSTANT; +SELECT id, d FROM t_ddl WHERE id = 1; + +--echo # ---- Verify old rows readable after ADD COLUMN ---- +SELECT id, a, b_name, c, d FROM t_ddl ORDER BY id; + +--echo # ---- Insert with new schema and verify ---- +INSERT INTO t_ddl VALUES (7, 70, 'eta', 700, 42); +SELECT id, d FROM t_ddl WHERE id IN (1, 7) ORDER BY id; + +--echo # ---- INSTANT: drop column ---- +ALTER TABLE t_ddl DROP COLUMN d, ALGORITHM=INSTANT; +SELECT * FROM t_ddl WHERE id = 1; + +--echo # ---- Verify all rows readable after DROP COLUMN ---- +SELECT id, a, b_name, c FROM t_ddl ORDER BY id; + +--echo # ---- Cleanup ---- +DROP TABLE t_ddl; + +--echo # ---- Test with data and hidden PK (no explicit PK) ---- +CREATE TABLE t_nopk ( + a INT, + b VARCHAR(50) +) ENGINE=TidesDB; + +INSERT INTO t_nopk VALUES (1, 'one'); +INSERT INTO t_nopk VALUES (2, 'two'); +INSERT INTO t_nopk VALUES (3, 'three'); + +--echo # Add index on hidden-PK table +ALTER TABLE t_nopk ADD INDEX idx_a (a), ALGORITHM=INPLACE; +SELECT a, b FROM t_nopk WHERE a = 2; + +--echo # Drop it +ALTER TABLE t_nopk DROP INDEX idx_a, ALGORITHM=INPLACE; + +DROP TABLE t_nopk; + +--echo # 
---- ADD UNIQUE must reject duplicates ---- +CREATE TABLE t_dup ( + i INT NOT NULL, + j INT NOT NULL DEFAULT 0 +) ENGINE=TidesDB; + +INSERT INTO t_dup VALUES (1, 0); +INSERT INTO t_dup VALUES (2, 0); +SELECT * FROM t_dup ORDER BY i; + +--error ER_DUP_ENTRY +ALTER TABLE t_dup ADD UNIQUE unq_j (j); + +# Both rows must still be present after the failed ALTER +SELECT * FROM t_dup ORDER BY i; +SELECT COUNT(*) FROM t_dup; + +DROP TABLE t_dup; + +--echo # ---- ADD FULLTEXT must back-fill pre-existing rows ---- +# Regression: inplace_alter_table used to skip FTS/SPATIAL keys, leaving the +# new CF empty until the next write_row. check_if_supported_inplace_alter +# now refuses ALGORITHM=INPLACE for these so MariaDB falls back to COPY, +# which routes every row through write_row. +CREATE TABLE t_ft ( + id INT PRIMARY KEY, + body VARCHAR(200) +) ENGINE=TidesDB; +INSERT INTO t_ft VALUES (1, 'tides db rocks'), (2, 'sql plugin lives'), (3, 'tides again'); + +# ALGORITHM=INPLACE must be rejected with a clear reason. +--error ER_ALTER_OPERATION_NOT_SUPPORTED_REASON +ALTER TABLE t_ft ADD FULLTEXT (body), ALGORITHM=INPLACE; + +# Default (= unspecified algorithm) must succeed via COPY and back-fill rows. 
+ALTER TABLE t_ft ADD FULLTEXT (body); +SELECT id FROM t_ft WHERE MATCH(body) AGAINST('tides') ORDER BY id; +DROP TABLE t_ft; + +--echo # ---- ADD SPATIAL must back-fill pre-existing rows ---- +CREATE TABLE t_sp ( + id INT PRIMARY KEY, + g GEOMETRY NOT NULL +) ENGINE=TidesDB; +INSERT INTO t_sp VALUES (1, ST_GeomFromText('POINT(0 0)')); +INSERT INTO t_sp VALUES (2, ST_GeomFromText('POINT(10 10)')); + +--error ER_ALTER_OPERATION_NOT_SUPPORTED_REASON +ALTER TABLE t_sp ADD SPATIAL INDEX (g), ALGORITHM=INPLACE; + +ALTER TABLE t_sp ADD SPATIAL INDEX (g); +SELECT id FROM t_sp WHERE MBRWithin(g, ST_GeomFromText('POLYGON((-1 -1, -1 5, 5 5, 5 -1, -1 -1))')) +ORDER BY id; +DROP TABLE t_sp; + +--source suite/tidesdb/include/cleanup_tidesdb.inc diff --git a/mysql-test/suite/tidesdb/t/tidesdb_options.test b/mysql-test/suite/tidesdb/t/tidesdb_options.test new file mode 100644 index 0000000000000..3c33e5b5f9979 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_options.test @@ -0,0 +1,182 @@ +--source include/have_tidesdb.inc +# +# Test suite for TIDESDB storage engine options. +# Exercises system variables and per-table CREATE TABLE options. 
+# + +--echo # +--echo # === Setup: install the TIDESDB engine plugin === +--echo # +--replace_regex /\.dll/.so/ + +--echo # +--echo # ============================================ +--echo # TEST 1: System variables - verify defaults +--echo # ============================================ +--echo # + +SHOW VARIABLES LIKE 'tidesdb_flush_threads'; +SHOW VARIABLES LIKE 'tidesdb_compaction_threads'; +SHOW VARIABLES LIKE 'tidesdb_log_level'; +SHOW VARIABLES LIKE 'tidesdb_block_cache_size'; +SHOW VARIABLES LIKE 'tidesdb_max_open_sstables'; +SHOW VARIABLES LIKE 'tidesdb_max_memory_usage'; + +--echo # +--echo # ============================================ +--echo # TEST 2: CREATE TABLE with default options +--echo # ============================================ +--echo # + +CREATE TABLE t_defaults (id INT, val VARCHAR(50)) ENGINE=TIDESDB; +SHOW CREATE TABLE t_defaults; +INSERT INTO t_defaults VALUES (1, 'default_opts'); +SELECT * FROM t_defaults; +DROP TABLE t_defaults; + +--echo # +--echo # ============================================ +--echo # TEST 3: CREATE TABLE with custom compression +--echo # ============================================ +--echo # + +CREATE TABLE t_none (id INT, val VARCHAR(50)) ENGINE=TIDESDB COMPRESSION='NONE'; +SHOW CREATE TABLE t_none; +INSERT INTO t_none VALUES (1, 'no compression'); +SELECT * FROM t_none; +DROP TABLE t_none; + +CREATE TABLE t_zstd (id INT, val VARCHAR(50)) ENGINE=TIDESDB COMPRESSION='ZSTD'; +SHOW CREATE TABLE t_zstd; +INSERT INTO t_zstd VALUES (1, 'zstd compressed'); +SELECT * FROM t_zstd; +DROP TABLE t_zstd; + +--echo # +--echo # ============================================ +--echo # TEST 4: CREATE TABLE with custom bloom filter +--echo # ============================================ +--echo # + +CREATE TABLE t_nobloom (id INT, val VARCHAR(50)) ENGINE=TIDESDB BLOOM_FILTER=0; +SHOW CREATE TABLE t_nobloom; +INSERT INTO t_nobloom VALUES (1, 'no bloom'); +SELECT * FROM t_nobloom; +DROP TABLE t_nobloom; + +CREATE TABLE t_lowfpr (id 
INT, val VARCHAR(50)) ENGINE=TIDESDB BLOOM_FPR=10; +SHOW CREATE TABLE t_lowfpr; +INSERT INTO t_lowfpr VALUES (1, 'low fpr 0.1%'); +SELECT * FROM t_lowfpr; +DROP TABLE t_lowfpr; + +--echo # +--echo # ============================================ +--echo # TEST 5: CREATE TABLE with custom write buffer +--echo # ============================================ +--echo # + +CREATE TABLE t_bigbuf (id INT, val VARCHAR(50)) ENGINE=TIDESDB WRITE_BUFFER_SIZE=16777216; +SHOW CREATE TABLE t_bigbuf; +INSERT INTO t_bigbuf VALUES (1, '16MB write buffer'); +SELECT * FROM t_bigbuf; +DROP TABLE t_bigbuf; + +--echo # +--echo # ============================================ +--echo # TEST 6: CREATE TABLE with sync mode options +--echo # ============================================ +--echo # + +CREATE TABLE t_syncnone (id INT) ENGINE=TIDESDB SYNC_MODE='NONE'; +SHOW CREATE TABLE t_syncnone; +INSERT INTO t_syncnone VALUES (1); +SELECT * FROM t_syncnone; +DROP TABLE t_syncnone; + +CREATE TABLE t_syncint (id INT) ENGINE=TIDESDB SYNC_MODE='INTERVAL' SYNC_INTERVAL_US=500000; +SHOW CREATE TABLE t_syncint; +INSERT INTO t_syncint VALUES (1); +SELECT * FROM t_syncint; +DROP TABLE t_syncint; + +--echo # +--echo # ============================================ +--echo # TEST 7: CREATE TABLE with isolation level +--echo # ============================================ +--echo # + +CREATE TABLE t_rc (id INT, val VARCHAR(50)) ENGINE=TIDESDB ISOLATION_LEVEL='READ_COMMITTED'; +SHOW CREATE TABLE t_rc; +INSERT INTO t_rc VALUES (1, 'read committed'); +SELECT * FROM t_rc; +DROP TABLE t_rc; + +CREATE TABLE t_ser (id INT, val VARCHAR(50)) ENGINE=TIDESDB ISOLATION_LEVEL='SERIALIZABLE'; +SHOW CREATE TABLE t_ser; +INSERT INTO t_ser VALUES (1, 'serializable'); +SELECT * FROM t_ser; +DROP TABLE t_ser; + +--echo # +--echo # ============================================ +--echo # TEST 8: CREATE TABLE with B+tree format +--echo # ============================================ +--echo # + +CREATE TABLE t_btree (id INT, val 
VARCHAR(50)) ENGINE=TIDESDB USE_BTREE=1; +SHOW CREATE TABLE t_btree; +INSERT INTO t_btree VALUES (1, 'btree format'); +SELECT * FROM t_btree; +DROP TABLE t_btree; + +--echo # +--echo # ============================================ +--echo # TEST 9: CREATE TABLE with multiple options +--echo # ============================================ +--echo # + +CREATE TABLE t_multi ( + id INT, + val VARCHAR(100) +) ENGINE=TIDESDB + COMPRESSION='ZSTD' + WRITE_BUFFER_SIZE=8388608 + BLOOM_FILTER=1 + BLOOM_FPR=50 + BLOCK_INDEXES=1 + SYNC_MODE='FULL' + ISOLATION_LEVEL='REPEATABLE_READ' + LEVEL_SIZE_RATIO=8 + MIN_LEVELS=3 + SKIP_LIST_MAX_LEVEL=16 + SKIP_LIST_PROBABILITY=50; + +SHOW CREATE TABLE t_multi; +INSERT INTO t_multi VALUES (1, 'multi-option table'); +INSERT INTO t_multi VALUES (2, 'second row'); +SELECT * FROM t_multi; +UPDATE t_multi SET val = 'updated' WHERE id = 1; +SELECT * FROM t_multi; +DELETE FROM t_multi WHERE id = 2; +SELECT * FROM t_multi; +DROP TABLE t_multi; + +--echo # +--echo # ============================================ +--echo # TEST 10: Default isolation is REPEATABLE_READ +--echo # ============================================ +--echo # + +CREATE TABLE t_default_iso (id INT) ENGINE=TIDESDB; +SHOW CREATE TABLE t_default_iso; +INSERT INTO t_default_iso VALUES (1), (2), (3); +SELECT * FROM t_default_iso; +DROP TABLE t_default_iso; + +--echo # +--echo # + + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_partition.test b/mysql-test/suite/tidesdb/t/tidesdb_partition.test new file mode 100644 index 0000000000000..b479e92ce622e --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_partition.test @@ -0,0 +1,230 @@ +--source include/have_tidesdb.inc +--source include/not_embedded.inc +--source include/have_partition.inc + + +--echo # +--echo # ============================================ +--echo # TEST 1: HASH partitioning +--echo # ============================================ +--echo # + +CREATE TABLE t_hash ( + id INT NOT NULL, + val VARCHAR(50), + PRIMARY KEY (id) +) ENGINE=TIDESDB +PARTITION BY HASH(id) PARTITIONS 4; + +INSERT INTO t_hash VALUES (1,'a'),(2,'b'),(3,'c'),(4,'d'),(5,'e'),(6,'f'),(7,'g'),(8,'h'); + +SELECT * FROM t_hash ORDER BY id; +SELECT COUNT(*) AS total FROM t_hash; + +--echo # Update across potential partition boundary +UPDATE t_hash SET val = 'updated' WHERE id = 3; +SELECT * FROM t_hash WHERE id = 3; + +--echo # Delete +DELETE FROM t_hash WHERE id IN (2, 5); +SELECT * FROM t_hash ORDER BY id; + +DROP TABLE t_hash; + +--echo # +--echo # ============================================ +--echo # TEST 2: KEY partitioning +--echo # ============================================ +--echo # + +CREATE TABLE t_key ( + id INT NOT NULL, + name VARCHAR(50), + PRIMARY KEY (id) +) ENGINE=TIDESDB +PARTITION BY KEY(id) PARTITIONS 3; + +INSERT INTO t_key VALUES (1,'alice'),(2,'bob'),(3,'charlie'),(4,'dave'),(5,'eve'),(6,'frank'); + +SELECT * FROM t_key ORDER BY id; + +DELETE FROM t_key WHERE id = 4; +SELECT * FROM t_key ORDER BY id; + +DROP TABLE t_key; + +--echo # +--echo # ============================================ +--echo # TEST 3: RANGE partitioning +--echo # ============================================ +--echo # + +CREATE TABLE t_range ( + id INT NOT NULL, + val VARCHAR(50), + PRIMARY KEY (id) +) ENGINE=TIDESDB +PARTITION BY RANGE(id) ( + PARTITION p0 VALUES LESS THAN (10), + PARTITION p1 VALUES LESS THAN (20), + 
PARTITION p2 VALUES LESS THAN (30), + PARTITION p3 VALUES LESS THAN MAXVALUE +); + +INSERT INTO t_range VALUES (1,'r0'),(5,'r0'),(9,'r0'); +INSERT INTO t_range VALUES (10,'r1'),(15,'r1'),(19,'r1'); +INSERT INTO t_range VALUES (20,'r2'),(25,'r2'); +INSERT INTO t_range VALUES (30,'r3'),(50,'r3'),(100,'r3'); + +SELECT * FROM t_range ORDER BY id; +SELECT COUNT(*) AS total FROM t_range; + +--echo # Query that should hit only partition p1 +SELECT * FROM t_range WHERE id >= 10 AND id < 20 ORDER BY id; + +--echo # Delete from specific range +DELETE FROM t_range WHERE id >= 20 AND id < 30; +SELECT * FROM t_range ORDER BY id; + +--echo # Update across range boundary +UPDATE t_range SET val = 'moved' WHERE id = 5; +SELECT * FROM t_range WHERE id = 5; + +DROP TABLE t_range; + +--echo # +--echo # ============================================ +--echo # TEST 4: LIST partitioning +--echo # ============================================ +--echo # + +CREATE TABLE t_list ( + id INT NOT NULL, + region INT NOT NULL, + name VARCHAR(50), + PRIMARY KEY (id, region) +) ENGINE=TIDESDB +PARTITION BY LIST(region) ( + PARTITION p_east VALUES IN (1, 2, 3), + PARTITION p_west VALUES IN (4, 5, 6), + PARTITION p_central VALUES IN (7, 8, 9) +); + +INSERT INTO t_list VALUES (1,1,'NY'),(2,2,'NJ'),(3,3,'CT'); +INSERT INTO t_list VALUES (4,4,'CA'),(5,5,'OR'),(6,6,'WA'); +INSERT INTO t_list VALUES (7,7,'IL'),(8,8,'OH'),(9,9,'MI'); + +SELECT * FROM t_list ORDER BY id; + +--echo # Query specific list partition +SELECT * FROM t_list WHERE region IN (4,5,6) ORDER BY id; + +DELETE FROM t_list WHERE region = 8; +SELECT * FROM t_list ORDER BY id; + +DROP TABLE t_list; + +--echo # +--echo # ============================================ +--echo # TEST 5: RANGE COLUMNS partitioning +--echo # ============================================ +--echo # + +CREATE TABLE t_range_col ( + id INT NOT NULL, + created DATE NOT NULL, + val VARCHAR(50), + PRIMARY KEY (id, created) +) ENGINE=TIDESDB +PARTITION BY RANGE 
COLUMNS(created) ( + PARTITION p_2024 VALUES LESS THAN ('2025-01-01'), + PARTITION p_2025 VALUES LESS THAN ('2026-01-01'), + PARTITION p_future VALUES LESS THAN MAXVALUE +); + +INSERT INTO t_range_col VALUES (1,'2024-06-15','old'),(2,'2024-12-31','old'); +INSERT INTO t_range_col VALUES (3,'2025-03-10','current'),(4,'2025-11-20','current'); +INSERT INTO t_range_col VALUES (5,'2026-05-01','future'); + +SELECT * FROM t_range_col ORDER BY created; + +--echo # Query specific partition by date range +SELECT * FROM t_range_col WHERE created >= '2025-01-01' AND created < '2026-01-01' ORDER BY id; + +DROP TABLE t_range_col; + +--echo # +--echo # ============================================ +--echo # TEST 6: Partition with secondary index +--echo # ============================================ +--echo # + +CREATE TABLE t_part_idx ( + id INT NOT NULL, + category INT, + name VARCHAR(50), + PRIMARY KEY (id), + KEY idx_cat (category) +) ENGINE=TIDESDB +PARTITION BY HASH(id) PARTITIONS 3; + +INSERT INTO t_part_idx VALUES (1,10,'a'),(2,20,'b'),(3,10,'c'),(4,30,'d'),(5,20,'e'),(6,10,'f'); + +--echo # Scan via secondary index across partitions +SELECT * FROM t_part_idx WHERE category = 10 ORDER BY id; +SELECT * FROM t_part_idx WHERE category = 20 ORDER BY id; + +DROP TABLE t_part_idx; + +--echo # +--echo # ============================================ +--echo # TEST 7: ALTER TABLE add/drop partition (RANGE) +--echo # ============================================ +--echo # + +CREATE TABLE t_alter_part ( + id INT NOT NULL, + val VARCHAR(50), + PRIMARY KEY (id) +) ENGINE=TIDESDB +PARTITION BY RANGE(id) ( + PARTITION p0 VALUES LESS THAN (100), + PARTITION p1 VALUES LESS THAN (200) +); + +INSERT INTO t_alter_part VALUES (1,'lo'),(50,'lo'),(100,'hi'),(150,'hi'); +SELECT * FROM t_alter_part ORDER BY id; + +--echo # Add a new partition +ALTER TABLE t_alter_part ADD PARTITION (PARTITION p2 VALUES LESS THAN MAXVALUE); + +INSERT INTO t_alter_part VALUES (200,'new'),(300,'new'); +SELECT * FROM 
t_alter_part ORDER BY id; + +--echo # Drop a partition (removes data in that range) +ALTER TABLE t_alter_part DROP PARTITION p1; +SELECT * FROM t_alter_part ORDER BY id; + +DROP TABLE t_alter_part; + +--echo # +--echo # ============================================ +--echo # TEST 8: SHOW CREATE TABLE with partitions +--echo # ============================================ +--echo # + +CREATE TABLE t_show_part ( + id INT NOT NULL, + val VARCHAR(50), + PRIMARY KEY (id) +) ENGINE=TIDESDB +PARTITION BY HASH(id) PARTITIONS 2; + +SHOW CREATE TABLE t_show_part; + +DROP TABLE t_show_part; + +--echo # +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_per_index_btree.test b/mysql-test/suite/tidesdb/t/tidesdb_per_index_btree.test new file mode 100644 index 0000000000000..ec401167b522d --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_per_index_btree.test @@ -0,0 +1,47 @@ +--source include/have_tidesdb.inc +# +# Issue #79: Per-index USE_BTREE option +# + +--echo # +--echo # TEST 1: Per-index USE_BTREE on secondary index +--echo # + +CREATE TABLE t1 ( + id INT NOT NULL PRIMARY KEY, + a INT, + b INT, + KEY idx_a (a) USE_BTREE=1, + KEY idx_b (b) +) ENGINE=TidesDB; + +INSERT INTO t1 VALUES (1,10,100),(2,20,200),(3,30,300); + +--echo # idx_a should show BTREE, idx_b should show LSM +SHOW KEYS FROM t1; + +SELECT * FROM t1 WHERE a = 20; +SELECT * FROM t1 WHERE b = 200; + +DROP TABLE t1; + +--echo # +--echo # TEST 2: Table-level USE_BTREE=1 with per-index override +--echo # + +CREATE TABLE t2 ( + id INT NOT NULL PRIMARY KEY, + x INT, + KEY idx_x (x) USE_BTREE=0 +) ENGINE=TidesDB USE_BTREE=1; + +--echo # PK and idx_x should both show BTREE (table default), but idx_x USE_BTREE=0 +--echo # Note: per-index USE_BTREE=0 does NOT override table-level to LSM -- it just +--echo # means the index itself didn't request BTREE; the table default still applies. 
+SHOW KEYS FROM t2; + +DROP TABLE t2; + +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_chain_bounded.opt b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_chain_bounded.opt new file mode 100644 index 0000000000000..5edec0cc9eaee --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_chain_bounded.opt @@ -0,0 +1 @@ +--tidesdb-pessimistic-locking=ON diff --git a/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_chain_bounded.test b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_chain_bounded.test new file mode 100644 index 0000000000000..6449c55c04c08 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_chain_bounded.test @@ -0,0 +1,113 @@ +--source include/have_tidesdb.inc +# +# Test that the pessimistic-lock hash chain stays bounded under churn. +# Insert and delete several thousand distinct primary keys across two +# sessions; lock entries should travel onto the partition freelist on +# release and be reused on the next acquire, so tidesdb_lock_chain_max +# must not grow anywhere near the count of keys touched. +# +# Pre-fix this test would have driven chain_max into the hundreds. The +# bound below is generous so transient hash skew from a hot key does +# not flake the test; the real signal is "tens, not thousands." +# + +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); + +CREATE TABLE churn ( + id INT NOT NULL PRIMARY KEY, + val INT +) ENGINE=TidesDB; + +# Snapshot the gauge so prior tests in the suite cannot leak into us. +--disable_query_log +SET @chain_max_before = (SELECT VARIABLE_VALUE + FROM information_schema.global_status + WHERE VARIABLE_NAME = 'TIDESDB_LOCK_CHAIN_MAX'); +--enable_query_log + +connect (conA, localhost, root,,); +connect (conB, localhost, root,,); + +--echo # +--echo # Each session churns 2500 unique PKs in batches of 50. 
Every +--echo # batch commits, releasing all its row locks; the next batch +--echo # acquires fresh locks that should land on freelisted slots. +--echo # + +--disable_query_log +let $batch = 0; +while ($batch < 50) +{ + connection conA; + BEGIN; + let $i = 0; + while ($i < 50) + { + eval INSERT INTO churn VALUES ($batch * 100 + $i, $i); + inc $i; + } + COMMIT; + + connection conB; + BEGIN; + let $i = 0; + while ($i < 50) + { + eval INSERT INTO churn VALUES ($batch * 100 + 50 + $i, $i); + inc $i; + } + COMMIT; + + connection conA; + BEGIN; + eval DELETE FROM churn WHERE id >= $batch * 100 AND id < $batch * 100 + 50; + COMMIT; + + connection conB; + BEGIN; + eval DELETE FROM churn WHERE id >= $batch * 100 + 50 AND id < $batch * 100 + 100; + COMMIT; + + inc $batch; +} +--enable_query_log + +connection default; + +# 5000 distinct keys went through the lock table. Assert chain_max +# did not grow more than a small bound; that is only possible if +# released slots were unlinked from the chain. +--disable_query_log +SET @chain_max_after = (SELECT VARIABLE_VALUE + FROM information_schema.global_status + WHERE VARIABLE_NAME = 'TIDESDB_LOCK_CHAIN_MAX'); +SET @chain_grew = CAST(@chain_max_after AS SIGNED) - CAST(@chain_max_before AS SIGNED); + +# Sanity: should be a small positive number, not thousands. +SET @verdict = IF(@chain_grew <= 64, 'CHAIN_BOUNDED', CONCAT('CHAIN_GREW_TO_', @chain_grew)); +--enable_query_log + +SELECT @verdict; + +# Counts: 2500 + 2500 inserted, all deleted, table empty. +SELECT COUNT(*) FROM churn; + +# Sanity that the freelist actually got exercised. recycles > 0 means +# the next acquire after a release reused a slot rather than mallocing +# a fresh entry, which is the whole point. 
+--disable_query_log +SET @recycles = (SELECT VARIABLE_VALUE + FROM information_schema.global_status + WHERE VARIABLE_NAME = 'TIDESDB_LOCK_ENTRY_RECYCLES'); +SET @recycled_some = IF(CAST(@recycles AS UNSIGNED) > 0, 'RECYCLED', 'NO_RECYCLE'); +--enable_query_log + +SELECT @recycled_some; + +disconnect conA; +disconnect conB; + +DROP TABLE churn; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_deadlock_cycle.test b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_deadlock_cycle.test new file mode 100644 index 0000000000000..acafae3e5092f --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_deadlock_cycle.test @@ -0,0 +1,76 @@ +--source include/have_tidesdb.inc +# +# Exercises the pessimistic lock manager's wait-for graph traversal by +# forcing a real two-row cycle. T1 holds X on row 1 and waits for X on +# row 2; T2 holds X on row 2 and asks for X on row 1. The walker on T2's +# acquire must follow T1's wait edge back to T2's grant and return +# HA_ERR_LOCK_DEADLOCK, which MariaDB surfaces as ER_LOCK_DEADLOCK. +# +# The test is timing sensitive: T2's walker only sees the cycle if T1's +# --send UPDATE has actually reached the wait loop and published its +# waiting_on_lock pointer. We poll the Tidesdb_lock_waits counter to +# observe T1 entering the wait state before firing T2's query, which +# makes the cycle deterministic. +# + +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); + +CREATE TABLE c ( + id INT PRIMARY KEY, + v INT NOT NULL +) ENGINE=TidesDB; + +INSERT INTO c VALUES (1, 10), (2, 20); + +connect (a, localhost, root,,); +connect (b, localhost, root,,); + +connection a; +BEGIN; +UPDATE c SET v = v + 1 WHERE id = 1; + +connection b; +BEGIN; +UPDATE c SET v = v + 1 WHERE id = 2; + +# Snapshot the lock-wait counter so we can detect T1 entering cond_wait. 
+connection default; +--disable_query_log +let $waits_before = `SELECT VARIABLE_VALUE+0 FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='TIDESDB_LOCK_WAITS'`; +--enable_query_log + +connection a; +--send UPDATE c SET v = v + 1 WHERE id = 2 + +# Wait until a's UPDATE has actually blocked in row_lock_acquire. Until +# Tidesdb_lock_waits has incremented, a's waiting_on_lock is still null +# and b's walker would not see a cycle. +connection default; +let $wait_condition = + SELECT (VARIABLE_VALUE+0) > $waits_before + FROM information_schema.GLOBAL_STATUS + WHERE VARIABLE_NAME = 'TIDESDB_LOCK_WAITS'; +--source include/wait_condition.inc + +# T1 is now parked on row 2; T2's request for row 1 closes the cycle. +connection b; +--error ER_LOCK_DEADLOCK +UPDATE c SET v = v + 1 WHERE id = 1; +ROLLBACK; + +# T2's rollback released its X on row 2, so T1's --send completes. +connection a; +--reap +COMMIT; + +connection default; +--echo # Row 1 incremented by T1 only (T2 aborted); row 2 incremented by T1 only. +SELECT * FROM c ORDER BY id; + +disconnect a; +disconnect b; +connection default; +DROP TABLE c; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_forupdate.opt b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_forupdate.opt new file mode 100644 index 0000000000000..5edec0cc9eaee --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_forupdate.opt @@ -0,0 +1 @@ +--tidesdb-pessimistic-locking=ON diff --git a/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_forupdate.test b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_forupdate.test new file mode 100644 index 0000000000000..4cf8efd43c3ca --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_forupdate.test @@ -0,0 +1,127 @@ +--source include/have_tidesdb.inc +# +# Test: Pessimistic locking with SELECT ... 
FOR UPDATE +# Verifies that the store_lock() fix correctly detects FOR UPDATE +# and acquires pessimistic row locks, serializing concurrent +# read-modify-write cycles on the same row (TPC-C NEWORD pattern). +# +# With pessimistic_locking=ON: +# - Both connections should succeed (serialized via row lock) +# - Counter should increment by exactly 2 (no lost updates) +# - Zero conflict errors +# + +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); + +# pessimistic_locking=ON is set via .opt file + +--echo # +--echo # Setup: TPC-C district-like table +--echo # + +CREATE TABLE district ( + d_w_id INT NOT NULL, + d_id INT NOT NULL, + d_next_o_id INT NOT NULL, + d_tax DECIMAL(4,4), + PRIMARY KEY (d_w_id, d_id) +) ENGINE=TidesDB; + +INSERT INTO district VALUES (1, 1, 3001, 0.1000); + +--echo # +--echo # TEST 1: Two concurrent SELECT FOR UPDATE + UPDATE +--echo # on the same row. Both should succeed with pessimistic +--echo # locking serializing access. Counter = 3001 + 2 = 3003 +--echo # + +connect (conA, localhost, root,,); +connect (conB, localhost, root,,); + +connection conA; +BEGIN; +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1 FOR UPDATE; + +connection conB; +# conB's UPDATE should block on the pessimistic row lock until conA commits +--send UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1 + +connection conA; +UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; +COMMIT; + +connection conB; +--reap + +connection default; +--echo # Both succeeded: 3001 + 1 (conA) + 1 (conB) = 3003 +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1; + +--echo # +--echo # TEST 2: Stored procedure with SELECT FOR UPDATE +--echo # Mimics TPC-C NEWORD pattern inside a CALL +--echo # + +DELIMITER |; +CREATE PROCEDURE neword_mini(IN p_w_id INT, IN p_d_id INT) +BEGIN + DECLARE v_next_o_id INT; + SELECT d_next_o_id INTO v_next_o_id + FROM district WHERE d_w_id = p_w_id AND d_id = p_d_id FOR UPDATE; + UPDATE 
district SET d_next_o_id = v_next_o_id + 1 + WHERE d_w_id = p_w_id AND d_id = p_d_id; +END| +DELIMITER ;| + +UPDATE district SET d_next_o_id = 5001 WHERE d_w_id=1 AND d_id=1; + +connection conA; +BEGIN; +CALL neword_mini(1, 1); + +connection conB; +--send CALL neword_mini(1, 1) + +connection conA; +COMMIT; + +connection conB; +--reap + +connection default; +--echo # Both CALL succeeded: 5001 + 1 + 1 = 5003 +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1; + +--echo # +--echo # TEST 3: Serial counter increment (10 iterations) +--echo # + +UPDATE district SET d_next_o_id = 6001 WHERE d_w_id=1 AND d_id=1; + +--disable_query_log +let $i = 0; +while ($i < 10) +{ + CALL neword_mini(1, 1); + inc $i; +} +--enable_query_log + +--echo # Should be 6001 + 10 = 6011 +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1; + +--echo # +--echo # Cleanup +--echo # + +disconnect conA; +disconnect conB; +connection default; + +DROP PROCEDURE neword_mini; +DROP TABLE district; +# Note: pessimistic_locking was set to ON via .opt file; +# we leave it ON so MTR state check does not complain. + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_insert_lock.opt b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_insert_lock.opt new file mode 100644 index 0000000000000..5edec0cc9eaee --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_insert_lock.opt @@ -0,0 +1 @@ +--tidesdb-pessimistic-locking=ON diff --git a/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_insert_lock.test b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_insert_lock.test new file mode 100644 index 0000000000000..281e479ef31fa --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_insert_lock.test @@ -0,0 +1,227 @@ +--source include/have_tidesdb.inc +# +# Test: Pessimistic locking edge cases from GitHub issue +# +# Covers: +# 1. Non-existing rows can be locked by SELECT FOR UPDATE +# 2. 
DELETE and UPDATE acquire locks (not just SELECT FOR UPDATE) +# 3. INSERT respects locks held on the same PK +# 4. INSERT on a non-existing locked key blocks correctly +# 5. Concurrent INSERTs on different keys do not block +# + +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); + +--echo # +--echo # Setup +--echo # + +CREATE TABLE t ( + i INT, + PRIMARY KEY (i) +) ENGINE=TidesDB; + +INSERT INTO t VALUES (1), (2), (3), (4), (5); + +connect (conA, localhost, root,,); +connect (conB, localhost, root,,); + +--echo # +--echo # TEST 1: SELECT FOR UPDATE on non-existing row blocks DELETE +--echo # Connection A locks i=15 (does not exist). +--echo # Connection B deletes i=2 (succeeds immediately), +--echo # then tries to delete i=15 (must block). +--echo # + +connection conA; +BEGIN; +SELECT * FROM t WHERE i = 15 FOR UPDATE; + +connection conB; +DELETE FROM t WHERE i = 2; +--send DELETE FROM t WHERE i = 15 + +connection conA; +--sleep 0.5 +COMMIT; + +connection conB; +--reap + +connection default; +--echo # i=2 and i=15 both deleted (i=15 was no-op but lock was respected) +SELECT * FROM t ORDER BY i; + +--echo # +--echo # TEST 2: DELETE acquires a lock that blocks another DELETE +--echo # Connection A deletes i=3 inside a transaction. +--echo # Connection B deletes i=4 (succeeds immediately), +--echo # then tries to delete i=3 (must block until A commits). 
+--echo # + +DROP TABLE t; +CREATE TABLE t (i INT, PRIMARY KEY (i)) ENGINE=TidesDB; +INSERT INTO t VALUES (1), (2), (3), (4), (5); + +connection conA; +BEGIN; +DELETE FROM t WHERE i = 3; + +connection conB; +DELETE FROM t WHERE i = 4; +--send DELETE FROM t WHERE i = 3 + +connection conA; +--sleep 0.5 +COMMIT; + +connection conB; +--reap + +connection default; +--echo # i=3 and i=4 both deleted +SELECT * FROM t ORDER BY i; + +--echo # +--echo # TEST 3: UPDATE acquires a lock that blocks another UPDATE +--echo # + +DROP TABLE t; +CREATE TABLE t (i INT, v INT, PRIMARY KEY (i)) ENGINE=TidesDB; +INSERT INTO t VALUES (1, 10), (2, 20), (3, 30); + +connection conA; +BEGIN; +UPDATE t SET v = 99 WHERE i = 3; + +connection conB; +UPDATE t SET v = 88 WHERE i = 2; +--send UPDATE t SET v = 77 WHERE i = 3 + +connection conA; +--sleep 0.5 +COMMIT; + +connection conB; +--reap + +connection default; +--echo # conA set v=99, then conB overwrote with v=77 +SELECT * FROM t ORDER BY i; + +--echo # +--echo # TEST 4: INSERT blocked by SELECT FOR UPDATE on non-existing key +--echo # This is the critical fix -- previously INSERT bypassed the lock. +--echo # Connection A does SELECT FOR UPDATE on i=15 (non-existing). +--echo # Connection B tries INSERT i=15 (must block until A commits). +--echo # + +DROP TABLE t; +CREATE TABLE t (i INT, PRIMARY KEY (i)) ENGINE=TidesDB; +INSERT INTO t VALUES (1), (2), (3), (4), (5); + +connection conA; +BEGIN; +SELECT * FROM t WHERE i = 15 FOR UPDATE; + +connection conB; +--send INSERT INTO t VALUES (15) + +connection conA; +--sleep 0.5 +COMMIT; + +connection conB; +--reap + +connection default; +--echo # i=15 now exists (inserted by conB after conA released the lock) +SELECT * FROM t WHERE i >= 10 ORDER BY i; + +--echo # +--echo # TEST 5: INSERT blocked by DELETE on existing row +--echo # Connection A deletes i=3 inside a transaction. +--echo # Connection B tries to INSERT i=3 (must block). 
+--echo # + +DROP TABLE t; +CREATE TABLE t (i INT, PRIMARY KEY (i)) ENGINE=TidesDB; +INSERT INTO t VALUES (1), (2), (3), (4), (5); + +connection conA; +BEGIN; +DELETE FROM t WHERE i = 3; + +connection conB; +--send INSERT INTO t VALUES (3) + +connection conA; +--sleep 0.5 +COMMIT; + +connection conB; +--reap + +connection default; +--echo # i=3 was deleted by conA, then re-inserted by conB +SELECT * FROM t ORDER BY i; + +--echo # +--echo # TEST 6: Concurrent INSERTs on different keys do not block +--echo # + +DROP TABLE t; +CREATE TABLE t (i INT, PRIMARY KEY (i)) ENGINE=TidesDB; + +connection conA; +BEGIN; +INSERT INTO t VALUES (100); + +connection conB; +INSERT INTO t VALUES (200); + +connection conA; +COMMIT; + +connection default; +--echo # Both inserts succeeded without blocking +SELECT * FROM t ORDER BY i; + +--echo # +--echo # TEST 7: Autocommit UPDATE blocked by SELECT FOR UPDATE +--echo # + +DROP TABLE t; +CREATE TABLE t (i INT, PRIMARY KEY (i)) ENGINE=TidesDB; +INSERT INTO t VALUES (1), (2), (3), (4), (5); + +connection conA; +BEGIN; +SELECT * FROM t WHERE i = 3 FOR UPDATE; + +connection conB; +--send UPDATE t SET i = 33 WHERE i = 3 + +connection conA; +--sleep 0.5 +COMMIT; + +connection conB; +--reap + +connection default; +--echo # conA released lock, then conB's autocommit UPDATE renamed i=3 to i=33 +SELECT * FROM t ORDER BY i; + +--echo # +--echo # Cleanup +--echo # + +disconnect conA; +disconnect conB; +connection default; + +DROP TABLE t; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_killwait.test b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_killwait.test new file mode 100644 index 0000000000000..d05366fbcb356 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_killwait.test @@ -0,0 +1,62 @@ +--source include/have_tidesdb.inc +# +# KILL QUERY during a pessimistic lock wait. T1 holds X on row 1. 
T2 +# fires an UPDATE on the same row, which blocks in row_lock_acquire's +# cond_wait. KILL QUERY wakes T2 via the +# handlerton-level kill_query callback, the wait loop observes +# thd_killed(), and the statement aborts. +# + +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); + +CREATE TABLE k ( + id INT PRIMARY KEY, + v INT NOT NULL +) ENGINE=TidesDB; + +INSERT INTO k VALUES (1, 100); + +connect (a, localhost, root,,); +connect (b, localhost, root,,); +connect (killer, localhost, root,,); + +connection a; +BEGIN; +UPDATE k SET v = v + 1 WHERE id = 1; + +connection b; +let $b_id = `SELECT CONNECTION_ID()`; +BEGIN; +--send UPDATE k SET v = v + 1 WHERE id = 1 + +connection killer; +let $wait_condition = + SELECT COUNT(*) >= 1 FROM information_schema.processlist + WHERE ID = $b_id AND (STATE LIKE '%lock%' OR STATE = 'Updating'); +--source include/wait_condition.inc + +--disable_query_log +eval KILL QUERY $b_id; +--enable_query_log +--echo # KILL QUERY issued against the blocked UPDATE on connection b. + +connection b; +--error ER_QUERY_INTERRUPTED,ER_LOCK_WAIT_TIMEOUT +--reap +ROLLBACK; + +connection a; +COMMIT; + +connection default; +--echo # Row 1 incremented by T1 only. +SELECT * FROM k WHERE id = 1; + +disconnect a; +disconnect b; +disconnect killer; +connection default; +DROP TABLE k; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_reentry.test b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_reentry.test new file mode 100644 index 0000000000000..146593e512324 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_reentry.test @@ -0,0 +1,50 @@ +--source include/have_tidesdb.inc +# +# Re-entry semantics. The lock manager treats a second acquire of the +# same lock by the same trx as a no-op when the requested mode is the +# same or weaker than what the trx already holds (X subsumes S).
The +# test exercises this by running multiple SELECT FOR UPDATE statements +# on the same row inside one transaction and confirms no spurious +# error or wait. +# + +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); + +CREATE TABLE r ( + id INT PRIMARY KEY, + v INT NOT NULL +) ENGINE=TidesDB; + +INSERT INTO r VALUES (1, 100); + +connect (a, localhost, root,,); + +connection a; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; + +# First X acquire on row 1. +SELECT v FROM r WHERE id = 1 FOR UPDATE; + +# Second X acquire on the same row should be a no-op. +SELECT v FROM r WHERE id = 1 FOR UPDATE; + +# An UPDATE that re-resolves to the same PK also re-uses the held X. +UPDATE r SET v = v + 1 WHERE id = 1; + +# A plain SELECT on the same row under REPEATABLE-READ asks for S, +# which is subsumed by the X already held; still a no-op. +SELECT v FROM r WHERE id = 1; + +COMMIT; + +connection default; +--echo # Row 1 incremented exactly once. +SELECT * FROM r WHERE id = 1; + +disconnect a; +connection default; +DROP TABLE r; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_shared.test b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_shared.test new file mode 100644 index 0000000000000..e10b169d3b263 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_shared.test @@ -0,0 +1,190 @@ +--source include/have_tidesdb.inc +# +# Test: pessimistic S/X lock manager +# +# Verifies the four invariants of the two-mode lock manager: +# 1. S / S compatible -- two readers under REPEATABLE-READ on the same +# row both hold S concurrently with no blocking +# 2. X waits for S readers -- an UPDATE blocks while readers hold S, +# then proceeds after every S releases +# 3. Writer fairness -- a new S blocks while an X is queued, so a +# stream of readers can't starve a writer +# 4. 
RC / SNAPSHOT reads take no lock -- a plain SELECT under +# READ-COMMITTED returns immediately even with an X held on the row +# +# pessimistic_locking is ON by default in this engine, no .opt needed. +# + +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); + +CREATE TABLE acct ( + id INT PRIMARY KEY, + bal INT NOT NULL +) ENGINE=TidesDB; + +INSERT INTO acct VALUES (1, 100); + +connect (s1, localhost, root,,); +connect (s2, localhost, root,,); +connect (s3, localhost, root,,); + +--echo # +--echo # TEST 1: S / S compatible under REPEATABLE-READ +--echo # Both s1 and s2 acquire S on the same row, neither blocks. +--echo # + +connection s1; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT bal FROM acct WHERE id = 1; + +connection s2; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT bal FROM acct WHERE id = 1; + +--echo # Both holders of S read successfully -- no deadlock, no block. +connection default; +SELECT bal FROM acct WHERE id = 1; + +connection s1; +COMMIT; +connection s2; +COMMIT; + +--echo # +--echo # TEST 2: X waits for S readers, then proceeds +--echo # s1 + s2 hold S; s3 fires UPDATE that must wait until +--echo # both readers release. +--echo # + +connection s1; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT bal FROM acct WHERE id = 1; + +connection s2; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT bal FROM acct WHERE id = 1; + +connection s3; +BEGIN; +--send UPDATE acct SET bal = bal + 50 WHERE id = 1 + +# Release readers; X should proceed after the second one commits. +connection s1; +COMMIT; +connection s2; +COMMIT; + +connection s3; +--reap +COMMIT; + +connection default; +--echo # 100 + 50 = 150 +SELECT bal FROM acct WHERE id = 1; + +--echo # +--echo # TEST 3: writer fairness -- new S blocks behind a waiting X +--echo # s1 holds S; s2 fires UPDATE (X-waiting); s3 fires a +--echo # SELECT under REPEATABLE-READ that wants S. 
s3 must +--echo # NOT jump ahead of s2's queued X. +--echo # + +UPDATE acct SET bal = 200 WHERE id = 1; + +connection s1; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT bal FROM acct WHERE id = 1; + +# Snapshot the lock-wait counter so we can detect s2 entering cond_wait. +connection default; +--disable_query_log +let $waits_before_s2 = `SELECT VARIABLE_VALUE+0 FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='TIDESDB_LOCK_WAITS'`; +--enable_query_log + +connection s2; +BEGIN; +--send UPDATE acct SET bal = bal + 1 WHERE id = 1 + +# Without this barrier s3's SELECT can reach the lock manager before s2's +# UPDATE does, take S alongside s1, and prevent s2 from ever being +# promoted (writer fairness only applies when the X request is already +# queued). Wait until Tidesdb_lock_waits increments to confirm s2 is +# parked in the wait queue before firing s3. +connection default; +let $wait_condition = + SELECT (VARIABLE_VALUE+0) > $waits_before_s2 + FROM information_schema.GLOBAL_STATUS + WHERE VARIABLE_NAME = 'TIDESDB_LOCK_WAITS'; +--source include/wait_condition.inc + +connection s3; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +--send SELECT bal FROM acct WHERE id = 1 + +connection s1; +COMMIT; + +connection s2; +--reap +COMMIT; + +# The bal value s3 reads is intentionally not recorded. Under +# REPEATABLE-READ the snapshot is taken at first data access, and that +# moment is racy relative to s2's commit. If s3's thread reaches +# external_lock before s2 commits the read returns 200; if it reaches it +# after, the read returns 201. Both are valid REPEATABLE-READ. What +# the test actually proves -- writer fairness, that s3's S grant was +# deferred behind s2's queued X -- is demonstrated by the --send / +# --reap ordering above, not by the value.
+connection s3; +--disable_result_log +--reap +--enable_result_log +COMMIT; + +connection default; +--echo # s2 incremented 200 -> 201; s3 then read either 200 or 201 (both valid) +SELECT bal FROM acct WHERE id = 1; + +--echo # +--echo # TEST 4: READ-COMMITTED reads take no lock +--echo # s1 holds an uncommitted X via UPDATE; s2 under RC reads +--echo # the latest committed value without blocking. +--echo # + +UPDATE acct SET bal = 300 WHERE id = 1; + +connection s1; +BEGIN; +UPDATE acct SET bal = bal + 100 WHERE id = 1; + +connection s2; +SET SESSION transaction_isolation = 'READ-COMMITTED'; +BEGIN; +SELECT bal FROM acct WHERE id = 1; +COMMIT; + +connection s1; +COMMIT; + +connection default; +--echo # 300 + 100 = 400 +SELECT bal FROM acct WHERE id = 1; + +--echo # +--echo # Cleanup +--echo # +disconnect s1; +disconnect s2; +disconnect s3; +connection default; +DROP TABLE acct; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_timeout.test b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_timeout.test new file mode 100644 index 0000000000000..b5b9c79d647e7 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_timeout.test @@ -0,0 +1,60 @@ +--source include/have_tidesdb.inc +# +# Bounded lock-wait. tidesdb_lock_wait_timeout_ms controls how long a +# pessimistic acquire blocks before giving up. This test sets a short +# 300 ms timeout on connection b, has connection a hold X on row 1, +# then issues UPDATE on b and confirms it returns +# ER_LOCK_WAIT_TIMEOUT within the budget rather than waiting forever. +# It also checks that tidesdb_lock_timeouts increments by exactly one. 
+# + +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); + +CREATE TABLE w ( + id INT PRIMARY KEY, + v INT NOT NULL +) ENGINE=TidesDB; + +INSERT INTO w VALUES (1, 100); + +connect (a, localhost, root,,); +connect (b, localhost, root,,); + +connection default; +let $timeouts_before = + `SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS + WHERE VARIABLE_NAME = 'TIDESDB_LOCK_TIMEOUTS'`; + +connection a; +BEGIN; +UPDATE w SET v = v + 1 WHERE id = 1; + +connection b; +SET SESSION tidesdb_lock_wait_timeout_ms = 300; +BEGIN; +--error ER_LOCK_WAIT_TIMEOUT +UPDATE w SET v = v + 1 WHERE id = 1; +ROLLBACK; + +connection a; +COMMIT; + +connection default; +let $timeouts_after = + `SELECT VARIABLE_VALUE FROM information_schema.GLOBAL_STATUS + WHERE VARIABLE_NAME = 'TIDESDB_LOCK_TIMEOUTS'`; + +--disable_query_log +eval SELECT $timeouts_after - $timeouts_before AS timeout_delta; +--enable_query_log + +--echo # Row 1 incremented by T1 only. +SELECT * FROM w WHERE id = 1; + +disconnect a; +disconnect b; +connection default; +DROP TABLE w; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_upgrade.test b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_upgrade.test new file mode 100644 index 0000000000000..67c3c0d0e845b --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_pessimistic_upgrade.test @@ -0,0 +1,68 @@ +--source include/have_tidesdb.inc +# +# S to X upgrade behaviour. The lock manager allows a sole S holder to +# upgrade in place. When another trx also holds S the upgrade cannot +# succeed without blocking on the trx's own S grant, so the manager +# rejects the request with HA_ERR_LOCK_DEADLOCK rather than create a +# self-deadlock. Two scenarios verify both branches. 
+# + +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); + +CREATE TABLE u ( + id INT PRIMARY KEY, + v INT NOT NULL +) ENGINE=TidesDB; + +INSERT INTO u VALUES (1, 100); + +connect (a, localhost, root,,); +connect (b, localhost, root,,); + +--echo # +--echo # Scenario 1, sole holder upgrades cleanly. +--echo # +connection a; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT v FROM u WHERE id = 1; +UPDATE u SET v = v + 10 WHERE id = 1; +COMMIT; + +connection default; +SELECT * FROM u WHERE id = 1; + +--echo # +--echo # Scenario 2, two S holders, one tries to upgrade, must be rejected. +--echo # +connection a; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT v FROM u WHERE id = 1; + +connection b; +SET SESSION transaction_isolation = 'REPEATABLE-READ'; +BEGIN; +SELECT v FROM u WHERE id = 1; + +# a holds S, b holds S. a tries to upgrade to X. Allowed-when-sole +# rule fails, manager returns HA_ERR_LOCK_DEADLOCK. +connection a; +--error ER_LOCK_DEADLOCK +UPDATE u SET v = v + 1 WHERE id = 1; +ROLLBACK; + +connection b; +COMMIT; + +connection default; +--echo # Row 1 unchanged from scenario 2. +SELECT * FROM u WHERE id = 1; + +disconnect a; +disconnect b; +connection default; +DROP TABLE u; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_pk_index.test b/mysql-test/suite/tidesdb/t/tidesdb_pk_index.test new file mode 100644 index 0000000000000..c6551cb9b68dd --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_pk_index.test @@ -0,0 +1,176 @@ +--source include/have_tidesdb.inc +--disable_warnings +DROP TABLE IF EXISTS t_pk, t_autoinc, t_secidx, t_combined; +--enable_warnings + +--replace_regex /\.dll/.so/ + +--echo # +--echo # ============================================ +--echo # TEST 1: PRIMARY KEY - point lookups & range +--echo # ============================================ +--echo # + +CREATE TABLE t_pk ( + id INT NOT NULL PRIMARY KEY, + val VARCHAR(50) +) ENGINE=TIDESDB; + +INSERT INTO t_pk VALUES (10, 'ten'), (20, 'twenty'), (30, 'thirty'); + +--echo # Point lookup by PK +SELECT * FROM t_pk WHERE id = 20; + +--echo # Range scan on PK +SELECT * FROM t_pk WHERE id >= 15 AND id <= 25; + +--echo # Full scan (should still work) +SELECT * FROM t_pk ORDER BY id; + +--echo # UPDATE via PK lookup +UPDATE t_pk SET val = 'TWO-ZERO' WHERE id = 20; +SELECT * FROM t_pk WHERE id = 20; + +--echo # DELETE via PK lookup +DELETE FROM t_pk WHERE id = 10; +SELECT * FROM t_pk ORDER BY id; + +DROP TABLE t_pk; + + +--echo # +--echo # ============================================ +--echo # TEST 2: AUTO_INCREMENT +--echo # ============================================ +--echo # + +CREATE TABLE t_autoinc ( + id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(50) +) ENGINE=TIDESDB; + +INSERT INTO t_autoinc (name) VALUES ('alice'); +INSERT INTO t_autoinc (name) VALUES ('bob'); +INSERT INTO t_autoinc (name) VALUES ('carol'); + +SELECT * FROM t_autoinc ORDER BY id; + +--echo # Explicit id should also work +INSERT INTO t_autoinc (id, name) VALUES (100, 'dave'); +SELECT * FROM t_autoinc WHERE id = 100; + +--echo # Next auto-inc should continue past 100 +INSERT INTO t_autoinc (name) VALUES ('eve'); +SELECT * FROM t_autoinc ORDER BY id; + +DROP TABLE t_autoinc; + + +--echo 
# +--echo # ============================================ +--echo # TEST 3: Secondary index (KEY) +--echo # ============================================ +--echo # + +CREATE TABLE t_secidx ( + id INT NOT NULL PRIMARY KEY, + k INT NOT NULL, + val VARCHAR(50), + KEY k_idx (k) +) ENGINE=TIDESDB; + +INSERT INTO t_secidx VALUES (1, 100, 'a'), (2, 200, 'b'), (3, 100, 'c'), (4, 300, 'd'); + +--echo # Lookup via secondary index +SELECT * FROM t_secidx WHERE k = 100 ORDER BY id; +SELECT * FROM t_secidx WHERE k = 200; + +--echo # Range on secondary index +SELECT * FROM t_secidx WHERE k >= 200 ORDER BY k; + +--echo # UPDATE a row and verify secondary index is maintained +UPDATE t_secidx SET k = 999 WHERE id = 2; +SELECT * FROM t_secidx WHERE k = 200; +SELECT * FROM t_secidx WHERE k = 999; + +--echo # DELETE and verify index entry removed +DELETE FROM t_secidx WHERE id = 3; +SELECT * FROM t_secidx WHERE k = 100 ORDER BY id; + +DROP TABLE t_secidx; + + +--echo # +--echo # ============================================ +--echo # TEST 4: Combined PK + AUTO_INCREMENT + secondary index +--echo # (sysbench-like schema) +--echo # ============================================ +--echo # + +CREATE TABLE t_combined ( + id INT NOT NULL AUTO_INCREMENT, + k INT NOT NULL DEFAULT 0, + c CHAR(120) NOT NULL DEFAULT '', + pad CHAR(60) NOT NULL DEFAULT '', + PRIMARY KEY (id), + KEY k_1 (k) +) ENGINE=TIDESDB; + +--echo # Insert rows (sysbench-style) +INSERT INTO t_combined (k, c, pad) VALUES + (1, REPEAT('a', 120), REPEAT('x', 60)), + (2, REPEAT('b', 120), REPEAT('y', 60)), + (3, REPEAT('c', 120), REPEAT('z', 60)), + (1, REPEAT('d', 120), REPEAT('w', 60)); + +SELECT id, k, LENGTH(c) AS c_len, LENGTH(pad) AS pad_len FROM t_combined ORDER BY id; + +--echo # Point select by PK (sysbench oltp_point_select) +SELECT id, k FROM t_combined WHERE id = 2; + +--echo # Range select by PK +SELECT id, k FROM t_combined WHERE id BETWEEN 2 AND 3 ORDER BY id; + +--echo # Lookup via secondary index +SELECT id, k FROM 
t_combined WHERE k = 1 ORDER BY id; + +--echo # Update indexed column (sysbench oltp_update_index) +UPDATE t_combined SET k = k + 1 WHERE id = 1; +SELECT id, k FROM t_combined WHERE id = 1; + +--echo # Verify old index entry gone, new one present +SELECT id, k FROM t_combined WHERE k = 1 ORDER BY id; +SELECT id, k FROM t_combined WHERE k = 2 ORDER BY id; + +--echo # Delete +DELETE FROM t_combined WHERE id = 3; +SELECT COUNT(*) AS cnt FROM t_combined; + +--echo # TRUNCATE +TRUNCATE TABLE t_combined; +SELECT COUNT(*) AS cnt FROM t_combined; + +DROP TABLE t_combined; + + +--echo # +--echo # ============================================ +--echo # TEST 5: BIGINT PRIMARY KEY +--echo # ============================================ +--echo # + +CREATE TABLE t_bigpk ( + id BIGINT NOT NULL PRIMARY KEY, + val VARCHAR(20) +) ENGINE=TIDESDB; + +INSERT INTO t_bigpk VALUES (9223372036854775806, 'near_max'); +INSERT INTO t_bigpk VALUES (1, 'one'); +INSERT INTO t_bigpk VALUES (9223372036854775807, 'max'); + +SELECT * FROM t_bigpk ORDER BY id; +SELECT * FROM t_bigpk WHERE id = 9223372036854775807; + +DROP TABLE t_bigpk; +--source suite/tidesdb/include/cleanup_tidesdb.inc + diff --git a/mysql-test/suite/tidesdb/t/tidesdb_rename.test b/mysql-test/suite/tidesdb/t/tidesdb_rename.test new file mode 100644 index 0000000000000..27ed8a29d0842 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_rename.test @@ -0,0 +1,195 @@ +--source include/have_tidesdb.inc +# +# Test suite for TIDESDB rename_table functionality. +# Covers: RENAME TABLE, ALTER TABLE (table copy), ALTER TABLE with option changes, +# and secondary index preservation across renames. 
+# + +--echo # +--echo # === Setup: install the TIDESDB engine plugin === +--echo # +--replace_regex /\.dll/.so/ + +--echo # +--echo # ============================================ +--echo # TEST 1: Basic RENAME TABLE +--echo # ============================================ +--echo # + +CREATE TABLE t_orig (id INT PRIMARY KEY, val VARCHAR(50)) ENGINE=TIDESDB; +INSERT INTO t_orig VALUES (1, 'alpha'), (2, 'beta'), (3, 'gamma'); +SELECT * FROM t_orig ORDER BY id; + +RENAME TABLE t_orig TO t_renamed; + +# Old name should not exist +--error ER_NO_SUCH_TABLE +SELECT * FROM t_orig; + +# New name should have all data +SELECT * FROM t_renamed ORDER BY id; + +# DML on renamed table should work +INSERT INTO t_renamed VALUES (4, 'delta'); +UPDATE t_renamed SET val = 'BETA' WHERE id = 2; +DELETE FROM t_renamed WHERE id = 3; +SELECT * FROM t_renamed ORDER BY id; + +DROP TABLE t_renamed; + +--echo # +--echo # ============================================ +--echo # TEST 2: RENAME TABLE with secondary index +--echo # ============================================ +--echo # + +CREATE TABLE t_idx ( + id INT PRIMARY KEY, + name VARCHAR(50) NOT NULL, + KEY idx_name (name) +) ENGINE=TIDESDB; + +INSERT INTO t_idx VALUES (1, 'alice'), (2, 'bob'), (3, 'charlie'), (4, 'alice'); + +# Verify index scan works before rename +SELECT id, name FROM t_idx WHERE name = 'alice' ORDER BY id; + +RENAME TABLE t_idx TO t_idx_new; + +# Index scan should still work after rename +SELECT id, name FROM t_idx_new WHERE name = 'alice' ORDER BY id; +SELECT id, name FROM t_idx_new WHERE name = 'bob'; + +# Insert + index lookup on renamed table +INSERT INTO t_idx_new VALUES (5, 'bob'); +SELECT id, name FROM t_idx_new WHERE name = 'bob' ORDER BY id; + +DROP TABLE t_idx_new; + +--echo # +--echo # ============================================ +--echo # TEST 3: ALTER TABLE changes table options +--echo # ============================================ +--echo # + +CREATE TABLE t_alter (id INT PRIMARY KEY, val VARCHAR(100)) 
ENGINE=TIDESDB; +INSERT INTO t_alter VALUES (1, 'before'), (2, 'alter'), (3, 'table'); +SELECT * FROM t_alter ORDER BY id; + +SHOW CREATE TABLE t_alter; + +# ALTER TABLE to change SYNC_MODE (triggers full table copy + rename) +ALTER TABLE t_alter SYNC_MODE='NONE'; +SHOW CREATE TABLE t_alter; + +# Data must survive the ALTER +SELECT * FROM t_alter ORDER BY id; + +# DML must work after ALTER +INSERT INTO t_alter VALUES (4, 'after_alter'); +UPDATE t_alter SET val = 'ALTERED' WHERE id = 2; +DELETE FROM t_alter WHERE id = 1; +SELECT * FROM t_alter ORDER BY id; + +DROP TABLE t_alter; + +--echo # +--echo # ============================================ +--echo # TEST 4: ALTER TABLE ADD COLUMN (schema change) +--echo # ============================================ +--echo # + +CREATE TABLE t_schema (id INT PRIMARY KEY, val VARCHAR(50)) ENGINE=TIDESDB; +INSERT INTO t_schema VALUES (1, 'one'), (2, 'two'); + +ALTER TABLE t_schema ADD COLUMN extra INT DEFAULT 0; +SHOW CREATE TABLE t_schema; + +# Existing rows should have default for new column +SELECT * FROM t_schema ORDER BY id; + +# New inserts should use all columns +INSERT INTO t_schema VALUES (3, 'three', 99); +SELECT * FROM t_schema ORDER BY id; + +DROP TABLE t_schema; + +--echo # +--echo # ============================================ +--echo # TEST 5: ALTER TABLE with secondary index +--echo # ============================================ +--echo # + +CREATE TABLE t_altidx ( + id INT PRIMARY KEY, + name VARCHAR(50) NOT NULL, + KEY idx_name (name) +) ENGINE=TIDESDB; + +INSERT INTO t_altidx VALUES (1, 'alice'), (2, 'bob'), (3, 'charlie'); + +# Index works before ALTER +SELECT id FROM t_altidx WHERE name = 'bob'; + +# ALTER TABLE option change (full copy with index rebuild) +ALTER TABLE t_altidx SYNC_MODE='NONE'; + +# Index must still work after ALTER +SELECT id FROM t_altidx WHERE name = 'bob'; +SELECT id FROM t_altidx WHERE name = 'alice'; + +# Full scan still works +SELECT * FROM t_altidx ORDER BY id; + +# DML + index after 
ALTER +INSERT INTO t_altidx VALUES (4, 'alice'); +SELECT id FROM t_altidx WHERE name = 'alice' ORDER BY id; + +DROP TABLE t_altidx; + +--echo # +--echo # ============================================ +--echo # TEST 6: Double rename +--echo # ============================================ +--echo # + +CREATE TABLE t_a (id INT PRIMARY KEY, val INT) ENGINE=TIDESDB; +INSERT INTO t_a VALUES (1, 10), (2, 20); + +RENAME TABLE t_a TO t_b; +SELECT * FROM t_b ORDER BY id; + +RENAME TABLE t_b TO t_c; +SELECT * FROM t_c ORDER BY id; + +# Original names should not exist +--error ER_NO_SUCH_TABLE +SELECT * FROM t_a; +--error ER_NO_SUCH_TABLE +SELECT * FROM t_b; + +DROP TABLE t_c; + +--echo # +--echo # ============================================ +--echo # TEST 7: ALTER TABLE without explicit PK (hidden PK) +--echo # ============================================ +--echo # + +CREATE TABLE t_nopk (val VARCHAR(50)) ENGINE=TIDESDB; +INSERT INTO t_nopk VALUES ('row1'), ('row2'), ('row3'); +SELECT * FROM t_nopk; + +ALTER TABLE t_nopk SYNC_MODE='NONE'; +SELECT * FROM t_nopk; + +INSERT INTO t_nopk VALUES ('row4'); +SELECT * FROM t_nopk; + +DROP TABLE t_nopk; + +--echo # +--echo # + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_replace_iodku.test b/mysql-test/suite/tidesdb/t/tidesdb_replace_iodku.test new file mode 100644 index 0000000000000..8ed8c785673e0 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_replace_iodku.test @@ -0,0 +1,178 @@ +--source include/have_tidesdb.inc +# +# Tests for REPLACE INTO and INSERT ON DUPLICATE KEY UPDATE. +# These exercise the dup_ref / HA_ERR_FOUND_DUPP_KEY path in the handler. 
+# + +--echo # +--echo # ============================================ +--echo # TEST 1: REPLACE INTO - PK only table +--echo # ============================================ +--echo # + +CREATE TABLE t_rep ( + id INT NOT NULL PRIMARY KEY, + val VARCHAR(50) +) ENGINE=TIDESDB; + +INSERT INTO t_rep VALUES (1, 'one'), (2, 'two'), (3, 'three'); +SELECT * FROM t_rep ORDER BY id; + +--echo # REPLACE existing row (id=2) +REPLACE INTO t_rep VALUES (2, 'TWO-replaced'); +SELECT * FROM t_rep ORDER BY id; + +--echo # REPLACE non-existing row (id=4) +REPLACE INTO t_rep VALUES (4, 'four-new'); +SELECT * FROM t_rep ORDER BY id; + +--echo # REPLACE multiple rows at once +REPLACE INTO t_rep VALUES (1, 'ONE-replaced'), (3, 'THREE-replaced'), (5, 'five-new'); +SELECT * FROM t_rep ORDER BY id; + +DROP TABLE t_rep; + + +--echo # +--echo # ============================================ +--echo # TEST 2: REPLACE INTO - PK + secondary index +--echo # (verifies old secondary index entries are +--echo # properly cleaned up) +--echo # ============================================ +--echo # + +CREATE TABLE t_rep_idx ( + id INT NOT NULL PRIMARY KEY, + k INT NOT NULL, + val VARCHAR(50), + KEY k_idx (k) +) ENGINE=TIDESDB; + +INSERT INTO t_rep_idx VALUES (1, 100, 'a'), (2, 200, 'b'), (3, 100, 'c'); + +--echo # Before REPLACE: k=100 has 2 rows +SELECT * FROM t_rep_idx WHERE k = 100 ORDER BY id; + +--echo # REPLACE id=1, changing k from 100 to 999 +REPLACE INTO t_rep_idx VALUES (1, 999, 'a-replaced'); +SELECT * FROM t_rep_idx ORDER BY id; + +--echo # After REPLACE: k=100 should have only 1 row (id=3) +SELECT * FROM t_rep_idx WHERE k = 100 ORDER BY id; +--echo # k=999 should have 1 row (id=1) +SELECT * FROM t_rep_idx WHERE k = 999; + +--echo # REPLACE id=3, keeping k=100 +REPLACE INTO t_rep_idx VALUES (3, 100, 'c-replaced'); +SELECT * FROM t_rep_idx WHERE k = 100 ORDER BY id; + +DROP TABLE t_rep_idx; + + +--echo # +--echo # ============================================ +--echo # TEST 3: INSERT ON DUPLICATE 
KEY UPDATE - PK +--echo # ============================================ +--echo # + +CREATE TABLE t_iodku ( + id INT NOT NULL PRIMARY KEY, + val INT NOT NULL DEFAULT 0 +) ENGINE=TIDESDB; + +INSERT INTO t_iodku VALUES (1, 100), (2, 200), (3, 300); +SELECT * FROM t_iodku ORDER BY id; + +--echo # IODKU: duplicate on id=2 => update val +INSERT INTO t_iodku VALUES (2, 0) ON DUPLICATE KEY UPDATE val = val + 1; +SELECT * FROM t_iodku ORDER BY id; + +--echo # IODKU: no duplicate on id=4 => insert +INSERT INTO t_iodku VALUES (4, 400) ON DUPLICATE KEY UPDATE val = val + 1; +SELECT * FROM t_iodku ORDER BY id; + +--echo # IODKU: multiple rows (some dups, some new) +INSERT INTO t_iodku VALUES (1, 0), (5, 500), (3, 0) + ON DUPLICATE KEY UPDATE val = val + 10; +SELECT * FROM t_iodku ORDER BY id; + +DROP TABLE t_iodku; + + +--echo # +--echo # ============================================ +--echo # TEST 4: IODKU with secondary index +--echo # ============================================ +--echo # + +CREATE TABLE t_iodku_idx ( + id INT NOT NULL PRIMARY KEY, + k INT NOT NULL, + val VARCHAR(50), + KEY k_idx (k) +) ENGINE=TIDESDB; + +INSERT INTO t_iodku_idx VALUES (1, 10, 'orig-1'), (2, 20, 'orig-2'); + +--echo # IODKU duplicate on PK, changes indexed column k +INSERT INTO t_iodku_idx VALUES (1, 99, 'new-1') + ON DUPLICATE KEY UPDATE k = VALUES(k), val = VALUES(val); +SELECT * FROM t_iodku_idx ORDER BY id; +--echo # Old k=10 should be gone, k=99 should have id=1 +SELECT * FROM t_iodku_idx WHERE k = 10; +SELECT * FROM t_iodku_idx WHERE k = 99; + +DROP TABLE t_iodku_idx; + + +--echo # +--echo # ============================================ +--echo # TEST 5: IODKU with unique secondary index +--echo # ============================================ +--echo # + +CREATE TABLE t_iodku_uniq ( + id INT NOT NULL PRIMARY KEY, + email VARCHAR(100) NOT NULL, + cnt INT NOT NULL DEFAULT 0, + UNIQUE KEY uk_email (email) +) ENGINE=TIDESDB; + +INSERT INTO t_iodku_uniq VALUES (1, 'alice@test.com', 1); +INSERT 
INTO t_iodku_uniq VALUES (2, 'bob@test.com', 1); + +--echo # IODKU conflict on unique secondary index (email) +INSERT INTO t_iodku_uniq VALUES (3, 'alice@test.com', 1) + ON DUPLICATE KEY UPDATE cnt = cnt + 1; +SELECT * FROM t_iodku_uniq ORDER BY id; + +DROP TABLE t_iodku_uniq; + + +--echo # +--echo # ============================================ +--echo # TEST 6: REPLACE with AUTO_INCREMENT +--echo # ============================================ +--echo # + +CREATE TABLE t_rep_auto ( + id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, + val VARCHAR(50) +) ENGINE=TIDESDB; + +INSERT INTO t_rep_auto (val) VALUES ('first'), ('second'), ('third'); +SELECT * FROM t_rep_auto ORDER BY id; + +REPLACE INTO t_rep_auto VALUES (2, 'second-replaced'); +SELECT * FROM t_rep_auto ORDER BY id; + +--echo # Next auto_inc should be > 3 +INSERT INTO t_rep_auto (val) VALUES ('fourth'); +SELECT * FROM t_rep_auto ORDER BY id; + +DROP TABLE t_rep_auto; + + +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_savepoint.opt b/mysql-test/suite/tidesdb/t/tidesdb_savepoint.opt new file mode 100644 index 0000000000000..314429e22d2af --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_savepoint.opt @@ -0,0 +1 @@ +--loose-tidesdb-savepoint-test=1 diff --git a/mysql-test/suite/tidesdb/t/tidesdb_savepoint.test b/mysql-test/suite/tidesdb/t/tidesdb_savepoint.test new file mode 100644 index 0000000000000..bf9962a2bbf00 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_savepoint.test @@ -0,0 +1,31 @@ +--source include/have_tidesdb.inc +--source include/not_embedded.inc + +--echo # +--echo # ============================================ +--echo # TEST: SQL SAVEPOINT support +--echo # ============================================ +--echo # + +CREATE TABLE t_sp ( + id INT PRIMARY KEY, + v INT +) ENGINE=TIDESDB; + +--echo # SAVEPOINT should work inside an explicit transaction +START TRANSACTION; +INSERT INTO t_sp VALUES (1, 10); +SAVEPOINT a; +INSERT INTO t_sp VALUES (2, 20); +ROLLBACK TO SAVEPOINT a; +INSERT INTO t_sp VALUES (3, 30); +RELEASE SAVEPOINT a; +COMMIT; + +SELECT * FROM t_sp ORDER BY id; + +DROP TABLE t_sp; + +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_single_delete.test b/mysql-test/suite/tidesdb/t/tidesdb_single_delete.test new file mode 100644 index 0000000000000..5a20ce0fe12bf --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_single_delete.test @@ -0,0 +1,200 @@ +--source include/have_tidesdb.inc +# +# Test coverage for the tidesdb_single_delete_primary session variable +# and the unconditional single-delete semantics on secondary-index CFs. +# +# The single-delete contract is "at most one put between single-deletes +# on the same key". Secondary-index CFs satisfy that by construction +# for every (col_values, pk) composite and use single-delete on every +# delete path automatically. 
The primary CF only satisfies the +# contract when the session does no UPDATE on non-PK columns and no +# REPLACE INTO / INSERT ... ON DUPLICATE KEY UPDATE overwrite path on +# tables without secondary indexes -- the session variable is the +# caller's explicit promise. +# + +--echo # +--echo # === sysvar: default is OFF === +--echo # + +SHOW VARIABLES LIKE 'tidesdb_single_delete_primary'; +SELECT @@SESSION.tidesdb_single_delete_primary; + +--echo # +--echo # === Secondary-index single-delete is always on (no flag needed). === +--echo # Reads must remain correct across INSERT, SELECT, UPDATE, DELETE on a +--echo # table with multiple secondary indexes. This exercises update_row's +--echo # old-entry delete path and delete_row's secondary-index dispatch loop. +--echo # + +CREATE TABLE t_sec ( + pk BIGINT PRIMARY KEY, + c0 INT, + c1 INT, + c2 INT, + KEY k0 (c0), + KEY k1 (c1), + KEY k2 (c2) +) ENGINE=TIDESDB; + +INSERT INTO t_sec VALUES (1,10,100,1000),(2,20,200,2000),(3,30,300,3000); + +SELECT * FROM t_sec ORDER BY pk; +SELECT pk FROM t_sec WHERE c0 = 20; +SELECT pk FROM t_sec WHERE c1 = 300; +SELECT pk FROM t_sec WHERE c2 = 1000; + +UPDATE t_sec SET c0 = 11, c1 = 111 WHERE pk = 1; + +SELECT * FROM t_sec ORDER BY pk; +SELECT pk FROM t_sec WHERE c0 = 10; +SELECT pk FROM t_sec WHERE c0 = 11; +SELECT pk FROM t_sec WHERE c1 = 100; +SELECT pk FROM t_sec WHERE c1 = 111; + +DELETE FROM t_sec WHERE pk = 2; + +SELECT * FROM t_sec ORDER BY pk; +SELECT pk FROM t_sec WHERE c0 = 20; +SELECT pk FROM t_sec WHERE c1 = 200; + +DELETE FROM t_sec; +SELECT COUNT(*) FROM t_sec; + +--echo # +--echo # REPLACE INTO on a table with secondary indexes: the server routes +--echo # through delete_row + write_row, so each specific (col_vals, pk) is +--echo # still put-once-delete-once. Secondary-index single-delete stays +--echo # safe. 
+--echo # + +INSERT INTO t_sec VALUES (5,50,500,5000); +REPLACE INTO t_sec VALUES (5,55,555,5555); +SELECT * FROM t_sec WHERE pk = 5; +SELECT pk FROM t_sec WHERE c0 = 50; +SELECT pk FROM t_sec WHERE c0 = 55; + +DROP TABLE t_sec; + +--echo # +--echo # === Primary-CF single-delete under the sysvar: insert-then-delete. === +--echo # The contract holds because we only INSERT and DELETE -- no UPDATE, +--echo # no REPLACE. Reads must agree with the non-sysvar baseline. +--echo # + +SET SESSION tidesdb_single_delete_primary = 1; +SELECT @@SESSION.tidesdb_single_delete_primary; + +CREATE TABLE t_pri ( + pk BIGINT PRIMARY KEY, + v VARCHAR(32) +) ENGINE=TIDESDB; + +INSERT INTO t_pri VALUES (1,'a'),(2,'b'),(3,'c'),(4,'d'),(5,'e'); +SELECT * FROM t_pri ORDER BY pk; + +DELETE FROM t_pri WHERE pk IN (2,4); +SELECT * FROM t_pri ORDER BY pk; + +DELETE FROM t_pri; +SELECT COUNT(*) FROM t_pri; + +--echo # +--echo # Insert a fresh batch, delete every row, read nothing back. This +--echo # matches the iibench-shaped workload. +--echo # + +INSERT INTO t_pri VALUES (10,'x'),(20,'y'),(30,'z'),(40,'w'),(50,'v'); +SELECT COUNT(*) FROM t_pri; +DELETE FROM t_pri; +SELECT COUNT(*) FROM t_pri; + +DROP TABLE t_pri; + +--echo # +--echo # === Primary-CF single-delete with secondary indexes present. === +--echo # Secondary-index SD is already unconditional; primary-CF SD is gated +--echo # on the sysvar. Together they cover all four CFs per delete on +--echo # Mark's num_secondary_indexes=3 table shape. 
+--echo # + +CREATE TABLE t_mark ( + transactionid BIGINT PRIMARY KEY, + c0 INT, + c1 INT, + c2 INT, + KEY (c0), + KEY (c1), + KEY (c2) +) ENGINE=TIDESDB; + +INSERT INTO t_mark VALUES (1,10,100,1000),(2,20,200,2000),(3,30,300,3000), + (4,40,400,4000),(5,50,500,5000); +SELECT COUNT(*) FROM t_mark; +SELECT transactionid FROM t_mark WHERE c1 = 300; + +DELETE FROM t_mark WHERE transactionid >= 2 ORDER BY transactionid ASC LIMIT 2; +SELECT transactionid FROM t_mark ORDER BY transactionid; +SELECT transactionid FROM t_mark WHERE c0 = 20; +SELECT transactionid FROM t_mark WHERE c2 = 3000; + +DELETE FROM t_mark; +SELECT COUNT(*) FROM t_mark; + +DROP TABLE t_mark; + +SET SESSION tidesdb_single_delete_primary = 0; + +--echo # +--echo # === Sysvar OFF across UPDATE + REPLACE paths (safety baseline). === +--echo # Any workload that uses UPDATE non-PK / REPLACE INTO on no-secondary +--echo # tables must stay correct with the sysvar OFF, because primary-CF SD +--echo # is unsafe under those patterns. Secondary-index SD is independent +--echo # of the sysvar. +--echo # + +CREATE TABLE t_upd ( + pk BIGINT PRIMARY KEY, + c0 INT, + KEY (c0) +) ENGINE=TIDESDB; + +INSERT INTO t_upd VALUES (1,100),(2,200),(3,300); + +UPDATE t_upd SET c0 = 999 WHERE pk = 2; +SELECT * FROM t_upd ORDER BY pk; +SELECT pk FROM t_upd WHERE c0 = 200; +SELECT pk FROM t_upd WHERE c0 = 999; + +DELETE FROM t_upd WHERE pk = 2; +SELECT * FROM t_upd ORDER BY pk; +SELECT pk FROM t_upd WHERE c0 = 999; + +DROP TABLE t_upd; + +--echo # +--echo # REPLACE INTO on a no-secondary table follows the line-5143 "overwrite +--echo # silently" fast path. With sysvar OFF (default), subsequent DELETEs +--echo # remain correct because the regular tombstone is used. 
+--echo # + +CREATE TABLE t_rep ( + pk BIGINT PRIMARY KEY, + v VARCHAR(32) +) ENGINE=TIDESDB; + +INSERT INTO t_rep VALUES (1,'first'); +REPLACE INTO t_rep VALUES (1,'second'); +SELECT * FROM t_rep; + +DELETE FROM t_rep WHERE pk = 1; +SELECT COUNT(*) FROM t_rep; +SELECT * FROM t_rep; + +INSERT INTO t_rep VALUES (1,'third'); +SELECT * FROM t_rep; + +DROP TABLE t_rep; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_spatial.test b/mysql-test/suite/tidesdb/t/tidesdb_spatial.test new file mode 100644 index 0000000000000..b6307c573f987 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_spatial.test @@ -0,0 +1,103 @@ +--source include/have_tidesdb.inc +# +# Test: Spatial indexes (Hilbert curve, MBR predicates) +# +# Covers: +# 1. CREATE TABLE with SPATIAL INDEX +# 2. INSERT POINT geometries +# 3. MBRIntersects query +# 4. MBRContains query +# 5. MBRWithin query +# 6. UPDATE geometry column +# 7. DELETE geometry row +# + +--echo # +--echo # Setup +--echo # + +CREATE TABLE places ( + id INT NOT NULL PRIMARY KEY, + name VARCHAR(100), + loc GEOMETRY NOT NULL, + SPATIAL INDEX (loc) +) ENGINE=TidesDB; + +INSERT INTO places VALUES (1, 'NYC', ST_GeomFromText('POINT(40.7128 -74.0060)')); +INSERT INTO places VALUES (2, 'LA', ST_GeomFromText('POINT(34.0522 -118.2437)')); +INSERT INTO places VALUES (3, 'Chicago', ST_GeomFromText('POINT(41.8781 -87.6298)')); +INSERT INTO places VALUES (4, 'Houston', ST_GeomFromText('POINT(29.7604 -95.3698)')); +INSERT INTO places VALUES (5, 'Phoenix', ST_GeomFromText('POINT(33.4484 -112.074)')); + +--echo # +--echo # TEST 1: MBRIntersects - find cities near northeast US +--echo # + +SELECT name FROM places +WHERE MBRIntersects(loc, + ST_GeomFromText('POLYGON((39 -76, 43 -76, 43 -72, 39 -72, 39 -76))')) +ORDER BY name; + +--echo # +--echo # TEST 2: MBRContains - all cities within big US box +--echo # + +SELECT name FROM places +WHERE MBRContains( + ST_GeomFromText('POLYGON((25 
-125, 45 -125, 45 -70, 25 -70, 25 -125))'), + loc) +ORDER BY name; + +--echo # +--echo # TEST 3: MBRWithin - same as above using MBRWithin +--echo # + +SELECT name FROM places +WHERE MBRWithin(loc, + ST_GeomFromText('POLYGON((25 -125, 45 -125, 45 -70, 25 -70, 25 -125))')) +ORDER BY name; + +--echo # +--echo # TEST 4: UPDATE geometry and verify search +--echo # + +UPDATE places SET loc = ST_GeomFromText('POINT(40.0 -74.5)') WHERE id = 1; + +SELECT name FROM places +WHERE MBRIntersects(loc, + ST_GeomFromText('POLYGON((39 -76, 43 -76, 43 -72, 39 -72, 39 -76))')) +ORDER BY name; + +--echo # +--echo # TEST 5: DELETE and verify search +--echo # + +DELETE FROM places WHERE id = 1; + +SELECT name FROM places +WHERE MBRIntersects(loc, + ST_GeomFromText('POLYGON((39 -76, 43 -76, 43 -72, 39 -72, 39 -76))')) +ORDER BY name; + +--echo # +--echo # TEST 6: Simple point-in-box +--echo # + +DROP TABLE places; +CREATE TABLE pts (id INT PRIMARY KEY, g GEOMETRY NOT NULL, SPATIAL INDEX(g)) ENGINE=TidesDB; +INSERT INTO pts VALUES (1, ST_GeomFromText('POINT(10 20)')); +INSERT INTO pts VALUES (2, ST_GeomFromText('POINT(30 40)')); +INSERT INTO pts VALUES (3, ST_GeomFromText('POINT(50 60)')); + +SELECT id FROM pts +WHERE MBRWithin(g, ST_GeomFromText('POLYGON((5 15, 35 15, 35 45, 5 45, 5 15))')) +ORDER BY id; + +--echo # +--echo # Cleanup +--echo # + +DROP TABLE pts; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_sql.test b/mysql-test/suite/tidesdb/t/tidesdb_sql.test new file mode 100644 index 0000000000000..f2d08efb0c1bc --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_sql.test @@ -0,0 +1,630 @@ +--source include/have_tidesdb.inc +# +# Comprehensive SQL coverage test for the TIDESDB storage engine. +# Exercises aggregates, joins, subqueries, GROUP BY, HAVING, UNION, +# derived tables, INSERT...SELECT, multi-table ops, transactions, etc. 
+# + +--echo # +--echo # ============================================ +--echo # SETUP: Create and populate test tables +--echo # ============================================ +--echo # + +CREATE TABLE departments ( + dept_id INT PRIMARY KEY, + dept_name VARCHAR(50) NOT NULL +) ENGINE=TIDESDB; + +CREATE TABLE employees ( + emp_id INT PRIMARY KEY, + name VARCHAR(100) NOT NULL, + dept_id INT NOT NULL, + salary DECIMAL(10,2) NOT NULL, + hire_date DATE NOT NULL, + KEY idx_dept (dept_id), + KEY idx_salary (salary) +) ENGINE=TIDESDB; + +CREATE TABLE projects ( + proj_id INT PRIMARY KEY, + proj_name VARCHAR(100) NOT NULL, + dept_id INT NOT NULL, + budget DECIMAL(12,2) NOT NULL, + KEY idx_proj_dept (dept_id) +) ENGINE=TIDESDB; + +CREATE TABLE emp_projects ( + emp_id INT NOT NULL, + proj_id INT NOT NULL, + hours INT NOT NULL, + PRIMARY KEY (emp_id, proj_id) +) ENGINE=TIDESDB; + +INSERT INTO departments VALUES + (1, 'Engineering'), + (2, 'Marketing'), + (3, 'Finance'), + (4, 'HR'); + +INSERT INTO employees VALUES + (1, 'Alice', 1, 95000.00, '2020-01-15'), + (2, 'Bob', 1, 88000.00, '2019-06-01'), + (3, 'Carol', 2, 72000.00, '2021-03-10'), + (4, 'Dave', 2, 68000.00, '2022-07-20'), + (5, 'Eve', 3, 105000.00, '2018-11-05'), + (6, 'Frank', 3, 92000.00, '2020-09-12'), + (7, 'Grace', 1, 78000.00, '2023-01-08'), + (8, 'Hank', 4, 65000.00, '2021-05-25'), + (9, 'Ivy', 2, 71000.00, '2020-12-01'), + (10, 'Jack', 3, 85000.00, '2022-02-14'); + +INSERT INTO projects VALUES + (100, 'Project Alpha', 1, 500000.00), + (101, 'Project Beta', 1, 300000.00), + (102, 'Campaign X', 2, 150000.00), + (103, 'Audit 2024', 3, 200000.00), + (104, 'Onboarding', 4, 50000.00); + +INSERT INTO emp_projects VALUES + (1, 100, 40), (1, 101, 20), + (2, 100, 35), (2, 101, 25), + (3, 102, 45), + (4, 102, 30), + (5, 103, 50), + (6, 103, 25), + (7, 100, 15), (7, 101, 30), + (8, 104, 40), + (9, 102, 20), + (10, 103, 35); + +--echo # +--echo # ============================================ +--echo # TEST 1: Basic aggregate 
functions +--echo # ============================================ +--echo # + +SELECT COUNT(*) AS total_employees FROM employees; +SELECT SUM(salary) AS total_salary FROM employees; +SELECT AVG(salary) AS avg_salary FROM employees; +SELECT MIN(salary) AS min_salary, MAX(salary) AS max_salary FROM employees; +SELECT MIN(hire_date) AS earliest_hire, MAX(hire_date) AS latest_hire FROM employees; + +--echo # +--echo # ============================================ +--echo # TEST 2: GROUP BY +--echo # ============================================ +--echo # + +SELECT dept_id, COUNT(*) AS cnt, SUM(salary) AS total_sal +FROM employees +GROUP BY dept_id +ORDER BY dept_id; + +SELECT dept_id, AVG(salary) AS avg_sal, MIN(salary) AS min_sal, MAX(salary) AS max_sal +FROM employees +GROUP BY dept_id +ORDER BY dept_id; + +--echo # +--echo # ============================================ +--echo # TEST 3: GROUP BY with HAVING +--echo # ============================================ +--echo # + +SELECT dept_id, COUNT(*) AS cnt +FROM employees +GROUP BY dept_id +HAVING cnt >= 3 +ORDER BY dept_id; + +SELECT dept_id, AVG(salary) AS avg_sal +FROM employees +GROUP BY dept_id +HAVING avg_sal > 80000 +ORDER BY dept_id; + +--echo # +--echo # ============================================ +--echo # TEST 4: INNER JOIN +--echo # ============================================ +--echo # + +SELECT e.name, d.dept_name, e.salary +FROM employees e +INNER JOIN departments d ON e.dept_id = d.dept_id +ORDER BY e.emp_id; + +--echo # +--echo # ============================================ +--echo # TEST 5: LEFT JOIN +--echo # ============================================ +--echo # + +SELECT d.dept_name, e.name +FROM departments d +LEFT JOIN employees e ON d.dept_id = e.dept_id AND e.salary > 90000 +ORDER BY d.dept_id, e.emp_id; + +--echo # +--echo # ============================================ +--echo # TEST 6: RIGHT JOIN +--echo # ============================================ +--echo # + +SELECT e.name, d.dept_name 
+FROM departments d +RIGHT JOIN employees e ON d.dept_id = e.dept_id +ORDER BY e.emp_id; + +--echo # +--echo # ============================================ +--echo # TEST 7: CROSS JOIN +--echo # ============================================ +--echo # + +SELECT d.dept_name, p.proj_name +FROM departments d +CROSS JOIN projects p +WHERE d.dept_id = p.dept_id +ORDER BY d.dept_id, p.proj_id; + +--echo # +--echo # ============================================ +--echo # TEST 8: Multi-table JOIN (3 tables) +--echo # ============================================ +--echo # + +SELECT e.name, d.dept_name, p.proj_name, ep.hours +FROM employees e +JOIN departments d ON e.dept_id = d.dept_id +JOIN emp_projects ep ON e.emp_id = ep.emp_id +JOIN projects p ON ep.proj_id = p.proj_id +ORDER BY e.emp_id, p.proj_id; + +--echo # +--echo # ============================================ +--echo # TEST 9: JOIN with aggregation +--echo # ============================================ +--echo # + +SELECT d.dept_name, COUNT(e.emp_id) AS headcount, SUM(e.salary) AS total_sal +FROM departments d +LEFT JOIN employees e ON d.dept_id = e.dept_id +GROUP BY d.dept_id, d.dept_name +ORDER BY d.dept_id; + +--echo # +--echo # ============================================ +--echo # TEST 10: Scalar subquery +--echo # ============================================ +--echo # + +SELECT name, salary, + salary - (SELECT AVG(salary) FROM employees) AS diff_from_avg +FROM employees +ORDER BY emp_id; + +--echo # +--echo # ============================================ +--echo # TEST 11: IN subquery +--echo # ============================================ +--echo # + +SELECT name, salary +FROM employees +WHERE dept_id IN (SELECT dept_id FROM departments WHERE dept_name IN ('Engineering', 'Finance')) +ORDER BY emp_id; + +--echo # +--echo # ============================================ +--echo # TEST 12: EXISTS subquery +--echo # ============================================ +--echo # + +SELECT d.dept_name +FROM departments d +WHERE 
EXISTS (SELECT 1 FROM employees e WHERE e.dept_id = d.dept_id AND e.salary > 90000) +ORDER BY d.dept_id; + +--echo # +--echo # ============================================ +--echo # TEST 13: NOT EXISTS subquery +--echo # ============================================ +--echo # + +SELECT d.dept_name +FROM departments d +WHERE NOT EXISTS (SELECT 1 FROM projects p WHERE p.dept_id = d.dept_id AND p.budget > 400000) +ORDER BY d.dept_id; + +--echo # +--echo # ============================================ +--echo # TEST 14: Correlated subquery +--echo # ============================================ +--echo # + +SELECT e.name, e.salary, e.dept_id +FROM employees e +WHERE e.salary = (SELECT MAX(e2.salary) FROM employees e2 WHERE e2.dept_id = e.dept_id) +ORDER BY e.dept_id; + +--echo # +--echo # ============================================ +--echo # TEST 15: Derived table (subquery in FROM) +--echo # ============================================ +--echo # + +SELECT dept_id, avg_sal +FROM ( + SELECT dept_id, AVG(salary) AS avg_sal + FROM employees + GROUP BY dept_id +) AS dept_avg +WHERE avg_sal > 80000 +ORDER BY dept_id; + +--echo # +--echo # ============================================ +--echo # TEST 16: UNION / UNION ALL +--echo # ============================================ +--echo # + +SELECT name, 'high' AS tier FROM employees WHERE salary >= 90000 +UNION ALL +SELECT name, 'low' AS tier FROM employees WHERE salary < 70000 +ORDER BY name; + +SELECT dept_id FROM employees +UNION +SELECT dept_id FROM projects +ORDER BY dept_id; + +--echo # +--echo # ============================================ +--echo # TEST 17: DISTINCT +--echo # ============================================ +--echo # + +SELECT DISTINCT dept_id FROM employees ORDER BY dept_id; + +SELECT COUNT(DISTINCT dept_id) AS unique_depts FROM employees; + +--echo # +--echo # ============================================ +--echo # TEST 18: ORDER BY with LIMIT / OFFSET +--echo # ============================================ 
+--echo # + +SELECT name, salary FROM employees ORDER BY salary DESC LIMIT 3; + +SELECT name, salary FROM employees ORDER BY salary DESC LIMIT 3 OFFSET 3; + +--echo # +--echo # ============================================ +--echo # TEST 19: CASE expression +--echo # ============================================ +--echo # + +SELECT name, salary, + CASE + WHEN salary >= 100000 THEN 'Senior' + WHEN salary >= 80000 THEN 'Mid' + ELSE 'Junior' + END AS level +FROM employees +ORDER BY emp_id; + +--echo # +--echo # ============================================ +--echo # TEST 20: INSERT ... SELECT +--echo # ============================================ +--echo # + +CREATE TABLE high_earners ( + emp_id INT PRIMARY KEY, + name VARCHAR(100), + salary DECIMAL(10,2) +) ENGINE=TIDESDB; + +INSERT INTO high_earners +SELECT emp_id, name, salary FROM employees WHERE salary >= 90000; + +SELECT * FROM high_earners ORDER BY emp_id; +DROP TABLE high_earners; + +--echo # +--echo # ============================================ +--echo # TEST 21: UPDATE with subquery +--echo # ============================================ +--echo # + +CREATE TABLE emp_copy AS SELECT * FROM employees; +ALTER TABLE emp_copy ENGINE=TIDESDB; + +UPDATE emp_copy SET salary = salary * 1.10 +WHERE dept_id = (SELECT dept_id FROM departments WHERE dept_name = 'Marketing'); + +SELECT emp_id, name, salary FROM emp_copy WHERE dept_id = 2 ORDER BY emp_id; +DROP TABLE emp_copy; + +--echo # +--echo # ============================================ +--echo # TEST 22: DELETE with subquery +--echo # ============================================ +--echo # + +CREATE TABLE emp_copy2 AS SELECT * FROM employees; +ALTER TABLE emp_copy2 ENGINE=TIDESDB; + +DELETE FROM emp_copy2 +WHERE dept_id NOT IN (SELECT dept_id FROM departments WHERE dept_name IN ('Engineering', 'Finance')); + +SELECT emp_id, name FROM emp_copy2 ORDER BY emp_id; +DROP TABLE emp_copy2; + +--echo # +--echo # ============================================ +--echo # TEST 23: 
REPLACE INTO +--echo # ============================================ +--echo # + +CREATE TABLE kv_store ( + k VARCHAR(50) PRIMARY KEY, + v VARCHAR(200) +) ENGINE=TIDESDB; + +INSERT INTO kv_store VALUES ('key1', 'original'); +REPLACE INTO kv_store VALUES ('key1', 'replaced'); +REPLACE INTO kv_store VALUES ('key2', 'new'); + +SELECT * FROM kv_store ORDER BY k; +DROP TABLE kv_store; + +--echo # +--echo # ============================================ +--echo # TEST 24: Multi-column ORDER BY +--echo # ============================================ +--echo # + +SELECT dept_id, name, salary +FROM employees +ORDER BY dept_id ASC, salary DESC; + +--echo # +--echo # ============================================ +--echo # TEST 25: GROUP_CONCAT +--echo # ============================================ +--echo # + +SELECT dept_id, GROUP_CONCAT(name ORDER BY name SEPARATOR ', ') AS members +FROM employees +GROUP BY dept_id +ORDER BY dept_id; + +--echo # +--echo # ============================================ +--echo # TEST 26: BETWEEN / IN / LIKE +--echo # ============================================ +--echo # + +SELECT name, salary FROM employees WHERE salary BETWEEN 70000 AND 90000 ORDER BY emp_id; + +SELECT name FROM employees WHERE name LIKE '%a%' ORDER BY emp_id; + +SELECT name FROM employees WHERE emp_id IN (1, 3, 5, 7, 9) ORDER BY emp_id; + +--echo # +--echo # ============================================ +--echo # TEST 27: NULL handling +--echo # ============================================ +--echo # + +CREATE TABLE nullable_test ( + id INT PRIMARY KEY, + val VARCHAR(50), + num INT +) ENGINE=TIDESDB; + +INSERT INTO nullable_test VALUES (1, 'hello', 10), (2, NULL, 20), (3, 'world', NULL), (4, NULL, NULL); + +SELECT * FROM nullable_test ORDER BY id; +SELECT * FROM nullable_test WHERE val IS NULL ORDER BY id; +SELECT * FROM nullable_test WHERE num IS NOT NULL ORDER BY id; +SELECT COUNT(*) AS total, COUNT(val) AS non_null_val, COUNT(num) AS non_null_num FROM nullable_test; +SELECT 
COALESCE(val, 'N/A') AS val_or_na, COALESCE(num, 0) AS num_or_zero FROM nullable_test ORDER BY id; + +DROP TABLE nullable_test; + +--echo # +--echo # ============================================ +--echo # TEST 28: Self-join +--echo # ============================================ +--echo # + +SELECT e1.name AS employee, e2.name AS colleague +FROM employees e1 +JOIN employees e2 ON e1.dept_id = e2.dept_id AND e1.emp_id < e2.emp_id +WHERE e1.dept_id = 1 +ORDER BY e1.emp_id, e2.emp_id; + +--echo # +--echo # ============================================ +--echo # TEST 29: Aggregate with JOIN and GROUP BY +--echo # ============================================ +--echo # + +SELECT p.proj_name, COUNT(ep.emp_id) AS team_size, SUM(ep.hours) AS total_hours +FROM projects p +LEFT JOIN emp_projects ep ON p.proj_id = ep.proj_id +GROUP BY p.proj_id, p.proj_name +ORDER BY p.proj_id; + +--echo # +--echo # ============================================ +--echo # TEST 30: Nested aggregation (max of avg) +--echo # ============================================ +--echo # + +SELECT dept_id, avg_sal FROM ( + SELECT dept_id, AVG(salary) AS avg_sal + FROM employees + GROUP BY dept_id +) t +WHERE avg_sal = ( + SELECT MAX(avg_sal) FROM ( + SELECT AVG(salary) AS avg_sal FROM employees GROUP BY dept_id + ) t2 +); + +--echo # +--echo # ============================================ +--echo # TEST 31: UNION with ORDER BY and LIMIT +--echo # ============================================ +--echo # + +(SELECT name, salary FROM employees WHERE dept_id = 1 ORDER BY salary DESC LIMIT 2) +UNION ALL +(SELECT name, salary FROM employees WHERE dept_id = 3 ORDER BY salary DESC LIMIT 2) +ORDER BY salary DESC; + +--echo # +--echo # ============================================ +--echo # TEST 32: Multi-statement transaction +--echo # ============================================ +--echo # + +BEGIN; +INSERT INTO employees VALUES (11, 'Kim', 1, 99000.00, '2024-01-01'); +UPDATE employees SET salary = salary + 1000 WHERE 
emp_id = 11; +SELECT emp_id, name, salary FROM employees WHERE emp_id = 11; +COMMIT; + +SELECT emp_id, name, salary FROM employees WHERE emp_id = 11; +DELETE FROM employees WHERE emp_id = 11; + +--echo # +--echo # ============================================ +--echo # TEST 33: Transaction ROLLBACK +--echo # ============================================ +--echo # + +BEGIN; +INSERT INTO employees VALUES (12, 'Leo', 2, 77000.00, '2024-02-01'); +SELECT COUNT(*) AS cnt_with_leo FROM employees WHERE emp_id = 12; +ROLLBACK; + +SELECT COUNT(*) AS cnt_after_rollback FROM employees WHERE emp_id = 12; + +--echo # +--echo # ============================================ +--echo # TEST 34: IF / IFNULL / NULLIF functions +--echo # ============================================ +--echo # + +SELECT name, + IF(salary > 90000, 'Y', 'N') AS high_earner, + NULLIF(dept_id, 4) AS dept_or_null +FROM employees +ORDER BY emp_id; + +--echo # +--echo # ============================================ +--echo # TEST 35: String functions +--echo # ============================================ +--echo # + +SELECT name, + UPPER(name) AS upper_name, + LENGTH(name) AS name_len, + CONCAT(name, ' (', dept_id, ')') AS name_dept +FROM employees +ORDER BY emp_id +LIMIT 5; + +--echo # +--echo # ============================================ +--echo # TEST 36: Date functions +--echo # ============================================ +--echo # + +SELECT name, hire_date, + YEAR(hire_date) AS hire_year, + MONTH(hire_date) AS hire_month +FROM employees +ORDER BY emp_id +LIMIT 5; + +SELECT YEAR(hire_date) AS yr, COUNT(*) AS hired +FROM employees +GROUP BY yr +ORDER BY yr; + +--echo # +--echo # ============================================ +--echo # TEST 37: Arithmetic expressions +--echo # ============================================ +--echo # + +SELECT name, salary, + salary * 12 AS annual, + ROUND(salary / 160, 2) AS hourly_rate +FROM employees +ORDER BY emp_id +LIMIT 5; + +--echo # +--echo # 
============================================ +--echo # TEST 38: HAVING with complex condition +--echo # ============================================ +--echo # + +SELECT d.dept_name, COUNT(*) AS cnt, AVG(e.salary) AS avg_sal +FROM employees e +JOIN departments d ON e.dept_id = d.dept_id +GROUP BY d.dept_id, d.dept_name +HAVING cnt >= 2 AND avg_sal > 75000 +ORDER BY d.dept_id; + +--echo # +--echo # ============================================ +--echo # TEST 39: ALL / ANY subquery +--echo # ============================================ +--echo # + +SELECT name, salary +FROM employees +WHERE salary > ALL (SELECT salary FROM employees WHERE dept_id = 2) +ORDER BY emp_id; + +SELECT name, salary +FROM employees +WHERE salary > ANY (SELECT salary FROM employees WHERE dept_id = 1) +ORDER BY emp_id; + +--echo # +--echo # ============================================ +--echo # TEST 40: CREATE TABLE ... AS SELECT +--echo # ============================================ +--echo # + +CREATE TABLE dept_summary ENGINE=TIDESDB AS +SELECT d.dept_id, d.dept_name, COUNT(e.emp_id) AS headcount, SUM(e.salary) AS total_sal +FROM departments d +LEFT JOIN employees e ON d.dept_id = e.dept_id +GROUP BY d.dept_id, d.dept_name; + +SELECT * FROM dept_summary ORDER BY dept_id; +DROP TABLE dept_summary; + +--echo # +--echo # ============================================ +--echo # CLEANUP +--echo # ============================================ +--echo # + +DROP TABLE emp_projects; +DROP TABLE projects; +DROP TABLE employees; +DROP TABLE departments; +--source suite/tidesdb/include/cleanup_tidesdb.inc diff --git a/mysql-test/suite/tidesdb/t/tidesdb_status_vars.test b/mysql-test/suite/tidesdb/t/tidesdb_status_vars.test new file mode 100644 index 0000000000000..659d640034d8d --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_status_vars.test @@ -0,0 +1,53 @@ +--source include/have_tidesdb.inc +# +# Test: SHOW GLOBAL STATUS LIKE 'tidesdb%' status variables +# + +--echo # +--echo # TEST 1: Status 
variables exist +--echo # + +# All 19 variables should be present +SELECT COUNT(*) >= 19 AS has_all_vars FROM information_schema.GLOBAL_STATUS +WHERE VARIABLE_NAME LIKE 'TIDESDB%'; + +--echo # +--echo # TEST 2: Variables have reasonable values after table operations +--echo # + +CREATE TABLE t_stat (id INT PRIMARY KEY, v VARCHAR(200)) ENGINE=TidesDB; +INSERT INTO t_stat VALUES (1, REPEAT('A', 100)), (2, REPEAT('B', 100)); +SELECT * FROM t_stat ORDER BY id; + +# Force stats refresh (suppress output -- paths and counters vary per worker) +--disable_result_log +SHOW ENGINE TIDESDB STATUS; +--enable_result_log + +# Column families should be > 0 +SELECT VARIABLE_VALUE > 0 AS cf_positive FROM information_schema.GLOBAL_STATUS +WHERE VARIABLE_NAME = 'TIDESDB_COLUMN_FAMILIES'; + +# Memory limit should be > 0 +SELECT VARIABLE_VALUE > 0 AS mem_positive FROM information_schema.GLOBAL_STATUS +WHERE VARIABLE_NAME = 'TIDESDB_MEMORY_LIMIT'; + +# Cache partitions should be > 0 +SELECT VARIABLE_VALUE > 0 AS parts_positive FROM information_schema.GLOBAL_STATUS +WHERE VARIABLE_NAME = 'TIDESDB_CACHE_PARTITIONS'; + +--echo # +--echo # TEST 3: All variable names are correct +--echo # + +SELECT VARIABLE_NAME FROM information_schema.GLOBAL_STATUS +WHERE VARIABLE_NAME LIKE 'TIDESDB%' ORDER BY VARIABLE_NAME; + +--echo # +--echo # Cleanup +--echo # + +DROP TABLE t_stat; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_stress.opt b/mysql-test/suite/tidesdb/t/tidesdb_stress.opt new file mode 100644 index 0000000000000..2c58a75714aed --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_stress.opt @@ -0,0 +1 @@ +--loose-tidesdb-stress-test=1 diff --git a/mysql-test/suite/tidesdb/t/tidesdb_stress.test b/mysql-test/suite/tidesdb/t/tidesdb_stress.test new file mode 100644 index 0000000000000..e895af87bf601 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_stress.test @@ -0,0 +1,551 @@ +--source include/have_tidesdb.inc +# +# TidesDB stress test -- concurrent operations, transaction paths, iterator +# reuse, rollback, TRUNCATE races, secondary index maintenance, and large +# batch pressure. Exercises the deferred-commit and txn_reset code paths. +# + +--echo # +--echo # === Setup === +--echo # + +CREATE TABLE stress_main ( + id INT PRIMARY KEY, + val VARCHAR(200), + score INT, + KEY idx_score (score) +) ENGINE=TIDESDB; + +CREATE TABLE stress_nopk ( + a INT, + b VARCHAR(100) +) ENGINE=TIDESDB; + +CREATE TABLE stress_wide ( + id INT PRIMARY KEY, + c1 VARCHAR(100), + c2 VARCHAR(100), + c3 INT, + c4 BIGINT, + c5 DECIMAL(10,2), + c6 DATE, + KEY idx_c3 (c3), + KEY idx_c4 (c4) +) ENGINE=TIDESDB; + +--echo # +--echo # ============================================ +--echo # TEST 1: Multi-statement transaction -- deferred commit path +--echo # Exercises: tidesdb_commit(all=false) returning early, +--echo # iterator reuse across statements, single commit at END. 
+--echo # ============================================ +--echo # + +BEGIN; +INSERT INTO stress_main VALUES (1, 'txn_row_1', 10); +INSERT INTO stress_main VALUES (2, 'txn_row_2', 20); +INSERT INTO stress_main VALUES (3, 'txn_row_3', 30); +UPDATE stress_main SET val = 'updated_in_txn' WHERE id = 2; +DELETE FROM stress_main WHERE id = 3; +SELECT COUNT(*) AS cnt FROM stress_main; +COMMIT; + +SELECT * FROM stress_main ORDER BY id; + +--echo # +--echo # ============================================ +--echo # TEST 2: Autocommit path -- each statement commits immediately +--echo # Exercises: tidesdb_commit(all=false) with autocommit (real commit). +--echo # ============================================ +--echo # + +INSERT INTO stress_main VALUES (3, 'autocommit_3', 30); +INSERT INTO stress_main VALUES (4, 'autocommit_4', 40); +UPDATE stress_main SET score = score + 100; +SELECT * FROM stress_main ORDER BY id; + +--echo # +--echo # ============================================ +--echo # TEST 3: Explicit ROLLBACK -- transaction-level rollback +--echo # Exercises: tidesdb_rollback(all=true), txn_reset after rollback. +--echo # ============================================ +--echo # + +BEGIN; +INSERT INTO stress_main VALUES (99, 'will_rollback', 999); +UPDATE stress_main SET val = 'dirty' WHERE id = 1; +SELECT COUNT(*) AS cnt FROM stress_main; +ROLLBACK; + +# Verify rollback took effect +SELECT * FROM stress_main ORDER BY id; + +--echo # +--echo # ============================================ +--echo # TEST 4: Mixed reads and writes in one transaction +--echo # Exercises: iterator reuse across read+write statements, +--echo # scan_iter surviving F_UNLCK when txn is deferred. 
+--echo # ============================================ +--echo # + +BEGIN; +SELECT COUNT(*) AS before_cnt FROM stress_main; +INSERT INTO stress_main VALUES (5, 'mixed_5', 50); +SELECT COUNT(*) AS mid_cnt FROM stress_main; +UPDATE stress_main SET score = 0 WHERE id = 5; +SELECT * FROM stress_main WHERE id = 5; +DELETE FROM stress_main WHERE id = 4; +SELECT COUNT(*) AS after_cnt FROM stress_main; +COMMIT; + +SELECT * FROM stress_main ORDER BY id; + +--echo # +--echo # ============================================ +--echo # TEST 5: Secondary index scan under transaction +--echo # Exercises: index_read_map, sec_idx_key, iterator on index CF. +--echo # ============================================ +--echo # + +BEGIN; +INSERT INTO stress_main VALUES (6, 'idx_6', 60); +INSERT INTO stress_main VALUES (7, 'idx_7', 70); +INSERT INTO stress_main VALUES (8, 'idx_8', 60); +COMMIT; + +# Index range scan +SELECT id, val, score FROM stress_main WHERE score = 60 ORDER BY id; +SELECT id, val, score FROM stress_main WHERE score >= 100 ORDER BY id; +SELECT id, val, score FROM stress_main WHERE score BETWEEN 50 AND 120 ORDER BY id; + +--echo # +--echo # ============================================ +--echo # TEST 6: Hidden PK table -- exercises next_row_id generation +--echo # ============================================ +--echo # + +BEGIN; +INSERT INTO stress_nopk VALUES (1, 'nopk_a'); +INSERT INTO stress_nopk VALUES (2, 'nopk_b'); +INSERT INTO stress_nopk VALUES (3, 'nopk_c'); +COMMIT; + +SELECT * FROM stress_nopk ORDER BY a; + +UPDATE stress_nopk SET b = 'updated' WHERE a = 2; +SELECT * FROM stress_nopk ORDER BY a; + +DELETE FROM stress_nopk WHERE a = 1; +SELECT COUNT(*) AS cnt FROM stress_nopk; + +--echo # +--echo # ============================================ +--echo # TEST 7: Large batch insert -- memtable pressure +--echo # Exercises: write_buffer flush, iterator over many keys. 
+--echo # ============================================ +--echo # + +--disable_query_log +let $i= 100; +while ($i <= 599) +{ + eval INSERT INTO stress_main VALUES ($i, CONCAT('batch_', $i), $i MOD 50); + inc $i; +} +--enable_query_log + +SELECT COUNT(*) AS cnt FROM stress_main; +SELECT COUNT(*) AS high_score FROM stress_main WHERE score >= 40; + +--echo # +--echo # ============================================ +--echo # TEST 8: Large batch in single transaction +--echo # Exercises: many writes buffered in one txn, single commit. +--echo # ============================================ +--echo # + +BEGIN; +--disable_query_log +let $i= 1000; +while ($i <= 1499) +{ + eval INSERT INTO stress_wide VALUES ($i, CONCAT('c1_', $i), CONCAT('c2_', $i), $i MOD 100, $i * 10, $i + 0.50, '2025-01-01'); + inc $i; +} +--enable_query_log +COMMIT; + +SELECT COUNT(*) AS cnt FROM stress_wide; +SELECT COUNT(*) AS idx_match FROM stress_wide WHERE c3 = 50; +SELECT COUNT(*) AS idx_range FROM stress_wide WHERE c4 BETWEEN 10000 AND 10100; + +--echo # +--echo # ============================================ +--echo # TEST 9: Bulk UPDATE + DELETE in transaction +--echo # Exercises: update_row and delete_row across many rows, +--echo # secondary index maintenance (old key delete + new key insert). +--echo # ============================================ +--echo # + +BEGIN; +UPDATE stress_wide SET c3 = c3 + 200 WHERE c3 < 10; +DELETE FROM stress_wide WHERE c4 > 14000; +COMMIT; + +SELECT COUNT(*) AS cnt FROM stress_wide; +SELECT MIN(c3) AS min_c3, MAX(c3) AS max_c3 FROM stress_wide; + +--echo # +--echo # ============================================ +--echo # TEST 10: TRUNCATE -- exercises delete_all_rows +--echo # Exercises: txn rollback+free before CF drop, CF recreate, +--echo # share->cf pointer update. 
+--echo # ============================================ +--echo # + +SELECT COUNT(*) AS before_trunc FROM stress_wide; +TRUNCATE TABLE stress_wide; +SELECT COUNT(*) AS after_trunc FROM stress_wide; + +# Re-insert after truncate to verify CF is usable +INSERT INTO stress_wide VALUES (1, 'post_trunc', 'ok', 1, 1, 1.00, '2025-06-01'); +SELECT * FROM stress_wide; + +--echo # +--echo # ============================================ +--echo # TEST 11: Concurrent readers and writers +--echo # Exercises: multiple connections with overlapping transactions, +--echo # lock-free MVCC concurrency, separate per-connection txns. +--echo # ============================================ +--echo # + +# Reset to the base rows: drop the id >= 100 batch rows inserted by TEST 7 +DELETE FROM stress_main WHERE id >= 100; +SELECT COUNT(*) AS base_cnt FROM stress_main; + +connect (writer1, localhost, root,,); +connect (writer2, localhost, root,,); +connect (reader1, localhost, root,,); + +# Writer1: begin a long transaction +connection writer1; +BEGIN; +send INSERT INTO stress_main VALUES (1001, 'w1_a', 11); + +connection writer2; +# Writer2: concurrent inserts (autocommit) +send INSERT INTO stress_main VALUES (2001, 'w2_a', 22); + +# Reap both +connection writer1; +reap; +send INSERT INTO stress_main VALUES (1002, 'w1_b', 12); + +connection writer2; +reap; +send INSERT INTO stress_main VALUES (2002, 'w2_b', 23); + +connection writer1; +reap; + +connection writer2; +reap; + +# Reader1: read after the inserts sent on both writer connections have been reaped +# writer1 txn is still open (uncommitted), writer2 rows are committed +connection reader1; +SELECT COUNT(*) AS reader_sees FROM stress_main; + +# Writer1: commit the transaction +connection writer1; +COMMIT; + +# Writer2: one more autocommit insert +connection writer2; +INSERT INTO stress_main VALUES (2003, 'w2_c', 24); + +# Final read from default connection +connection default; +SELECT COUNT(*) AS final_cnt FROM stress_main WHERE id >= 1000; + +disconnect writer1; +disconnect writer2; +disconnect reader1; + +--echo # +--echo #
============================================ +--echo # TEST 12: Concurrent transactions with rollback +--echo # Exercises: one connection commits, another rolls back. +--echo # ============================================ +--echo # + +connect (conn_commit, localhost, root,,); +connect (conn_rollback, localhost, root,,); + +connection conn_commit; +BEGIN; +INSERT INTO stress_main VALUES (3001, 'will_commit', 31); + +connection conn_rollback; +BEGIN; +INSERT INTO stress_main VALUES (4001, 'will_rollback', 41); + +# Interleave more operations +connection conn_commit; +INSERT INTO stress_main VALUES (3002, 'will_commit_2', 32); + +connection conn_rollback; +INSERT INTO stress_main VALUES (4002, 'will_rollback_2', 42); + +# Commit one, rollback the other +connection conn_commit; +COMMIT; + +connection conn_rollback; +ROLLBACK; + +connection default; +# Only 3001,3002 should exist; 4001,4002 should not +SELECT id, val FROM stress_main WHERE id IN (3001, 3002, 4001, 4002) ORDER BY id; + +disconnect conn_commit; +disconnect conn_rollback; + +--echo # +--echo # ============================================ +--echo # TEST 13: Rapid open/close cycle -- exercises close() cleanup +--echo # Multiple short-lived connections each doing a quick operation. 
+--echo # ============================================ +--echo # + +connect (rapid1, localhost, root,,); +connection rapid1; +SELECT COUNT(*) > 0 AS has_rows FROM stress_main; +disconnect rapid1; + +connect (rapid2, localhost, root,,); +connection rapid2; +INSERT INTO stress_main VALUES (5001, 'rapid', 50); +disconnect rapid2; + +connect (rapid3, localhost, root,,); +connection rapid3; +BEGIN; +INSERT INTO stress_main VALUES (5002, 'rapid_txn', 51); +COMMIT; +disconnect rapid3; + +connection default; +SELECT COUNT(*) AS rapid_cnt FROM stress_main WHERE id IN (5001, 5002); + +--echo # +--echo # ============================================ +--echo # TEST 14: INSERT...SELECT across TidesDB tables in transaction +--echo # Exercises: read from one CF + write to another in same txn. +--echo # ============================================ +--echo # + +TRUNCATE TABLE stress_wide; + +BEGIN; +INSERT INTO stress_wide (id, c1, c2, c3, c4, c5, c6) + SELECT id, val, val, score, score * 10, score + 0.50, '2025-01-01' + FROM stress_main + WHERE id <= 8; +COMMIT; + +SELECT COUNT(*) AS copied FROM stress_wide; +SELECT * FROM stress_wide ORDER BY id; + +--echo # +--echo # ============================================ +--echo # TEST 15: UPDATE that changes secondary index key +--echo # Exercises: sec index delete(old) + insert(new) in update_row. 
+--echo # ============================================ +--echo # + +# Before: score values +SELECT id, score FROM stress_main WHERE id <= 5 ORDER BY id; + +BEGIN; +UPDATE stress_main SET score = score + 1000 WHERE id <= 5; +COMMIT; + +# After: verify new index values are reachable +SELECT id, score FROM stress_main WHERE score >= 1000 ORDER BY id; + +# Restore +BEGIN; +UPDATE stress_main SET score = score - 1000 WHERE id <= 5; +COMMIT; + +SELECT id, score FROM stress_main WHERE id <= 5 ORDER BY id; + +--echo # +--echo # ============================================ +--echo # TEST 16: Concurrent bulk writers + reader +--echo # Exercises: heavy concurrent write pressure from multiple +--echo # connections, verifies no data corruption. +--echo # ============================================ +--echo # + +CREATE TABLE stress_bulk (id INT PRIMARY KEY, val VARCHAR(50)) ENGINE=TIDESDB; + +connect (bulk1, localhost, root,,); +connect (bulk2, localhost, root,,); +connect (bulk3, localhost, root,,); + +connection bulk1; +send BEGIN; + +connection bulk2; +send BEGIN; + +connection bulk1; +reap; + +connection bulk2; +reap; + +--disable_query_log + +# Bulk1: insert 1-100 +connection bulk1; +let $i= 1; +while ($i <= 100) +{ + eval INSERT INTO stress_bulk VALUES ($i, CONCAT('b1_', $i)); + inc $i; +} + +# Bulk2: insert 101-200 +connection bulk2; +let $i= 101; +while ($i <= 200) +{ + eval INSERT INTO stress_bulk VALUES ($i, CONCAT('b2_', $i)); + inc $i; +} + +--enable_query_log + +# Commit both +connection bulk1; +send COMMIT; + +connection bulk2; +send COMMIT; + +connection bulk1; +reap; + +connection bulk2; +reap; + +# Bulk3: read while data settles +connection bulk3; +SELECT COUNT(*) AS bulk_total FROM stress_bulk; + +# Verify no gaps +SELECT COUNT(DISTINCT id) AS unique_ids FROM stress_bulk; + +connection default; +disconnect bulk1; +disconnect bulk2; +disconnect bulk3; + +DROP TABLE stress_bulk; + +--echo # +--echo # ============================================ +--echo # TEST 17: 
Repeated TRUNCATE + re-insert cycle +--echo # Exercises: repeated CF drop/recreate, share->cf pointer +--echo # update, txn discard before drop. +--echo # ============================================ +--echo # + +CREATE TABLE stress_trunc (id INT PRIMARY KEY, val VARCHAR(50)) ENGINE=TIDESDB; + +let $round= 1; +while ($round <= 5) +{ + --disable_query_log + eval INSERT INTO stress_trunc VALUES ($round, CONCAT('round_', $round)); + eval INSERT INTO stress_trunc VALUES ($round + 10, CONCAT('round_', $round, '_b')); + --enable_query_log + TRUNCATE TABLE stress_trunc; + inc $round; +} + +SELECT COUNT(*) AS after_cycles FROM stress_trunc; + +# Final insert after repeated truncation +INSERT INTO stress_trunc VALUES (1, 'final'); +SELECT * FROM stress_trunc; + +DROP TABLE stress_trunc; + +--echo # +--echo # ============================================ +--echo # TEST 18: Transaction with only reads (read-only txn path) +--echo # Exercises: tidesdb_commit with dirty=false, rollback+reset path. +--echo # ============================================ +--echo # + +BEGIN; +SELECT COUNT(*) AS ro_cnt FROM stress_main; +SELECT * FROM stress_main WHERE id = 1; +SELECT MIN(score) AS min_s, MAX(score) AS max_s FROM stress_main; +COMMIT; + +--echo # +--echo # ============================================ +--echo # TEST 19: PK uniqueness enforcement and REPLACE INTO +--echo # Duplicate PK INSERT must return an error. +--echo # REPLACE INTO overwrites the existing row. 
+--echo # ============================================ +--echo # + +CREATE TABLE stress_uniq (id INT PRIMARY KEY, val VARCHAR(50)) ENGINE=TIDESDB; +INSERT INTO stress_uniq VALUES (1, 'first'); + +# Duplicate PK INSERT must fail +--error ER_DUP_ENTRY +INSERT INTO stress_uniq VALUES (1, 'should_fail'); + +# REPLACE INTO should overwrite +REPLACE INTO stress_uniq VALUES (1, 'replaced'); + +BEGIN; +INSERT INTO stress_uniq VALUES (2, 'second'); +REPLACE INTO stress_uniq VALUES (1, 'overwritten'); +INSERT INTO stress_uniq VALUES (3, 'third'); +COMMIT; + +# id=1 should have the overwritten value +SELECT * FROM stress_uniq ORDER BY id; + +DROP TABLE stress_uniq; + +--echo # +--echo # ============================================ +--echo # TEST 20: Verify data integrity after all stress +--echo # Final consistency check on the main table. +--echo # ============================================ +--echo # + +# Verify primary key scan +SELECT COUNT(*) AS total FROM stress_main; +# Verify index scan matches +SELECT COUNT(*) AS idx_total FROM stress_main WHERE score >= 0 OR score < 0 OR score IS NULL; + +--echo # +--echo # === Cleanup === +--echo # + +DROP TABLE stress_main; +DROP TABLE stress_nopk; +DROP TABLE stress_wide; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_tombstone_density.test b/mysql-test/suite/tidesdb/t/tidesdb_tombstone_density.test new file mode 100644 index 0000000000000..dde3eb5b61d36 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_tombstone_density.test @@ -0,0 +1,134 @@ +--source include/have_tidesdb.inc +# +# Coverage for the new TidesDB 9.1 capabilities exposed in TideSQL 4.4.0: +# - per-table TOMBSTONE_DENSITY_TRIGGER and TOMBSTONE_DENSITY_MIN_ENTRIES +# - tidesdb_default_tombstone_density_* THDVAR defaults +# - tidesdb_compact_after_range_delete_min_rows auto-trigger session variable +# - tombstone aggregates surfaced as global status variables +# + +--echo # +--echo # === Table-level tombstone density options accept and persist === +--echo # + +CREATE TABLE t_td ( + pk BIGINT PRIMARY KEY, + c0 INT, + KEY (c0) +) ENGINE=TIDESDB TOMBSTONE_DENSITY_TRIGGER=5000 TOMBSTONE_DENSITY_MIN_ENTRIES=512; + +# CREATE_OPTIONS prefixes each option with backticks; LOCATE the +# option name and the literal '=NNNN' value separately so we don't +# depend on default-bearing options shifting in surrounding text. 
+SELECT LOCATE('TOMBSTONE_DENSITY_TRIGGER', CREATE_OPTIONS) > 0 AS has_trigger + FROM information_schema.TABLES + WHERE TABLE_SCHEMA='test' AND TABLE_NAME='t_td'; +SELECT LOCATE('=5000', CREATE_OPTIONS) > 0 AS trigger_value + FROM information_schema.TABLES + WHERE TABLE_SCHEMA='test' AND TABLE_NAME='t_td'; +SELECT LOCATE('=512', CREATE_OPTIONS) > 0 AS min_entries_value + FROM information_schema.TABLES + WHERE TABLE_SCHEMA='test' AND TABLE_NAME='t_td'; + +ALTER TABLE t_td TOMBSTONE_DENSITY_TRIGGER=2000; +SELECT LOCATE('=2000', CREATE_OPTIONS) > 0 AS new_value + FROM information_schema.TABLES + WHERE TABLE_SCHEMA='test' AND TABLE_NAME='t_td'; + +DROP TABLE t_td; + +--echo # +--echo # === Session-default inheritance === +--echo # + +SET SESSION tidesdb_default_tombstone_density_trigger = 4000; +SET SESSION tidesdb_default_tombstone_density_min_entries = 256; + +CREATE TABLE t_default_td (pk BIGINT PRIMARY KEY, c0 INT) ENGINE=TIDESDB; +SELECT LOCATE('=4000', CREATE_OPTIONS) > 0 AS inherits_trigger + FROM information_schema.TABLES + WHERE TABLE_SCHEMA='test' AND TABLE_NAME='t_default_td'; +SELECT LOCATE('=256', CREATE_OPTIONS) > 0 AS inherits_min + FROM information_schema.TABLES + WHERE TABLE_SCHEMA='test' AND TABLE_NAME='t_default_td'; + +DROP TABLE t_default_td; + +SET SESSION tidesdb_default_tombstone_density_trigger = DEFAULT; +SET SESSION tidesdb_default_tombstone_density_min_entries = DEFAULT; + +--echo # +--echo # === Auto compact-after-range-delete session variable === +--echo # + +SHOW VARIABLES LIKE 'tidesdb_compact_after_range_delete_min_rows'; + +CREATE TABLE t_auto ( + pk BIGINT PRIMARY KEY, + c0 INT, + c1 INT, + KEY (c0), + KEY (c1) +) ENGINE=TIDESDB; + +# Seed 100 rows with two 50-row VALUES inserts (no SEQUENCE engine dependency). 
+INSERT INTO t_auto (pk,c0,c1) VALUES + (1,0,2),(2,1,4),(3,2,6),(4,3,8),(5,4,10), + (6,5,12),(7,6,14),(8,7,16),(9,8,18),(10,9,20), + (11,0,22),(12,1,24),(13,2,26),(14,3,28),(15,4,30), + (16,5,32),(17,6,34),(18,7,36),(19,8,38),(20,9,40), + (21,0,42),(22,1,44),(23,2,46),(24,3,48),(25,4,50), + (26,5,52),(27,6,54),(28,7,56),(29,8,58),(30,9,60), + (31,0,62),(32,1,64),(33,2,66),(34,3,68),(35,4,70), + (36,5,72),(37,6,74),(38,7,76),(39,8,78),(40,9,80), + (41,0,82),(42,1,84),(43,2,86),(44,3,88),(45,4,90), + (46,5,92),(47,6,94),(48,7,96),(49,8,98),(50,9,100); +INSERT INTO t_auto (pk,c0,c1) VALUES + (51,0,102),(52,1,104),(53,2,106),(54,3,108),(55,4,110), + (56,5,112),(57,6,114),(58,7,116),(59,8,118),(60,9,120), + (61,0,122),(62,1,124),(63,2,126),(64,3,128),(65,4,130), + (66,5,132),(67,6,134),(68,7,136),(69,8,138),(70,9,140), + (71,0,142),(72,1,144),(73,2,146),(74,3,148),(75,4,150), + (76,5,152),(77,6,154),(78,7,156),(79,8,158),(80,9,160), + (81,0,162),(82,1,164),(83,2,166),(84,3,168),(85,4,170), + (86,5,172),(87,6,174),(88,7,176),(89,8,178),(90,9,180), + (91,0,182),(92,1,184),(93,2,186),(94,3,188),(95,4,190), + (96,5,192),(97,6,194),(98,7,196),(99,8,198),(100,9,200); + +SELECT COUNT(*) FROM t_auto; + +--echo # threshold below the deleted-row count, auto compact fires silently. +--echo # We assert reads remain correct after the synchronous compaction. +SET SESSION tidesdb_compact_after_range_delete_min_rows = 20; +DELETE FROM t_auto WHERE pk BETWEEN 30 AND 70; +SELECT COUNT(*) FROM t_auto; +SELECT pk FROM t_auto WHERE pk BETWEEN 28 AND 32 ORDER BY pk; +SELECT pk FROM t_auto WHERE pk BETWEEN 68 AND 72 ORDER BY pk; +SELECT pk FROM t_auto WHERE c0 = 5 AND pk < 70 ORDER BY pk; +SELECT pk FROM t_auto WHERE c1 = 134; + +--echo # threshold above the deleted-row count, auto compact does NOT fire. 
+SET SESSION tidesdb_compact_after_range_delete_min_rows = 1000000; +DELETE FROM t_auto WHERE pk BETWEEN 75 AND 79; +SELECT COUNT(*) FROM t_auto; +SELECT pk FROM t_auto WHERE pk BETWEEN 73 AND 81 ORDER BY pk; + +SET SESSION tidesdb_compact_after_range_delete_min_rows = DEFAULT; + +DROP TABLE t_auto; + +--echo # +--echo # === Tombstone status variables exist and are non-negative === +--echo # + +SELECT IF(VARIABLE_VALUE >= 0, 'ok', 'bad') AS total + FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='TIDESDB_TOTAL_TOMBSTONES'; +SELECT IF(VARIABLE_VALUE >= 0, 'ok', 'bad') AS ratio + FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='TIDESDB_TOMBSTONE_RATIO'; +SELECT IF(VARIABLE_VALUE >= 0, 'ok', 'bad') AS density + FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='TIDESDB_MAX_SST_TOMBSTONE_DENSITY'; +SELECT IF(VARIABLE_VALUE >= 0, 'ok', 'bad') AS density_level + FROM information_schema.GLOBAL_STATUS WHERE VARIABLE_NAME='TIDESDB_MAX_SST_TOMBSTONE_DENSITY_LEVEL'; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_tpcc_contention.opt b/mysql-test/suite/tidesdb/t/tidesdb_tpcc_contention.opt new file mode 100644 index 0000000000000..917be657b5ded --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_tpcc_contention.opt @@ -0,0 +1 @@ +--loose-tidesdb-pessimistic-locking=ON diff --git a/mysql-test/suite/tidesdb/t/tidesdb_tpcc_contention.test b/mysql-test/suite/tidesdb/t/tidesdb_tpcc_contention.test new file mode 100644 index 0000000000000..20466d6d0c864 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_tpcc_contention.test @@ -0,0 +1,164 @@ +--source include/have_tidesdb.inc +# +# TidesDB TPC-C contention test -- reproduces the exact NEWORD district +# counter read-modify-write pattern that causes 0 NOPM in HammerDB. +# +# The district row is a serial bottleneck: every New Order transaction +# must SELECT d_next_o_id FOR UPDATE, then UPDATE d_next_o_id + 1. 
+# With InnoDB this serializes via row locks. With TidesDB's optimistic +# MVCC, two concurrent transactions both read the same value, both +# write the incremented value, and the second to commit fails with +# TDB_ERR_CONFLICT (mapped to ER_LOCK_DEADLOCK / ER_ERROR_DURING_COMMIT). +# +# This test verifies that concurrent counter increments produce correct +# results without lost updates or permanent failures. +# + +--echo # +--echo # === Setup: TPC-C district table (simplified) === +--echo # + +CREATE TABLE district ( + d_w_id INT NOT NULL, + d_id INT NOT NULL, + d_next_o_id INT NOT NULL, + d_tax DECIMAL(4,4), + PRIMARY KEY (d_w_id, d_id) +) ENGINE=TIDESDB; + +INSERT INTO district VALUES (1, 1, 3001, 0.1000); + +CREATE TABLE orders ( + o_id INT NOT NULL, + o_w_id INT NOT NULL, + o_d_id INT NOT NULL, + o_c_id INT NOT NULL, + PRIMARY KEY (o_w_id, o_d_id, o_id) +) ENGINE=TIDESDB; + +CREATE TABLE new_order ( + no_w_id INT NOT NULL, + no_d_id INT NOT NULL, + no_o_id INT NOT NULL, + PRIMARY KEY (no_w_id, no_d_id, no_o_id) +) ENGINE=TIDESDB; + +--echo # +--echo # === TEST 1: Single-session NEWORD (baseline) === +--echo # + +BEGIN; +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1 FOR UPDATE; +UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; +INSERT INTO orders VALUES (3001, 1, 1, 42); +INSERT INTO new_order VALUES (1, 1, 3001); +COMMIT; + +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1; + +--echo # +--echo # === TEST 2: Two concurrent UPDATEs on same district row === +--echo # With pessimistic_locking=ON, the second UPDATE blocks on the +--echo # row lock until the first commits. Both succeed, counter +--echo # increments by 2 with no conflicts and no lost updates. 
+--echo # + +connect (connA, localhost, root,,); +connect (connB, localhost, root,,); + +# Connection A: UPDATE district (acquires row lock, held until COMMIT) +connection connA; +BEGIN; +UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; + +# Connection B: send UPDATE async -- will block on row lock until A commits +connection connB; +send UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; + +# Connection A: commit -- releases row lock, unblocks B +connection connA; +COMMIT; + +# Connection B: reap -- should succeed now that A released the lock +connection connB; +reap; + +# Check results +connection default; +--echo # Both UPDATEs succeeded: 3002 + 2 = 3004 +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1; + +--echo # +--echo # === TEST 3: Serial counter increment (10 iterations) === +--echo # Verify the counter works correctly when serialized. +--echo # + +--disable_query_log +let $i= 0; +while ($i < 10) +{ + BEGIN; + eval UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; + COMMIT; + inc $i; +} +--enable_query_log + +--echo # Should be initial(3004) + 10 = 3014 +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1; + +--echo # +--echo # === TEST 4: 4 concurrent autocommit UPDATEs on same row === +--echo # With pessimistic_locking=ON, all 4 serialize through the row lock. +--echo # Counter should advance by exactly 4. 
+--echo # + +# Reset counter +UPDATE district SET d_next_o_id = 5001 WHERE d_w_id=1 AND d_id=1; + +connect (storm1, localhost, root,,); +connect (storm2, localhost, root,,); +connect (storm3, localhost, root,,); +connect (storm4, localhost, root,,); + +# Each connection sends exactly one autocommit increment; all 4 are in flight before any reap +connection storm1; +send UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; +connection storm2; +send UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; +connection storm3; +send UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; +connection storm4; +send UPDATE district SET d_next_o_id = d_next_o_id + 1 WHERE d_w_id=1 AND d_id=1; + +connection storm1; +reap; +connection storm2; +reap; +connection storm3; +reap; +connection storm4; +reap; + +connection default; +--echo # All 4 UPDATEs succeeded through serialized row locks: 5001 + 4 = 5005 +SELECT d_next_o_id FROM district WHERE d_w_id=1 AND d_id=1; + +--echo # +--echo # === Cleanup === +--echo # + +disconnect connA; +disconnect connB; +disconnect storm1; +disconnect storm2; +disconnect storm3; +disconnect storm4; + +connection default; +DROP TABLE district; +DROP TABLE orders; +DROP TABLE new_order; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done.
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_ttl.test b/mysql-test/suite/tidesdb/t/tidesdb_ttl.test new file mode 100644 index 0000000000000..889c5d9ff8a18 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_ttl.test @@ -0,0 +1,241 @@ +--source include/have_tidesdb.inc +--source include/not_embedded.inc + + +--echo # +--echo # ============================================ +--echo # TEST 1: Table-level TTL (short expiration) +--echo # ============================================ +--echo # + +CREATE TABLE t_ttl_table ( + id INT PRIMARY KEY, + val VARCHAR(50) +) ENGINE=TIDESDB TTL=8; + +INSERT INTO t_ttl_table VALUES (1, 'alpha'), (2, 'beta'), (3, 'gamma'); + +--echo # Rows should be visible immediately +SELECT * FROM t_ttl_table ORDER BY id; + +--echo # Wait for TTL to expire (3 seconds > 2 second TTL) +--sleep 10 + +--echo # Rows should now be expired (empty result) +SELECT * FROM t_ttl_table ORDER BY id; + +DROP TABLE t_ttl_table; + +--echo # +--echo # ============================================ +--echo # TEST 2: Per-row TTL via TTL_COL field option +--echo # ============================================ +--echo # + +CREATE TABLE t_ttl_col ( + id INT PRIMARY KEY, + val VARCHAR(50), + expire_secs INT `TTL`=1 +) ENGINE=TIDESDB; + +# Row 1: 8-second TTL, Row 2: very long TTL, Row 3: 0 = no expiration +INSERT INTO t_ttl_col VALUES (1, 'short', 8), (2, 'long', 86400), (3, 'forever', 0); + +--echo # All three rows visible immediately +SELECT id, val FROM t_ttl_col ORDER BY id; + +--echo # Wait for the short TTL to expire +--sleep 10 + +--echo # Row 1 should be expired; rows 2 and 3 remain +SELECT id, val FROM t_ttl_col ORDER BY id; + +DROP TABLE t_ttl_col; + +--echo # +--echo # ============================================ +--echo # TEST 3: Per-row TTL overrides table default +--echo # ============================================ +--echo # + +CREATE TABLE t_ttl_override ( + id INT PRIMARY KEY, + val VARCHAR(50), + ttl_val INT `TTL`=1 +) ENGINE=TIDESDB TTL=86400; + +#
Row 1: per-row TTL=8 overrides table default 86400 +# Row 2: per-row TTL=0 falls back to table default 86400 +INSERT INTO t_ttl_override VALUES (1, 'short_override', 8), (2, 'uses_default', 0); + +--echo # Both rows visible immediately +SELECT id, val FROM t_ttl_override ORDER BY id; + +--sleep 10 + +--echo # Row 1 expired (per-row TTL=2 overrode default); row 2 still alive (table TTL=86400) +SELECT id, val FROM t_ttl_override ORDER BY id; + +DROP TABLE t_ttl_override; + +--echo # +--echo # ============================================ +--echo # TEST 4: TTL=0 means no expiration (default) +--echo # ============================================ +--echo # + +CREATE TABLE t_ttl_none ( + id INT PRIMARY KEY, + val VARCHAR(50) +) ENGINE=TIDESDB TTL=0; + +INSERT INTO t_ttl_none VALUES (1, 'permanent'); + +--sleep 2 + +--echo # Row should still be present (TTL=0 = no expiration) +SELECT * FROM t_ttl_none ORDER BY id; + +DROP TABLE t_ttl_none; + +--echo # +--echo # ============================================ +--echo # TEST 5: TTL with UPDATE refreshes expiration +--echo # ============================================ +--echo # + +CREATE TABLE t_ttl_update ( + id INT PRIMARY KEY, + val VARCHAR(50), + ttl_s INT `TTL`=1 +) ENGINE=TIDESDB; + +INSERT INTO t_ttl_update VALUES (1, 'original', 8); + +--echo # Row visible immediately +SELECT id, val FROM t_ttl_update ORDER BY id; + +--sleep 1 + +--echo # UPDATE resets TTL to 5 more seconds +UPDATE t_ttl_update SET val = 'refreshed', ttl_s = 30 WHERE id = 1; + +--sleep 2 + +--echo # Row should still be alive (UPDATE refreshed TTL at ~1s, now at ~3s, TTL=5s) +SELECT id, val FROM t_ttl_update ORDER BY id; + +DROP TABLE t_ttl_update; + +--echo # +--echo # ============================================ +--echo # TEST 6: SHOW CREATE TABLE shows TTL options +--echo # ============================================ +--echo # + +CREATE TABLE t_ttl_show ( + id INT PRIMARY KEY, + val VARCHAR(50), + row_ttl INT `TTL`=1 +) ENGINE=TIDESDB TTL=3600; +
+SHOW CREATE TABLE t_ttl_show; + +DROP TABLE t_ttl_show; + +--echo # +--echo # ============================================ +--echo # TEST 7: Session TTL override (SET SESSION) +--echo # Table has no TTL; session variable applies +--echo # ============================================ +--echo # + +CREATE TABLE t_ttl_sess ( + id INT PRIMARY KEY, + val VARCHAR(50) +) ENGINE=TIDESDB; + +--echo # Default session TTL is 0 (no override) +SELECT @@session.tidesdb_ttl; + +SET SESSION tidesdb_ttl = 8; + +INSERT INTO t_ttl_sess VALUES (1, 'session_ttl'), (2, 'also_session'); + +--echo # Rows visible immediately +SELECT * FROM t_ttl_sess ORDER BY id; + +SET SESSION tidesdb_ttl = 0; + +--echo # Wait for session TTL to expire (3s > 2s) +--sleep 10 + +--echo # Rows should now be expired +SELECT * FROM t_ttl_sess ORDER BY id; + +DROP TABLE t_ttl_sess; + +--echo # +--echo # ============================================ +--echo # TEST 8: SET STATEMENT tidesdb_ttl=N FOR ... +--echo # Only the single statement gets TTL +--echo # ============================================ +--echo # + +CREATE TABLE t_ttl_stmt ( + id INT PRIMARY KEY, + val VARCHAR(50) +) ENGINE=TIDESDB; + +# Row 1: inserted with 8-second TTL via SET STATEMENT +SET STATEMENT tidesdb_ttl=8 FOR + INSERT INTO t_ttl_stmt VALUES (1, 'short_lived'); + +# Row 2: inserted with default (no TTL) +INSERT INTO t_ttl_stmt VALUES (2, 'permanent'); + +--echo # Both rows visible immediately +SELECT * FROM t_ttl_stmt ORDER BY id; + +--sleep 10 + +--echo # Row 1 expired (session TTL=2); row 2 still alive (no TTL) +SELECT * FROM t_ttl_stmt ORDER BY id; + +DROP TABLE t_ttl_stmt; + +--echo # +--echo # ============================================ +--echo # TEST 9: Session TTL does NOT override per-row TTL_COL +--echo # ============================================ +--echo # + +CREATE TABLE t_ttl_priority ( + id INT PRIMARY KEY, + val VARCHAR(50), + row_ttl INT `TTL`=1 +) ENGINE=TIDESDB; + +SET SESSION tidesdb_ttl = 86400; + +# Row 1: per-row 
TTL=8 should win over session TTL=86400 +INSERT INTO t_ttl_priority VALUES (1, 'per_row_wins', 8); +# Row 2: per-row TTL=0 falls back to session TTL=86400 +INSERT INTO t_ttl_priority VALUES (2, 'uses_session', 0); + +SET SESSION tidesdb_ttl = 0; + +--echo # Both visible immediately +SELECT id, val FROM t_ttl_priority ORDER BY id; + +--sleep 10 + +--echo # Row 1 expired (per-row TTL=2 wins); row 2 still alive (session TTL=86400) +SELECT id, val FROM t_ttl_priority ORDER BY id; + +DROP TABLE t_ttl_priority; + +--echo # +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_unified_memtable.test b/mysql-test/suite/tidesdb/t/tidesdb_unified_memtable.test new file mode 100644 index 0000000000000..79c2ed6215f49 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_unified_memtable.test @@ -0,0 +1,101 @@ +--source include/have_tidesdb.inc +# +# Test: Unified memtable mode behavior +# This test runs with the server's default unified_memtable=ON. +# It exercises operations that stress the shared WAL and memtable: +# concurrent table access, cross-CF consistency, and flush behavior. 
+# + +--echo # +--echo # TEST 1: Verify unified memtable is ON +--echo # + +SELECT @@tidesdb_unified_memtable AS unified; + +--echo # +--echo # TEST 2: Multiple tables sharing the unified memtable +--echo # + +CREATE TABLE t_um1 (id INT PRIMARY KEY, v VARCHAR(100)) ENGINE=TidesDB; +CREATE TABLE t_um2 (id INT PRIMARY KEY, v VARCHAR(100)) ENGINE=TidesDB; +CREATE TABLE t_um3 (id INT PRIMARY KEY, v VARCHAR(100)) ENGINE=TidesDB; + +BEGIN; +INSERT INTO t_um1 VALUES (1, 'table1_row1'); +INSERT INTO t_um2 VALUES (1, 'table2_row1'); +INSERT INTO t_um3 VALUES (1, 'table3_row1'); +COMMIT; + +SELECT * FROM t_um1; +SELECT * FROM t_um2; +SELECT * FROM t_um3; + +--echo # +--echo # TEST 3: Cross-table transaction atomicity +--echo # + +BEGIN; +INSERT INTO t_um1 VALUES (2, 'committed'); +INSERT INTO t_um2 VALUES (2, 'committed'); +INSERT INTO t_um3 VALUES (2, 'committed'); +COMMIT; + +BEGIN; +INSERT INTO t_um1 VALUES (3, 'rolled_back'); +INSERT INTO t_um2 VALUES (3, 'rolled_back'); +ROLLBACK; + +SELECT COUNT(*) AS t1_rows FROM t_um1; +SELECT COUNT(*) AS t2_rows FROM t_um2; +SELECT COUNT(*) AS t3_rows FROM t_um3; + +--echo # +--echo # TEST 4: Bulk write across tables (stresses unified WAL) +--echo # + +--disable_query_log +let $i = 10; +while ($i <= 50) +{ + eval INSERT INTO t_um1 VALUES ($i, REPEAT('A', 50)); + eval INSERT INTO t_um2 VALUES ($i, REPEAT('B', 50)); + inc $i; +} +--enable_query_log + +SELECT COUNT(*) AS t1_total FROM t_um1; +SELECT COUNT(*) AS t2_total FROM t_um2; + +--echo # +--echo # TEST 5: OPTIMIZE TABLE with unified memtable +--echo # + +OPTIMIZE TABLE t_um1; +OPTIMIZE TABLE t_um2; + +SELECT COUNT(*) AS after_optimize FROM t_um1; + +--echo # +--echo # TEST 6: Secondary indexes across multiple CFs in unified mode +--echo # + +CREATE TABLE t_um_idx ( + id INT PRIMARY KEY, + a INT, + b INT, + KEY(a), + KEY(b) +) ENGINE=TidesDB; + +INSERT INTO t_um_idx VALUES (1, 10, 100), (2, 20, 200), (3, 10, 300); +SELECT id FROM t_um_idx WHERE a = 10 ORDER BY id; +SELECT id FROM 
t_um_idx WHERE b = 200; + +--echo # +--echo # Cleanup +--echo # + +DROP TABLE t_um1, t_um2, t_um3, t_um_idx; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_update_unique.test b/mysql-test/suite/tidesdb/t/tidesdb_update_unique.test new file mode 100644 index 0000000000000..e819489f11a18 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_update_unique.test @@ -0,0 +1,52 @@ +--source include/have_tidesdb.inc +# +# UPDATE must enforce PRIMARY KEY and UNIQUE secondary-index uniqueness. +# A TidesDB put overwrites silently, so without an explicit pre-check an +# UPDATE that moves a row onto an existing key would destroy the colliding +# row (primary key) or create a duplicate (unique secondary index). +# + +--echo # --- PRIMARY KEY collision --- +CREATE TABLE t1 (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +INSERT INTO t1 VALUES (1,10),(2,20); +--error ER_DUP_ENTRY +UPDATE t1 SET id=2 WHERE id=1; +--echo # Both rows must survive the rejected UPDATE +SELECT * FROM t1 ORDER BY id; +--echo # A non-colliding move still succeeds +UPDATE t1 SET id=3 WHERE id=1; +SELECT * FROM t1 ORDER BY id; +DROP TABLE t1; + +--echo # --- UNIQUE secondary collision --- +CREATE TABLE t2 (id INT PRIMARY KEY, e VARCHAR(20), v INT, UNIQUE KEY(e)) ENGINE=TidesDB; +INSERT INTO t2 VALUES (1,'a',10),(2,'b',20); +--error ER_DUP_ENTRY +UPDATE t2 SET e='b' WHERE id=1; +--echo # No duplicate 'b' may exist after the rejected UPDATE +SELECT * FROM t2 ORDER BY id; +--echo # Updating the unique column to a fresh value succeeds +UPDATE t2 SET e='c' WHERE id=1; +SELECT * FROM t2 ORDER BY id; +--echo # Updating a non-indexed column leaves the unique value in place +UPDATE t2 SET v=99 WHERE id=1; +SELECT * FROM t2 ORDER BY id; +DROP TABLE t2; + +--echo # --- changing only the PK keeps a stable unique value valid --- +CREATE TABLE t3 (id INT PRIMARY KEY, e VARCHAR(20), UNIQUE KEY(e)) ENGINE=TidesDB; +INSERT INTO t3 VALUES (1,'x'),(2,'y'); 
+--echo # moving id 1 to 3 keeps e='x' unique to that row, must succeed +UPDATE t3 SET id=3 WHERE id=1; +SELECT * FROM t3 ORDER BY id; +DROP TABLE t3; + +--echo # --- tidesdb_skip_unique_check bypasses enforcement by contract --- +CREATE TABLE t4 (id INT PRIMARY KEY, v INT) ENGINE=TidesDB; +INSERT INTO t4 VALUES (1,10),(2,20); +SET SESSION tidesdb_skip_unique_check=1; +UPDATE t4 SET id=2 WHERE id=1; +SET SESSION tidesdb_skip_unique_check=DEFAULT; +DROP TABLE t4; + +--source suite/tidesdb/include/cleanup_tidesdb.inc diff --git a/mysql-test/suite/tidesdb/t/tidesdb_vcol.test b/mysql-test/suite/tidesdb/t/tidesdb_vcol.test new file mode 100644 index 0000000000000..9834fabdd0b32 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_vcol.test @@ -0,0 +1,197 @@ +--source include/have_tidesdb.inc +--source include/not_embedded.inc + + +--echo # +--echo # ============================================ +--echo # TEST 1: VIRTUAL generated column +--echo # ============================================ +--echo # + +CREATE TABLE t_vcol ( + id INT PRIMARY KEY, + price DECIMAL(10,2), + qty INT, + total DECIMAL(10,2) AS (price * qty) VIRTUAL +) ENGINE=TIDESDB; + +INSERT INTO t_vcol (id, price, qty) VALUES (1, 10.50, 3); +INSERT INTO t_vcol (id, price, qty) VALUES (2, 25.00, 2); +INSERT INTO t_vcol (id, price, qty) VALUES (3, 5.75, 10); + +--echo # Virtual column 'total' should be computed on read +SELECT * FROM t_vcol ORDER BY id; + +--echo # Update base column and verify virtual column recalculates +UPDATE t_vcol SET qty = 5 WHERE id = 1; +SELECT id, price, qty, total FROM t_vcol WHERE id = 1; + +DROP TABLE t_vcol; + +--echo # +--echo # ============================================ +--echo # TEST 2: STORED (PERSISTENT) generated column +--echo # ============================================ +--echo # + +CREATE TABLE t_scol ( + id INT PRIMARY KEY, + first_name VARCHAR(50), + last_name VARCHAR(50), + full_name VARCHAR(101) AS (CONCAT(first_name, ' ', last_name)) PERSISTENT +) 
ENGINE=TIDESDB; + +INSERT INTO t_scol (id, first_name, last_name) VALUES (1, 'John', 'Doe'); +INSERT INTO t_scol (id, first_name, last_name) VALUES (2, 'Jane', 'Smith'); + +SELECT * FROM t_scol ORDER BY id; + +--echo # Update base column and verify stored column updates +UPDATE t_scol SET last_name = 'Johnson' WHERE id = 1; +SELECT * FROM t_scol WHERE id = 1; + +DROP TABLE t_scol; + +--echo # +--echo # ============================================ +--echo # TEST 3: Multiple virtual columns +--echo # ============================================ +--echo # + +CREATE TABLE t_multi_vcol ( + id INT PRIMARY KEY, + radius DOUBLE, + area DOUBLE AS (PI() * radius * radius) VIRTUAL, + circumference DOUBLE AS (2 * PI() * radius) VIRTUAL, + diameter DOUBLE AS (2 * radius) VIRTUAL +) ENGINE=TIDESDB; + +INSERT INTO t_multi_vcol (id, radius) VALUES (1, 5.0); +INSERT INTO t_multi_vcol (id, radius) VALUES (2, 10.0); + +SELECT id, radius, ROUND(area, 2) AS area, ROUND(circumference, 2) AS circ, diameter +FROM t_multi_vcol ORDER BY id; + +DROP TABLE t_multi_vcol; + +--echo # +--echo # ============================================ +--echo # TEST 4: Virtual column with conditional expression +--echo # ============================================ +--echo # + +CREATE TABLE t_vcol_cond ( + id INT PRIMARY KEY, + score INT, + grade VARCHAR(10) AS ( + CASE + WHEN score >= 90 THEN 'A' + WHEN score >= 80 THEN 'B' + WHEN score >= 70 THEN 'C' + WHEN score >= 60 THEN 'D' + ELSE 'F' + END + ) VIRTUAL +) ENGINE=TIDESDB; + +INSERT INTO t_vcol_cond (id, score) VALUES (1, 95), (2, 82), (3, 71), (4, 55); + +SELECT * FROM t_vcol_cond ORDER BY id; + +--echo # Update score and verify grade recalculates +UPDATE t_vcol_cond SET score = 91 WHERE id = 4; +SELECT * FROM t_vcol_cond WHERE id = 4; + +DROP TABLE t_vcol_cond; + +--echo # +--echo # ============================================ +--echo # TEST 5: Mixed virtual and stored columns +--echo # ============================================ +--echo # + +CREATE 
TABLE t_mixed ( + id INT PRIMARY KEY, + a INT, + b INT, + sum_ab INT AS (a + b) PERSISTENT, + product_ab INT AS (a * b) VIRTUAL, + diff_ab INT AS (a - b) VIRTUAL +) ENGINE=TIDESDB; + +INSERT INTO t_mixed (id, a, b) VALUES (1, 10, 3), (2, 7, 4), (3, 15, 8); + +SELECT * FROM t_mixed ORDER BY id; + +UPDATE t_mixed SET a = 20 WHERE id = 2; +SELECT * FROM t_mixed WHERE id = 2; + +DROP TABLE t_mixed; + +--echo # +--echo # ============================================ +--echo # TEST 6: Virtual column with string functions +--echo # ============================================ +--echo # + +CREATE TABLE t_vcol_str ( + id INT PRIMARY KEY, + email VARCHAR(100), + domain VARCHAR(100) AS (SUBSTRING_INDEX(email, '@', -1)) VIRTUAL, + username VARCHAR(100) AS (SUBSTRING_INDEX(email, '@', 1)) VIRTUAL +) ENGINE=TIDESDB; + +INSERT INTO t_vcol_str (id, email) VALUES + (1, 'alice@example.com'), + (2, 'bob@gmail.com'), + (3, 'charlie@company.org'); + +SELECT * FROM t_vcol_str ORDER BY id; + +--echo # Verify WHERE clause on virtual column works +SELECT id, email FROM t_vcol_str WHERE domain = 'gmail.com'; + +DROP TABLE t_vcol_str; + +--echo # +--echo # ============================================ +--echo # TEST 7: Virtual column with DELETE +--echo # ============================================ +--echo # + +CREATE TABLE t_vcol_del ( + id INT PRIMARY KEY, + val INT, + doubled INT AS (val * 2) VIRTUAL +) ENGINE=TIDESDB; + +INSERT INTO t_vcol_del (id, val) VALUES (1, 10), (2, 20), (3, 30); +SELECT * FROM t_vcol_del ORDER BY id; + +DELETE FROM t_vcol_del WHERE id = 2; +SELECT * FROM t_vcol_del ORDER BY id; + +DROP TABLE t_vcol_del; + +--echo # +--echo # ============================================ +--echo # TEST 8: SHOW CREATE TABLE with virtual columns +--echo # ============================================ +--echo # + +CREATE TABLE t_vcol_show ( + id INT PRIMARY KEY, + a INT, + b INT, + v_sum INT AS (a + b) VIRTUAL, + s_prod INT AS (a * b) PERSISTENT +) ENGINE=TIDESDB; + +SHOW CREATE TABLE 
t_vcol_show; + +DROP TABLE t_vcol_show; + +--echo # +--echo # +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/mysql-test/suite/tidesdb/t/tidesdb_vector.test b/mysql-test/suite/tidesdb/t/tidesdb_vector.test new file mode 100644 index 0000000000000..e84f6050900ad --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_vector.test @@ -0,0 +1,104 @@ +--source include/have_tidesdb.inc +--source suite/tidesdb/include/have_tidesdb_vector.inc +# +# Test: Vector search (MHNSW approximate nearest neighbor) +# +# Covers: +# 1. CREATE TABLE with VECTOR index +# 2. INSERT vectors and build MHNSW graph +# 3. ANN search with Euclidean and cosine distance +# 4. UPDATE vector column +# 5. DELETE vector rows +# 6. UPDATE non-vector column +# + +--echo # +--echo # Setup +--echo # + +CREATE TABLE docs ( + id INT NOT NULL PRIMARY KEY, + title VARCHAR(100), + v VECTOR(4) NOT NULL, + VECTOR INDEX (v) +) ENGINE=TidesDB; + +INSERT INTO docs VALUES (1, 'origin-x', Vec_FromText('[1.0, 0.0, 0.0, 0.0]')); +INSERT INTO docs VALUES (2, 'origin-y', Vec_FromText('[0.0, 1.0, 0.0, 0.0]')); +INSERT INTO docs VALUES (3, 'origin-z', Vec_FromText('[0.0, 0.0, 1.0, 0.0]')); +INSERT INTO docs VALUES (4, 'near-x', Vec_FromText('[0.9, 0.1, 0.0, 0.0]')); +INSERT INTO docs VALUES (5, 'center', Vec_FromText('[0.5, 0.5, 0.5, 0.5]')); + +--echo # +--echo # TEST 1: Euclidean ANN search +--echo # + +SELECT id, title FROM docs +ORDER BY VEC_DISTANCE_EUCLIDEAN(v, Vec_FromText('[1.0, 0.0, 0.0, 0.0]')) +LIMIT 3; + +--echo # +--echo # TEST 2: Cosine ANN search +--echo # + +SELECT id, title FROM docs +ORDER BY VEC_DISTANCE_COSINE(v, Vec_FromText('[1.0, 0.0, 0.0, 0.0]')) +LIMIT 3; + +--echo # +--echo # TEST 3: UPDATE vector column +--echo # + +UPDATE docs SET v = Vec_FromText('[0.95, 0.05, 0.0, 0.0]') WHERE id = 4; + +SELECT id, title FROM docs +ORDER BY VEC_DISTANCE_EUCLIDEAN(v, Vec_FromText('[1.0, 0.0, 0.0, 0.0]')) +LIMIT 3; + +--echo # +--echo # TEST 4: DELETE vector row +--echo # + 
+DELETE FROM docs WHERE id = 1; + +SELECT id, title FROM docs +ORDER BY VEC_DISTANCE_EUCLIDEAN(v, Vec_FromText('[1.0, 0.0, 0.0, 0.0]')) +LIMIT 3; + +--echo # +--echo # TEST 5: UPDATE non-vector column +--echo # + +UPDATE docs SET title = 'renamed-near-x' WHERE id = 4; + +SELECT id, title FROM docs +ORDER BY VEC_DISTANCE_EUCLIDEAN(v, Vec_FromText('[1.0, 0.0, 0.0, 0.0]')) +LIMIT 2; + +--echo # +--echo # TEST 6: Different dimensionality +--echo # + +DROP TABLE docs; +CREATE TABLE docs ( + id INT NOT NULL PRIMARY KEY, + v VECTOR(3) NOT NULL, + VECTOR INDEX (v) +) ENGINE=TidesDB; + +INSERT INTO docs VALUES (1, Vec_FromText('[1.0, 0.0, 0.0]')); +INSERT INTO docs VALUES (2, Vec_FromText('[0.0, 1.0, 0.0]')); +INSERT INTO docs VALUES (3, Vec_FromText('[0.0, 0.0, 1.0]')); + +SELECT id FROM docs +ORDER BY VEC_DISTANCE_EUCLIDEAN(v, Vec_FromText('[0.9, 0.1, 0.0]')) +LIMIT 2; + +--echo # +--echo # Cleanup +--echo # + +DROP TABLE docs; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. 
diff --git a/mysql-test/suite/tidesdb/t/tidesdb_write_pressure.opt b/mysql-test/suite/tidesdb/t/tidesdb_write_pressure.opt new file mode 100644 index 0000000000000..f9444912d5f69 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_write_pressure.opt @@ -0,0 +1,2 @@ +--tidesdb-pessimistic-locking=OFF +--tidesdb-unified-memtable-sync-mode=NONE diff --git a/mysql-test/suite/tidesdb/t/tidesdb_write_pressure.test b/mysql-test/suite/tidesdb/t/tidesdb_write_pressure.test new file mode 100644 index 0000000000000..016114de8d392 --- /dev/null +++ b/mysql-test/suite/tidesdb/t/tidesdb_write_pressure.test @@ -0,0 +1,442 @@ +--source include/have_tidesdb.inc +# +# TidesDB write-pressure stress test +# +# Reproduces the oltp_write_only OOM pattern observed at >=16 sysbench threads: +# - Sysbench-like schema with secondary index on k +# - SYNC_MODE='NONE' (no fsync, maximum write throughput) +# - Multiple connections doing concurrent write-only transactions +# - Each txn: 4 UPDATEs + 1 DELETE + 1 INSERT (matches sysbench oltp_write_only) +# - Conflicts expected (ERROR 1180 from optimistic CC) -- exercises retry path +# +# Under ASAN, any per-operation memory leak will be caught by LeakSanitizer +# at server shutdown. Under Valgrind, use --tool=massif for heap profiling. 
+# + +# Suppress expected conflict warnings from concurrent writes +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_CONFLICT"); +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_LOCKED"); +call mtr.add_suppression("\\[TIDESDB\\].*TDB_ERR_MEMORY_LIMIT"); +call mtr.add_suppression("\\[TIDESDB\\].*unexpected TidesDB error"); + +--echo # +--echo # === Setup: sysbench-like schema with SYNC_MODE=NONE === +--echo # + +CREATE TABLE sbtest1 ( + id INT NOT NULL AUTO_INCREMENT, + k INT NOT NULL DEFAULT 0, + c CHAR(120) NOT NULL DEFAULT '', + pad CHAR(60) NOT NULL DEFAULT '', + PRIMARY KEY (id), + KEY k_1 (k) +) ENGINE=TIDESDB SYNC_MODE='NONE'; + +CREATE TABLE sbtest2 ( + id INT NOT NULL AUTO_INCREMENT, + k INT NOT NULL DEFAULT 0, + c CHAR(120) NOT NULL DEFAULT '', + pad CHAR(60) NOT NULL DEFAULT '', + PRIMARY KEY (id), + KEY k_1 (k) +) ENGINE=TIDESDB SYNC_MODE='NONE'; + +--echo # +--echo # === Populate: 5000 rows per table === +--echo # + +--disable_query_log +--disable_result_log + +let $i= 1; +while ($i <= 5000) +{ + eval INSERT INTO sbtest1 (k, c, pad) VALUES ( + FLOOR(RAND() * 100000), + REPEAT('a', 120), + REPEAT('b', 60) + ); + eval INSERT INTO sbtest2 (k, c, pad) VALUES ( + FLOOR(RAND() * 100000), + REPEAT('a', 120), + REPEAT('b', 60) + ); + inc $i; +} + +--enable_result_log +--enable_query_log + +SELECT COUNT(*) AS sbtest1_rows FROM sbtest1; +SELECT COUNT(*) AS sbtest2_rows FROM sbtest2; + +--echo # +--echo # ============================================ +--echo # TEST 1: Single-connection write-only storm +--echo # 1000 write-only transactions on one connection. +--echo # Exercises rapid txn_begin/commit/free cycling. 
+--echo # ============================================ +--echo # + +--disable_query_log +--disable_result_log + +let $txn= 1; +while ($txn <= 1000) +{ + BEGIN; + # 4 UPDATEs (2 on indexed col k, 2 on non-indexed col c) + eval UPDATE sbtest1 SET k = k + 1 WHERE id = 1 + ($txn % 5000); + eval UPDATE sbtest1 SET c = REPEAT(CHAR(65 + ($txn % 26)), 120) WHERE id = 1 + (($txn + 1000) % 5000); + eval UPDATE sbtest1 SET k = k + 1 WHERE id = 1 + (($txn + 2000) % 5000); + eval UPDATE sbtest1 SET c = REPEAT(CHAR(65 + ($txn % 26)), 120) WHERE id = 1 + (($txn + 3000) % 5000); + # 1 DELETE + 1 INSERT (net zero row count change) + eval DELETE FROM sbtest1 WHERE id = 1 + (($txn + 4000) % 5000); + eval INSERT INTO sbtest1 (k, c, pad) VALUES (FLOOR(RAND() * 100000), REPEAT('x', 120), REPEAT('y', 60)); + COMMIT; + inc $txn; +} + +--enable_result_log +--enable_query_log + +SELECT COUNT(*) AS after_single FROM sbtest1; + +--echo # +--echo # ============================================ +--echo # TEST 2: Concurrent write-only storm (4 connections) +--echo # Each connection runs 500 write-only transactions +--echo # hitting both tables. Conflicts are expected. 
+--echo # ============================================ +--echo # + +connect (wr1, localhost, root,,); +connect (wr2, localhost, root,,); +connect (wr3, localhost, root,,); +connect (wr4, localhost, root,,); + +--disable_query_log +--disable_result_log + +# ---- Connection wr1: writes to sbtest1 ---- +connection wr1; +delimiter |; +send + SET @i = 1; + WHILE @i <= 500 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + START TRANSACTION; + UPDATE sbtest1 SET k = k + 1 WHERE id = 1 + (@i % 5000); + UPDATE sbtest1 SET c = REPEAT('A', 120) WHERE id = 1 + ((@i + 500) % 5000); + UPDATE sbtest1 SET k = k - 1 WHERE id = 1 + ((@i + 1000) % 5000); + UPDATE sbtest1 SET c = REPEAT('B', 120) WHERE id = 1 + ((@i + 1500) % 5000); + DELETE FROM sbtest1 WHERE id = 1 + ((@i + 2000) % 5000); + INSERT INTO sbtest1 (k, c, pad) VALUES (FLOOR(RAND()*100000), REPEAT('w',120), REPEAT('z',60)); + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +# ---- Connection wr2: writes to sbtest1 (overlapping with wr1 -> conflicts) ---- +connection wr2; +delimiter |; +send + SET @i = 1; + WHILE @i <= 500 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + START TRANSACTION; + UPDATE sbtest1 SET k = k + 1 WHERE id = 1 + ((@i + 250) % 5000); + UPDATE sbtest1 SET c = REPEAT('C', 120) WHERE id = 1 + ((@i + 750) % 5000); + UPDATE sbtest1 SET k = k - 1 WHERE id = 1 + ((@i + 1250) % 5000); + UPDATE sbtest1 SET c = REPEAT('D', 120) WHERE id = 1 + ((@i + 1750) % 5000); + DELETE FROM sbtest1 WHERE id = 1 + ((@i + 2250) % 5000); + INSERT INTO sbtest1 (k, c, pad) VALUES (FLOOR(RAND()*100000), REPEAT('w',120), REPEAT('z',60)); + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +# ---- Connection wr3: writes to sbtest2 ---- +connection wr3; +delimiter |; +send + SET @i = 1; + WHILE @i <= 500 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + START TRANSACTION; + UPDATE sbtest2 SET k 
= k + 1 WHERE id = 1 + (@i % 5000); + UPDATE sbtest2 SET c = REPEAT('E', 120) WHERE id = 1 + ((@i + 500) % 5000); + UPDATE sbtest2 SET k = k - 1 WHERE id = 1 + ((@i + 1000) % 5000); + UPDATE sbtest2 SET c = REPEAT('F', 120) WHERE id = 1 + ((@i + 1500) % 5000); + DELETE FROM sbtest2 WHERE id = 1 + ((@i + 2000) % 5000); + INSERT INTO sbtest2 (k, c, pad) VALUES (FLOOR(RAND()*100000), REPEAT('w',120), REPEAT('z',60)); + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +# ---- Connection wr4: writes to sbtest2 (overlapping with wr3 -> conflicts) ---- +connection wr4; +delimiter |; +send + SET @i = 1; + WHILE @i <= 500 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + START TRANSACTION; + UPDATE sbtest2 SET k = k + 1 WHERE id = 1 + ((@i + 250) % 5000); + UPDATE sbtest2 SET c = REPEAT('G', 120) WHERE id = 1 + ((@i + 750) % 5000); + UPDATE sbtest2 SET k = k - 1 WHERE id = 1 + ((@i + 1250) % 5000); + UPDATE sbtest2 SET c = REPEAT('H', 120) WHERE id = 1 + ((@i + 1750) % 5000); + DELETE FROM sbtest2 WHERE id = 1 + ((@i + 2250) % 5000); + INSERT INTO sbtest2 (k, c, pad) VALUES (FLOOR(RAND()*100000), REPEAT('w',120), REPEAT('z',60)); + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +# ---- Reap all ---- +connection wr1; +reap; + +connection wr2; +reap; + +connection wr3; +reap; + +connection wr4; +reap; + +--enable_result_log +--enable_query_log + +connection default; + +--echo # +--echo # === Verify data integrity after concurrent writes === +--echo # + +# Row counts are non-deterministic due to conflicts; just verify +# PK scan == index scan (data/index consistency) and no crash. 
+let $pk1 = `SELECT COUNT(*) FROM sbtest1`; +let $pk2 = `SELECT COUNT(*) FROM sbtest2`; +let $idx1 = `SELECT COUNT(*) FROM sbtest1 WHERE k >= 0 OR k < 0`; +let $idx2 = `SELECT COUNT(*) FROM sbtest2 WHERE k >= 0 OR k < 0`; + +--disable_query_log +if ($pk1 != $idx1) +{ + --echo FAIL: sbtest1 PK count ($pk1) != index count ($idx1) +} +if ($pk2 != $idx2) +{ + --echo FAIL: sbtest2 PK count ($pk2) != index count ($idx2) +} +--enable_query_log +--echo PK/index consistency: OK + +--echo # +--echo # ============================================ +--echo # TEST 3: Rapid txn churn (commit + immediate new txn) +--echo # 1000 tiny autocommit writes per connection x 4 connections +--echo # Tests rapid txn_begin/txn_free cycling without BEGIN/COMMIT +--echo # ============================================ +--echo # + +--disable_query_log +--disable_result_log + +connection wr1; +delimiter |; +send + SET @i = 1; + WHILE @i <= 1000 DO + UPDATE sbtest1 SET k = k + 1 WHERE id = 1 + (@i % 5000); + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +connection wr2; +delimiter |; +send + SET @i = 1; + WHILE @i <= 1000 DO + UPDATE sbtest1 SET k = k - 1 WHERE id = 1 + ((@i + 500) % 5000); + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +connection wr3; +delimiter |; +send + SET @i = 1; + WHILE @i <= 1000 DO + INSERT INTO sbtest1 (k, c, pad) VALUES (FLOOR(RAND()*100000), REPEAT('q',120), REPEAT('r',60)); + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +connection wr4; +delimiter |; +send + SET @i = 1; + WHILE @i <= 1000 DO + INSERT INTO sbtest2 (k, c, pad) VALUES (FLOOR(RAND()*100000), REPEAT('q',120), REPEAT('r',60)); + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +connection wr1; +reap; +connection wr2; +reap; +connection wr3; +reap; +connection wr4; +reap; + +--enable_result_log +--enable_query_log + +--echo # +--echo # ============================================ +--echo # TEST 4: Conflict storm -- all 4 connections hit same rows +--echo # Maximizes TDB_ERR_CONFLICT / ERROR 1180 
rate. +--echo # Exercises the failed-commit -> txn_free -> new txn_begin path. +--echo # ============================================ +--echo # + +--disable_query_log +--disable_result_log + +connection wr1; +delimiter |; +send + SET @i = 1; + WHILE @i <= 500 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + START TRANSACTION; + UPDATE sbtest1 SET k = @i WHERE id = 1; + UPDATE sbtest1 SET k = @i WHERE id = 2; + UPDATE sbtest1 SET k = @i WHERE id = 3; + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +connection wr2; +delimiter |; +send + SET @i = 1; + WHILE @i <= 500 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + START TRANSACTION; + UPDATE sbtest1 SET k = @i + 10000 WHERE id = 1; + UPDATE sbtest1 SET k = @i + 10000 WHERE id = 2; + UPDATE sbtest1 SET k = @i + 10000 WHERE id = 3; + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +connection wr3; +delimiter |; +send + SET @i = 1; + WHILE @i <= 500 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + START TRANSACTION; + UPDATE sbtest1 SET k = @i + 20000 WHERE id = 1; + UPDATE sbtest1 SET k = @i + 20000 WHERE id = 2; + UPDATE sbtest1 SET k = @i + 20000 WHERE id = 3; + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +connection wr4; +delimiter |; +send + SET @i = 1; + WHILE @i <= 500 DO + BEGIN NOT ATOMIC + DECLARE CONTINUE HANDLER FOR 1180, 1213, 1205 + BEGIN END; + START TRANSACTION; + UPDATE sbtest1 SET k = @i + 30000 WHERE id = 1; + UPDATE sbtest1 SET k = @i + 30000 WHERE id = 2; + UPDATE sbtest1 SET k = @i + 30000 WHERE id = 3; + COMMIT; + END; + SET @i = @i + 1; + END WHILE; +| +delimiter ;| + +connection wr1; +reap; +connection wr2; +reap; +connection wr3; +reap; +connection wr4; +reap; + +--enable_result_log +--enable_query_log + +connection default; + +# Rows 1-3 may have been deleted by concurrent DELETEs in earlier tests; +# just verify we can query 
without error (no crash/corruption). +--disable_result_log +SELECT COUNT(*) FROM sbtest1 WHERE id IN (1, 2, 3); +--enable_result_log +--echo Conflict storm: OK + +--echo # +--echo # === Cleanup === +--echo # + +disconnect wr1; +disconnect wr2; +disconnect wr3; +disconnect wr4; + +DROP TABLE sbtest1; +DROP TABLE sbtest2; + +--source suite/tidesdb/include/cleanup_tidesdb.inc +--echo # Done. diff --git a/storage/tidesdb/CMakeLists.txt b/storage/tidesdb/CMakeLists.txt new file mode 100644 index 0000000000000..c6d0c846940b2 --- /dev/null +++ b/storage/tidesdb/CMakeLists.txt @@ -0,0 +1,177 @@ +# Copyright (c) 2026 TidesDB Corp. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +# TidesDB storage engine. +# +# The libtidesdb C library is vendored under libtidesdb/ and compiled into a +# static archive (tidesdb_embedded) that is statically linked into the plugin +# module. This mirrors how storage/rocksdb builds rocksdblib: a self-contained +# build with no dependency on a system-installed libtidesdb, so a normal +# MariaDB build produces a working engine and stale /usr/local copies cannot +# shadow it. + +# +# Do not route compression through MariaDB's loadable provider services. 
+# +# MariaDB injects include/providers/ globally (CMakeLists.txt), whose lz4.h / +# snappy-c.h shims rewrite the C compression calls to go through provider +# plugins that must be separately installed. libtidesdb expects to call the +# real zstd/lz4/snappy directly, so drop that include dir for this engine and +# link the system libraries instead (mirrors storage/rocksdb). +# +GET_PROPERTY(dirs DIRECTORY PROPERTY INCLUDE_DIRECTORIES) +LIST(REMOVE_ITEM dirs ${CMAKE_SOURCE_DIR}/include/providers) +SET_PROPERTY(DIRECTORY PROPERTY INCLUDE_DIRECTORIES "${dirs}") + +# +# Mandatory compression libraries. +# +# libtidesdb's compress.h unconditionally includes <zstd.h>, <lz4.h> and +# <snappy-c.h>, so all three are required to build the engine at all. Resolve +# full paths with FIND_LIBRARY rather than pkg-config: pkg-config returns bare +# names like "zstd" which collide with MariaDB's own "zstd" MODULE target +# (the provider_zstd plugin). +# +FIND_LIBRARY(TIDESDB_ZSTD_LIB NAMES zstd) +FIND_LIBRARY(TIDESDB_LZ4_LIB NAMES lz4) +FIND_LIBRARY(TIDESDB_SNAPPY_LIB NAMES snappy) +FIND_PATH(TIDESDB_ZSTD_INC NAMES zstd.h) +FIND_PATH(TIDESDB_LZ4_INC NAMES lz4.h) +FIND_PATH(TIDESDB_SNAPPY_INC NAMES snappy-c.h) + +IF(NOT TIDESDB_ZSTD_LIB OR NOT TIDESDB_LZ4_LIB OR NOT TIDESDB_SNAPPY_LIB OR + NOT TIDESDB_ZSTD_INC OR NOT TIDESDB_LZ4_INC OR NOT TIDESDB_SNAPPY_INC) + MESSAGE(STATUS "TidesDB: zstd/lz4/snappy development libraries not all found " + "- skipping the TidesDB storage engine. Install libzstd-dev, " + "liblz4-dev and libsnappy-dev to enable it.") + RETURN() +ENDIF() + +# +# Optional S3-compatible object store connector (libcurl + OpenSSL). +# On by default; degrades gracefully to a non-S3 build if the deps are absent. 
+# +OPTION(TIDESDB_WITH_S3 "Build the TidesDB S3-compatible object store connector" ON) +SET(TIDESDB_S3_LIBS) +IF(TIDESDB_WITH_S3) + FIND_PACKAGE(CURL) + FIND_PACKAGE(OpenSSL) + IF(CURL_FOUND AND OPENSSL_FOUND) + SET(TIDESDB_S3_LIBS CURL::libcurl OpenSSL::SSL OpenSSL::Crypto) + MESSAGE(STATUS "TidesDB: S3 object store connector enabled") + ELSE() + MESSAGE(STATUS "TidesDB: libcurl/OpenSSL not found - building without S3 connector") + SET(TIDESDB_WITH_S3 OFF) + ENDIF() +ENDIF() + +# +# The plugin module. Created first so that, if the engine is not requested +# (-DPLUGIN_TIDESDB=NO), we RETURN before building the vendored library. +# +MYSQL_ADD_PLUGIN(tidesdb ha_tidesdb.cc + STORAGE_ENGINE MODULE_ONLY + COMPONENT tidesdb-engine) + +IF(NOT TARGET tidesdb) + RETURN() +ENDIF() + +# +# Vendored libtidesdb -> static archive linked into the plugin. +# +# The vendored tree keeps libtidesdb's upstream src/ + external/ layout so that +# its internal relative includes (e.g. src/clock_cache.c -> "../external/xxhash.h") +# resolve unchanged. +SET(TIDESDB_LIB_DIR ${CMAKE_CURRENT_SOURCE_DIR}/libtidesdb) + +SET(TIDESDB_CORE_SOURCES + ${TIDESDB_LIB_DIR}/src/tidesdb.c + ${TIDESDB_LIB_DIR}/src/block_manager.c + ${TIDESDB_LIB_DIR}/src/skip_list.c + ${TIDESDB_LIB_DIR}/src/compress.c + ${TIDESDB_LIB_DIR}/src/bloom_filter.c + ${TIDESDB_LIB_DIR}/src/manifest.c + ${TIDESDB_LIB_DIR}/src/clock_cache.c + ${TIDESDB_LIB_DIR}/src/queue.c + ${TIDESDB_LIB_DIR}/src/btree.c + ${TIDESDB_LIB_DIR}/src/alloc.c + ${TIDESDB_LIB_DIR}/src/objstore_fs.c + ${TIDESDB_LIB_DIR}/src/local_cache.c + ${TIDESDB_LIB_DIR}/external/xxhash.c + ${TIDESDB_LIB_DIR}/external/ini.c +) +IF(TIDESDB_WITH_S3) + LIST(APPEND TIDESDB_CORE_SOURCES ${TIDESDB_LIB_DIR}/src/objstore_s3.c) +ENDIF() + +ADD_LIBRARY(tidesdb_embedded STATIC ${TIDESDB_CORE_SOURCES}) + +# Vendored third-party C: build position-independent (it links into a module), +# as C11, and silence its warnings so they never fail a -Werror/maintainer build. 
+SET_TARGET_PROPERTIES(tidesdb_embedded PROPERTIES
+  C_STANDARD 11
+  POSITION_INDEPENDENT_CODE ON)
+
+TARGET_INCLUDE_DIRECTORIES(tidesdb_embedded PRIVATE
+  ${TIDESDB_LIB_DIR}/src        # internal "tidesdb.h" includes
+  ${TIDESDB_LIB_DIR}/external
+  ${TIDESDB_ZSTD_INC}
+  ${TIDESDB_LZ4_INC}
+  ${TIDESDB_SNAPPY_INC})
+
+TARGET_COMPILE_DEFINITIONS(tidesdb_embedded PRIVATE
+  _GNU_SOURCE
+  $<$<BOOL:${TIDESDB_WITH_S3}>:TIDESDB_WITH_S3>)  # defined only when the S3 connector is built
+
+IF(NOT MSVC)
+  TARGET_COMPILE_OPTIONS(tidesdb_embedded PRIVATE -w)
+ENDIF()
+
+# Propagate the runtime dependencies to whatever links the archive (the plugin).
+IF(HAVE_GCC_C11_ATOMICS_WITH_LIBATOMIC)
+  SET(TIDESDB_ATOMIC_LIBS -latomic)
+ENDIF()
+
+TARGET_LINK_LIBRARIES(tidesdb_embedded PUBLIC
+  ${TIDESDB_ZSTD_LIB}
+  ${TIDESDB_LZ4_LIB}
+  ${TIDESDB_SNAPPY_LIB}
+  ${TIDESDB_S3_LIBS}
+  ${TIDESDB_ATOMIC_LIBS}
+  ${CMAKE_THREAD_LIBS_INIT}
+  ${LIBM})
+
+#
+# Wire the static library into the plugin.
+#
+# ha_tidesdb.cc uses installed-style includes (of the form <tidesdb/NAME.h>)
+# that expect every public header flattened under a single tidesdb/ directory.
+# Assemble that layout in the build tree from the upstream src/ + external/
+# headers, then point the plugin's include path at its parent.
+#
+SET(TIDESDB_PUBLIC_INC ${CMAKE_CURRENT_BINARY_DIR}/include)
+FILE(MAKE_DIRECTORY ${TIDESDB_PUBLIC_INC}/tidesdb)
+FILE(COPY ${TIDESDB_LIB_DIR}/src/ DESTINATION ${TIDESDB_PUBLIC_INC}/tidesdb
+     FILES_MATCHING PATTERN "*.h")
+FILE(COPY ${TIDESDB_LIB_DIR}/external/ DESTINATION ${TIDESDB_PUBLIC_INC}/tidesdb
+     FILES_MATCHING PATTERN "*.h")
+
+TARGET_INCLUDE_DIRECTORIES(tidesdb PRIVATE ${TIDESDB_PUBLIC_INC})
+TARGET_LINK_LIBRARIES(tidesdb tidesdb_embedded)
+
+IF(TIDESDB_WITH_S3)
+  TARGET_COMPILE_DEFINITIONS(tidesdb PRIVATE TIDESDB_WITH_S3)
+ENDIF()
diff --git a/storage/tidesdb/ha_tidesdb.cc b/storage/tidesdb/ha_tidesdb.cc
new file mode 100644
index 0000000000000..e4f8a704866b8
--- /dev/null
+++ b/storage/tidesdb/ha_tidesdb.cc
@@ -0,0 +1,11258 @@
+/*
+   Copyright (c) 2026 TidesDB Corp.
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include "ha_tidesdb.h" + +extern "C" +{ +#define XXH_INLINE_ALL +#include +#ifdef TIDESDB_WITH_S3 + tidesdb_objstore_t *tidesdb_objstore_s3_create(const char *endpoint, const char *bucket, + const char *prefix, const char *access_key, + const char *secret_key, const char *region, + int use_ssl, int use_path_style); +#endif +} + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "key.h" +#include "sql_class.h" +#include "sql_priv.h" + +/* MariaDB 12.3.1 (MDEV-37815) renamed TABLE_SHARE::option_struct to + option_struct_table and introduced handler::option_struct as the preferred + accessor. We keep reading from TABLE_SHARE so the macro works from + create(), inplace alter, and free functions that only have a TABLE*. */ +#if MYSQL_VERSION_ID >= 120301 +#define TDB_TABLE_OPTIONS(tbl) ((tbl)->s->option_struct_table) +#else +#define TDB_TABLE_OPTIONS(tbl) ((tbl)->s->option_struct) +#endif + +/* Forward-declared for tdb_rc_to_ha(); defined with sysvars below */ +static my_bool srv_print_all_conflicts = 0; +static my_bool srv_pessimistic_locking = 1; +static mysql_mutex_t last_conflict_mutex; +/* Buffer for the most recent conflict diagnostic surfaced under + `Last conflict:` in SHOW ENGINE TIDESDB STATUS. 
Sized comfortably above + any expected single-line message; updates are bounded by snprintf with + sizeof() so the constant only appears here. */ +static constexpr size_t LAST_CONFLICT_INFO_LEN = 1024; +static char last_conflict_info[LAST_CONFLICT_INFO_LEN] = ""; + +/* + Map TidesDB library error codes to MariaDB handler error codes. + Transient errors (conflict, lock contention, memory pressure) are mapped + to HA_ERR_LOCK_DEADLOCK so that MariaDB's deadlock-retry logic kicks in + and applications can retry automatically instead of + receiving the opaque HA_ERR_GENERIC / ER_GET_ERRNO 1030. +*/ +static int tdb_rc_to_ha(int rc, const char *ctx) +{ + switch (rc) + { + case TDB_SUCCESS: + return 0; + + /* Transient concurrency errors -- mapped to deadlock so MariaDB + rolls back the transaction and the application can retry. */ + case TDB_ERR_CONFLICT: + if (unlikely(srv_print_all_conflicts)) + { + sql_print_information( + "[TIDESDB] %s: transaction aborted due to write-write " + "conflict (TDB_ERR_CONFLICT)", + ctx); + mysql_mutex_lock(&last_conflict_mutex); + snprintf(last_conflict_info, sizeof(last_conflict_info), "Last conflict: %s at %ld", + ctx, (long)time(NULL)); + mysql_mutex_unlock(&last_conflict_mutex); + } + return HA_ERR_LOCK_DEADLOCK; + + /* Lock wait timeout -- rolls back the current statement only + (not the whole transaction), less disruptive than full deadlock. */ + case TDB_ERR_LOCKED: + return HA_ERR_LOCK_WAIT_TIMEOUT; + + /* Back-pressure signal from the library (memtable / flush queue + / L0 backlog at soft cap). Callers that go through the + tdb_txn_*_blocking wrappers absorb this transparently by + waiting for capacity, so this fall-through path only fires + when the configured wait timeout has been exhausted or no + wrapper is in play -- in either case lock-wait-timeout is the + accurate name (not deadlock; nothing is locked). + + TDB_ERR_BUSY is the same family. 
The library now distinguishes + a soft cap (TDB_ERR_MEMORY_LIMIT) from the case where it has + stalled long enough that its internal no-progress budget was + spent without freeing capacity. Both are transient and the + plugin treats them the same once the in-plugin backoff has + given up. */ + case TDB_ERR_MEMORY_LIMIT: + case TDB_ERR_BUSY: + return HA_ERR_LOCK_WAIT_TIMEOUT; + + /* Hard out-of-memory. Distinct from TDB_ERR_MEMORY_LIMIT above + (a soft back-pressure signal); TDB_ERR_MEMORY means the + allocator itself failed. */ + case TDB_ERR_MEMORY: + sql_print_error("[TIDESDB] %s: TDB_ERR_MEMORY", ctx); + return HA_ERR_OUT_OF_MEM; + + case TDB_ERR_NOT_FOUND: + return HA_ERR_KEY_NOT_FOUND; + + case TDB_ERR_EXISTS: + return HA_ERR_FOUND_DUPP_KEY; + + case TDB_ERR_READONLY: + return HA_ERR_READ_ONLY_TRANSACTION; + + /* I/O and corruption errors -- table needs repair/recovery. + matches InnoDB's mapping of DB_CORRUPTION to HA_ERR_CRASHED. */ + case TDB_ERR_IO: + sql_print_error("[TIDESDB] %s: I/O error (TDB_ERR_IO)", ctx); + return HA_ERR_CRASHED; + + case TDB_ERR_CORRUPTION: + sql_print_error("[TIDESDB] %s: data corruption detected (TDB_ERR_CORRUPTION)", ctx); + return HA_ERR_CRASHED; + + /* Row too large for the configured block/value size. */ + case TDB_ERR_TOO_LARGE: + return HA_ERR_TO_BIG_ROW; + + /* Database handle invalid (closed or never opened). */ + case TDB_ERR_INVALID_DB: + sql_print_error("[TIDESDB] %s: invalid database handle (TDB_ERR_INVALID_DB)", ctx); + return HA_ERR_INTERNAL_ERROR; + + /* Invalid arguments -- programming error in the plugin. */ + case TDB_ERR_INVALID_ARGS: + sql_print_error("[TIDESDB] %s: invalid arguments (TDB_ERR_INVALID_ARGS)", ctx); + return HA_ERR_INTERNAL_ERROR; + + /* Unified-mode commit returns TDB_ERR_UNKNOWN when the active-memtable + try_ref retry budget is exhausted under heavy rotation contention -- + same family as TDB_ERR_CONFLICT from the caller's perspective. 
+ Map to HA_ERR_LOCK_DEADLOCK so MariaDB triggers its deadlock + retry path instead of surfacing an opaque ER_GET_ERRNO 1030. */ + case TDB_ERR_UNKNOWN: + return HA_ERR_LOCK_DEADLOCK; + + default: + sql_print_warning("[TIDESDB] %s: unexpected TidesDB error rc=%d", ctx, rc); + return HA_ERR_GENERIC; + } +} + +/* + Dispatch to tidesdb_txn_single_delete or tidesdb_txn_delete based on + use_single_delete. Secondary-index delete sites pass true because the + single-delete contract (at most one put between single-deletes on the + same key) holds by construction for (col_values, pk) / (term, pk) / + (hilbert, pk) composites. Primary-CF delete sites pass the cached + value of the tidesdb_single_delete_primary session variable, which + defaults off and is the caller's explicit promise that the session + does no UPDATE on non-PK columns and no REPLACE INTO / IODKU overwrite + path on no-secondary tables. +*/ +static inline int tidesdb_txn_delete_cf(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, + const uint8_t *key, size_t key_size, bool use_single_delete) +{ + return use_single_delete ? tidesdb_txn_single_delete(txn, cf, key, key_size) + : tidesdb_txn_delete(txn, cf, key, key_size); +} + +/* ******************** Library back-pressure wait ******************** */ +/* + TDB_ERR_MEMORY_LIMIT is the library's soft back-pressure signal -- the + memtable / flush queue / L0 backlog is at its cap and the writer should + pause until flush+compaction free capacity. Surfacing that to the SQL + layer as HA_ERR_LOCK_DEADLOCK -- as earlier revisions did -- breaks + clients that treat 1213 as fatal and do not retry (bulk loaders, batch + ETL, schema-build scripts), failing entire sessions after long writes + even though nothing is locked and the engine just needs a moment to + drain. + + The put/commit/delete wrappers below sleep with exponential backoff + until the library accepts the operation again, the wait timeout + expires, or the connection is killed. 
After exhaustion the original
+  TDB_ERR_MEMORY_LIMIT bubbles up through tdb_rc_to_ha and maps to
+  HA_ERR_LOCK_WAIT_TIMEOUT, which is the accurate name (no lock is held).
+*/
+static constexpr uint TDB_BACKPRESSURE_BACKOFF_MIN_US = 100;            /* 0.1 ms initial */
+static constexpr uint TDB_BACKPRESSURE_BACKOFF_MAX_US = 50000;          /* 50 ms cap */
+static constexpr uint TDB_BACKPRESSURE_BACKOFF_MULTIPLIER = 2;
+static constexpr ulong TDB_BACKPRESSURE_DEFAULT_TIMEOUT_MS = 60000;     /* 60 s default */
+static constexpr ulong TDB_BACKPRESSURE_MAX_TIMEOUT_MS = 3600000;       /* 1 h max */
+static constexpr ulong TDB_BACKPRESSURE_MIN_TIMEOUT_MS = 0;             /* 0 disables blocking */
+static constexpr uint TDB_BACKPRESSURE_KILL_CHECK_INTERVAL_US = 100000; /* 100 ms */
+
+/* Pessimistic row-lock wait bounds.  Default mirrors innodb_lock_wait_timeout
+   (50 seconds).  0 means wait indefinitely, bounded only by KILL QUERY. */
+static constexpr ulong TDB_LOCK_WAIT_DEFAULT_TIMEOUT_MS = 50000;
+static constexpr ulong TDB_LOCK_WAIT_MIN_TIMEOUT_MS = 0;
+static constexpr ulong TDB_LOCK_WAIT_MAX_TIMEOUT_MS = 3600000;
+static constexpr ulonglong TDB_NS_PER_MS = 1000000ULL;
+static constexpr ulonglong TDB_US_PER_S = 1000000ULL;
+
+/* Stats -- bumped from the wrapper, read by tidesdb_refresh_status_vars. */
+static std::atomic<long long> srv_stat_backpressure_waits{0};
+static std::atomic<long long> srv_stat_backpressure_wait_us{0}; /* summed from a long long of microseconds */
+static std::atomic<long long> srv_stat_lock_waits{0};
+static std::atomic<long long> srv_stat_lock_wait_us{0};
+static std::atomic<long long> srv_stat_lock_deadlocks{0};
+static std::atomic<long long> srv_stat_lock_timeouts{0};
+static std::atomic<long long> srv_stat_lock_held{0};
+static std::atomic<long long> srv_stat_lock_entries{0};
+static std::atomic<long long> srv_stat_lock_entry_recycles{0};
+static std::atomic<long long> srv_stat_lock_chain_max{0};       /* CAS'd against a long long expected value */
+
+static ulong tdb_backpressure_timeout_ms(THD *thd);
+static ulong tdb_lock_wait_timeout_ms(THD *thd);
+
+/*
+  Per-statement back-pressure deadline.  external_lock(F_WRLCK) seeds it to
+  now() + timeout_ms; F_UNLCK clears it.
When valid every backpressure call
+  in the statement charges against the same shared deadline so a 5000-row
+  INSERT with N indexes cannot burn (1+N) * 5000 * timeout_ms of wall-clock.
+  Thread-local because each statement runs on one connection thread.
+*/
+static thread_local std::chrono::steady_clock::time_point tdb_stmt_bp_deadline_{};
+static thread_local bool tdb_stmt_bp_deadline_valid_ = false;
+
+/*
+  Run op() and, if the library reports back-pressure, sleep with exponential
+  backoff and retry until success, timeout exhaustion, or connection kill.
+  The kill-check cadence is bounded so even a long sleep responds promptly
+  to KILL QUERY.  After the deadline the last back-pressure code from op() is
+  returned unmodified so the caller's existing error mapping still applies.
+*/
+template <typename Op>
+static int tdb_with_backpressure_wait(THD *thd, Op &&op)
+{
+    int rc = op();
+    if (likely(rc != TDB_ERR_MEMORY_LIMIT && rc != TDB_ERR_BUSY)) return rc;
+
+    const ulong timeout_ms = tdb_backpressure_timeout_ms(thd);
+    if (timeout_ms == 0) return rc;
+
+    /* Prefer the per-statement deadline when external_lock has seeded it
+       so a multi-call statement (bulk INSERT/UPDATE/DELETE, ALTER) does
+       not multiply the budget by row count. */
+    const auto deadline =
+        tdb_stmt_bp_deadline_valid_
+            ? tdb_stmt_bp_deadline_
+            : (std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms));
+    uint sleep_us = TDB_BACKPRESSURE_BACKOFF_MIN_US;
+    bool counted = false;
+    long long waited_us = 0;
+
+    while (rc == TDB_ERR_MEMORY_LIMIT || rc == TDB_ERR_BUSY)
+    {
+        if (thd && thd_killed(thd)) break;
+
+        auto now = std::chrono::steady_clock::now();
+        if (now >= deadline) break;
+        auto remaining_us =
+            std::chrono::duration_cast<std::chrono::microseconds>(deadline - now).count();
+        uint capped_sleep_us = std::min(sleep_us, TDB_BACKPRESSURE_KILL_CHECK_INTERVAL_US);
+        if ((long long)capped_sleep_us > remaining_us) capped_sleep_us = (uint)remaining_us;
+
+        std::this_thread::sleep_for(std::chrono::microseconds(capped_sleep_us));
+        waited_us += capped_sleep_us;
+        if (!counted)
+        {
+            srv_stat_backpressure_waits.fetch_add(1, std::memory_order_relaxed);
+            counted = true;
+        }
+        sleep_us = std::min(sleep_us * TDB_BACKPRESSURE_BACKOFF_MULTIPLIER,
+                            TDB_BACKPRESSURE_BACKOFF_MAX_US);
+        rc = op();
+    }
+
+    if (waited_us > 0)
+        srv_stat_backpressure_wait_us.fetch_add(waited_us, std::memory_order_relaxed);
+    return rc;
+}
+
+/* Thin wrappers around the three library write entry points that can return
+   TDB_ERR_MEMORY_LIMIT under sustained write load.  Other callers that do
+   not go through these still get the accurate HA_ERR_LOCK_WAIT_TIMEOUT
+   mapping from tdb_rc_to_ha but without the in-plugin block.
*/ +static inline int tdb_txn_put_blocking(THD *thd, tidesdb_txn_t *txn, tidesdb_column_family_t *cf, + const uint8_t *key, size_t key_size, const uint8_t *value, + size_t value_size, time_t ttl) +{ + return tdb_with_backpressure_wait( + thd, [&]() { return tidesdb_txn_put(txn, cf, key, key_size, value, value_size, ttl); }); +} + +static inline int tdb_txn_commit_blocking(THD *thd, tidesdb_txn_t *txn) +{ + return tdb_with_backpressure_wait(thd, [&]() { return tidesdb_txn_commit(txn); }); +} + +static inline int tdb_txn_delete_cf_blocking(THD *thd, tidesdb_txn_t *txn, + tidesdb_column_family_t *cf, const uint8_t *key, + size_t key_size, bool use_single_delete) +{ + return tdb_with_backpressure_wait( + thd, [&]() { return tidesdb_txn_delete_cf(txn, cf, key, key_size, use_single_delete); }); +} + +/* Iterator construction can return TDB_ERR_BUSY when the library's reader fd + soft cap is exhausted, and TDB_ERR_IO when an SSTable open fails after the + budget check passed (EMFILE between the check and the open). The library + documents both as retryable -- the in-line comment at the IO site reads + "let the caller retry once descriptors free." Routing iter_new through + the backpressure helper waits it out instead of immediately surfacing + HA_ERR_LOCK_WAIT_TIMEOUT (BUSY) or HA_ERR_CRASHED (IO). The IO -> BUSY + translation is scoped to this wrapper so other call sites still treat a + real TDB_ERR_IO as a hard fault via tdb_rc_to_ha; the wrapper's existing + tidesdb_backpressure_wait_timeout_ms bound stops a genuine disk failure + from hanging forever. 
*/ +static inline int tdb_iter_new_blocking(THD *thd, tidesdb_txn_t *txn, tidesdb_column_family_t *cf, + tidesdb_iter_t **out) +{ + return tdb_with_backpressure_wait(thd, + [&]() + { + int rc = tidesdb_iter_new(txn, cf, out); + if (rc == TDB_ERR_IO) rc = TDB_ERR_BUSY; + return rc; + }); +} + +/* MariaDB data directory */ +extern MYSQL_PLUGIN_IMPORT char mysql_real_data_home[]; + +/* Global TidesDB database handle */ +static tidesdb_t *tdb_global = NULL; +static std::string tdb_path; + +/* Schema discovery CF for object store mode (NULL when local-only) */ +static tidesdb_column_family_t *schema_cf = NULL; + +static handlerton *tidesdb_hton; + +/* ******************** Plugin-level row lock table ******************** */ +/* + Hash-table-based row-level lock manager with two modes (S, X), wait queue + for fairness, and best-effort deadlock detection. + + Design: + - Hash partitions over XXH3 of the row key, sized at init from hardware + concurrency. Each partition has its own mutex, an active hash chain + of lock entries, and a per-partition freelist of slots whose granted + and waiting lists are both empty. + - Each lock entry has two intrusive lists, both mutex-guarded: + granted_head -- currently-granted requests on this row + waiting_head -- FIFO of requests still waiting + - Each request (tdb_lock_request_t) ties (trx, lock, mode) together and + threads onto trx->held_locks_head (granted) or trx->waiting_on (waiting). + - Compatibility S/S is compatible; S/X and X/X are not. A new S also + blocks when an X is waiting, so writers cannot be starved by a stream + of readers. + - Re-entry on the same lock, if this trx already holds it in a mode + compatible with the request (X subsumes S; S satisfies S), return 0. + Upgrade S->X is allowed only when this trx is the sole granted holder + AND no waiters exist; otherwise we reject as HA_ERR_LOCK_DEADLOCK + rather than introduce a self-deadlock with our own S-grant. 
+ - For deadlock detection, when we wait on a lock, walk every granted holder's + wait-for chain. Loads are atomic, lock entry memory is never my_free'd + during runtime, so a stale read can only produce a false-positive + (caller retries) or a false-negative (caller times out via + lock-wait-timeout) -- never memory corruption. + - Release walks the trx's held_locks_head, unlinks each request from its + lock's granted list, promotes any waiting requests now compatible with + the remaining granted set, broadcasts the lock's cond, and moves the + lock entry onto the partition's freelist if no granted or waiting + requests remain. Slot memory is retained for the deadlock walker but + the entry leaves the hash chain so lookups stay O(active locks per + partition) rather than O(lifetime keys). +*/ + +/* Number of hash partitions for the row lock table. Sized at init from + hardware_concurrency to 8 * cores, clamped to [128, 65536]. The + upper cap stays at the historical value so a huge box still gets + plenty of partitions; the lower bound guarantees decent stripe count + even on single-vCPU containers. Each partition is cache-line + aligned so unrelated stripes do not false-share. */ +static ulong row_lock_partitions = 0; +static constexpr ulong ROW_LOCK_PARTITIONS_MIN = 128; +static constexpr ulong ROW_LOCK_PARTITIONS_MAX = 65536; + +/* Maximum depth for wait-for-graph traversal during deadlock detection. */ +static constexpr int DEADLOCK_MAX_DEPTH = 100; + +/* tdb_lock_mode_t is declared in ha_tidesdb.h so the trx struct can name it. */ + +/* Lock request -- one per (trx, lock, mode) instance. + Lifetime is allocated in row_lock_acquire, freed when the trx releases + the lock (commit/rollback) or when the wait is aborted (deadlock, + timeout, kill). Lives on exactly one of: + - lock->granted_head (after grant) + trx->held_locks_head + - lock->waiting_head (before grant) + trx->waiting_on + list_next chains the per-lock list (granted or waiting). 
+ held_next chains the per-trx held-list. */ +struct tdb_lock_request_t +{ + tidesdb_trx_t *trx; + struct tdb_row_lock_t *lock; + tdb_lock_mode_t mode; + bool granted; + tdb_lock_request_t *list_next; /* in lock->granted_head OR lock->waiting_head */ + tdb_lock_request_t *held_next; /* in trx->held_locks_head (granted requests only) */ +}; + +/* Lock-table entry. Granted and waiting lists are mutex-guarded by the + owning partition's mutex. Lock entry memory is never my_free'd during + runtime (only at plugin deinit), so deadlock walkers can read these + pointers from other partitions without worrying about freed memory. + An entry is either threaded into part->chain (active) or part->freelist + (idle); the hash_next field doubles as the freelist link when idle. */ +struct tdb_row_lock_t +{ + uchar *pk; /* heap-allocated key bytes */ + uint pk_len; /* length of key bytes */ + tdb_lock_request_t *granted_head; /* mutex-guarded */ + tdb_lock_request_t *waiting_head; /* mutex-guarded FIFO head */ + tdb_lock_request_t *waiting_tail; /* mutex-guarded FIFO tail; lets append + skip the O(n) walk to find it */ + mysql_cond_t cond; /* waiters sleep on this */ + tdb_row_lock_t *hash_next; /* mutex-guarded; chain when active, + freelist when idle */ + uint partition; /* which partition (cached for release) */ +}; + +/* Cache-line aligned so unrelated partitions never share a 64 B line and + ping-pong on every acquire. alignas(64) also forces sizeof(struct) to + round up to a 64 B multiple, so the partition array indexes line up + with cache lines. 
*/ +struct alignas(64) tdb_lock_partition_t +{ + mysql_mutex_t mutex; + tdb_row_lock_t *chain; /* head of active hash chain */ + tdb_row_lock_t *freelist; /* head of idle-slot list, reuse before malloc */ +}; + +static tdb_lock_partition_t *lock_partitions = NULL; + +static inline uint tdb_lock_part(const uchar *key, uint len) +{ + uint64_t h = XXH3_64bits(key, len); + return (uint)(h % row_lock_partitions); +} + +/* S/S compatible; everything else conflicts. */ +static inline bool tdb_lock_modes_compatible(tdb_lock_mode_t held, tdb_lock_mode_t want) +{ + return held == TDB_LOCK_MODE_S && want == TDB_LOCK_MODE_S; +} + +/* If the slot has no granted or waiting requests, unlink it from the + partition's active chain and push it onto the freelist. Caller must + hold the partition mutex. Slot memory survives so the deadlock + walker can still safely dereference any cross-partition pointer it + captured before we dropped the cross-partition mutex. */ +static inline void tdb_lock_freelist_if_empty(tdb_lock_partition_t *part, tdb_row_lock_t *lock) +{ + if (lock->granted_head != NULL || lock->waiting_head != NULL) return; + tdb_row_lock_t **cp = &part->chain; + while (*cp && *cp != lock) cp = &(*cp)->hash_next; + if (*cp == lock) + { + *cp = lock->hash_next; + lock->hash_next = part->freelist; + part->freelist = lock; + } +} + +/* Find or create a lock entry in the partition's hash chain. + Caller must hold partition mutex. + + The chain holds only entries with at least one granted or waiting + request, so its length tracks concurrent active locks for this + partition, not lifetime keys. Released slots are unlinked from the + chain and pushed onto part->freelist by row_locks_release_all; we pop + from the freelist before mallocing. Slot memory is retained across + reuse so lock-free deadlock walkers from other partitions can still + safely dereference any tdb_row_lock_t pointer they captured before we + dropped the cross-partition mutex. 
Walkers never dereference an + entry's pk, so a key rewrite during freelist reuse is invisible to + them, and the partition field stays stable because slots are only + ever reused within their original partition. */ +static tdb_row_lock_t *tdb_lock_find_or_create(tdb_lock_partition_t *part, uint part_idx, + const uchar *pk, uint pk_len) +{ + ulong chain_len = 0; + for (tdb_row_lock_t *e = part->chain; e; e = e->hash_next) + { + chain_len++; + if (e->pk_len == pk_len && memcmp(e->pk, pk, pk_len) == 0) + { + long long prev = srv_stat_lock_chain_max.load(std::memory_order_relaxed); + while ((long long)chain_len > prev && + !srv_stat_lock_chain_max.compare_exchange_weak(prev, (long long)chain_len, + std::memory_order_relaxed)) + ; + return e; + } + } + /* Sample chain depth after a miss as well, so a single-row hotspot + behind a long chain still surfaces in status. */ + { + long long prev = srv_stat_lock_chain_max.load(std::memory_order_relaxed); + while ((long long)chain_len > prev && + !srv_stat_lock_chain_max.compare_exchange_weak(prev, (long long)chain_len, + std::memory_order_relaxed)) + ; + } + + if (part->freelist) + { + tdb_row_lock_t *e = part->freelist; + part->freelist = e->hash_next; + uchar *new_pk = (uchar *)my_malloc(PSI_NOT_INSTRUMENTED, pk_len, MYF(0)); + if (!new_pk) + { + /* Put the slot back so the next caller can try again. */ + e->hash_next = part->freelist; + part->freelist = e; + return NULL; + } + memcpy(new_pk, pk, pk_len); + my_free(e->pk); + e->pk = new_pk; + e->pk_len = pk_len; + /* granted_head and waiting_head were NULL when the slot was + freelisted; cond/partition stay across reuse. 
*/ + e->hash_next = part->chain; + part->chain = e; + srv_stat_lock_entry_recycles.fetch_add(1, std::memory_order_relaxed); + return e; + } + + tdb_row_lock_t *e = + (tdb_row_lock_t *)my_malloc(PSI_NOT_INSTRUMENTED, sizeof(tdb_row_lock_t), MYF(MY_ZEROFILL)); + if (!e) return NULL; + e->pk = (uchar *)my_malloc(PSI_NOT_INSTRUMENTED, pk_len, MYF(0)); + if (!e->pk) + { + my_free(e); + return NULL; + } + memcpy(e->pk, pk, pk_len); + e->pk_len = pk_len; + e->granted_head = NULL; + e->waiting_head = NULL; + e->partition = part_idx; + mysql_cond_init(0, &e->cond, NULL); + e->hash_next = part->chain; + part->chain = e; + srv_stat_lock_entries.fetch_add(1, std::memory_order_relaxed); + return e; +} + +static tdb_lock_request_t *tdb_lock_request_alloc(tidesdb_trx_t *trx, tdb_row_lock_t *lock, + tdb_lock_mode_t mode, bool granted) +{ + tdb_lock_request_t *req = (tdb_lock_request_t *)my_malloc( + PSI_NOT_INSTRUMENTED, sizeof(tdb_lock_request_t), MYF(MY_ZEROFILL)); + if (!req) return NULL; + req->trx = trx; + req->lock = lock; + req->mode = mode; + req->granted = granted; + req->list_next = NULL; + req->held_next = NULL; + return req; +} + +/* Find the granted request held by `trx` on `lock`, or NULL. + Caller must hold partition mutex. */ +static tdb_lock_request_t *tdb_lock_find_self_granted(tdb_row_lock_t *lock, tidesdb_trx_t *trx) +{ + for (tdb_lock_request_t *r = lock->granted_head; r; r = r->list_next) + { + if (r->trx == trx) return r; + } + return NULL; +} + +/* Append req to the lock's waiting FIFO. Caller must hold partition mutex. + The lock keeps a tail pointer so the append is O(1) instead of walking + the queue, which under contention could turn appending into O(n^2). 
*/ +static void tdb_lock_waiting_append(tdb_row_lock_t *lock, tdb_lock_request_t *req) +{ + req->list_next = NULL; + if (!lock->waiting_head) + { + lock->waiting_head = req; + lock->waiting_tail = req; + return; + } + lock->waiting_tail->list_next = req; + lock->waiting_tail = req; +} + +/* Remove req from the lock's waiting list (if present). Caller must hold + partition mutex. Safe to call when req is not on the list. */ +static void tdb_lock_waiting_remove(tdb_row_lock_t *lock, tdb_lock_request_t *req) +{ + tdb_lock_request_t **pp = &lock->waiting_head; + tdb_lock_request_t *prev = NULL; + while (*pp && *pp != req) + { + prev = *pp; + pp = &(*pp)->list_next; + } + if (*pp == req) + { + *pp = req->list_next; + if (lock->waiting_tail == req) lock->waiting_tail = prev; + req->list_next = NULL; + } +} + +/* Can a new request of mode `want` be granted given the current granted set? + For S, also blocks if any waiting X exists (writer fairness). + Caller must hold partition mutex. */ +static bool tdb_lock_can_grant(tdb_row_lock_t *lock, tdb_lock_mode_t want, tidesdb_trx_t *self) +{ + for (tdb_lock_request_t *r = lock->granted_head; r; r = r->list_next) + { + if (r->trx == self) continue; /* self never blocks self */ + if (!tdb_lock_modes_compatible(r->mode, want)) return false; + } + if (want == TDB_LOCK_MODE_S) + { + for (tdb_lock_request_t *r = lock->waiting_head; r; r = r->list_next) + { + if (r->trx == self) continue; + if (r->mode == TDB_LOCK_MODE_X) return false; + } + } + return true; +} + +/* Move newly-grantable waiters from waiting_head to granted_head. + Caller must hold partition mutex; caller is responsible for broadcasting + the lock's cond after this returns so promoted waiters wake up and link + themselves into their trx->held_locks_head. 
*/ +static void tdb_lock_promote_waiters(tdb_row_lock_t *lock) +{ + while (lock->waiting_head) + { + tdb_lock_request_t *head = lock->waiting_head; + if (!tdb_lock_can_grant(lock, head->mode, head->trx)) break; + lock->waiting_head = head->list_next; + if (!lock->waiting_head) lock->waiting_tail = NULL; + head->list_next = lock->granted_head; + lock->granted_head = head; + head->granted = true; + } +} + +/* Deadlock detection over the wait-for graph. DFS across every + conflicting holder per hop -- the previous single-hop walker followed + only the first conflicting holder and silently missed cycles that + passed through later holders. Frontier is a small fixed-capacity + stack so we keep the same "no allocation under the lock partition + mutex" property the original walker had. + + Lock entries are never freed at runtime so the pointers stored on the + stack and in `visited` are always safe to follow; trx structs outlive + any held lock so holder->waiting_on_lock dereferences stay valid. + Bounded by DEADLOCK_MAX_DEPTH. When the frontier overflows the cap + we return true: a false-positive triggers a cheap retry, while a + false-negative becomes a 50 s lock-wait-timeout stall. */ +static bool tdb_lock_would_deadlock(tidesdb_trx_t *requestor, tdb_row_lock_t *target_lock, + tdb_lock_mode_t want_mode) +{ + struct frame_t + { + tdb_row_lock_t *lock; + tdb_lock_mode_t mode; + }; + /* Cap the frontier at DEADLOCK_MAX_DEPTH*4 so that even highly-fanned + wait-for graphs fit without allocation; pop-order is LIFO so we + still bound the longest single path at DEADLOCK_MAX_DEPTH. */ + constexpr int FRONTIER_CAP = DEADLOCK_MAX_DEPTH * 4; + frame_t frontier[FRONTIER_CAP]; + int top = 0; + frontier[top++] = {target_lock, want_mode}; + + /* Visited set, also fixed capacity. Same overflow contract -- + hitting it means "give up and call it a deadlock". 
*/ + tdb_row_lock_t *visited[FRONTIER_CAP]; + int visited_count = 0; + + int hops = 0; + while (top > 0) + { + if (++hops > DEADLOCK_MAX_DEPTH) return true; + + frame_t f = frontier[--top]; + tdb_row_lock_t *cur_lock = f.lock; + tdb_lock_mode_t cur_mode = f.mode; + + /* Skip locks we've already inspected at this or higher fanout. */ + bool already = false; + for (int i = 0; i < visited_count; i++) + if (visited[i] == cur_lock) + { + already = true; + break; + } + if (already) continue; + if (visited_count >= FRONTIER_CAP) return true; + visited[visited_count++] = cur_lock; + + tdb_lock_partition_t *part = &lock_partitions[cur_lock->partition]; + + mysql_mutex_lock(&part->mutex); + for (tdb_lock_request_t *h = cur_lock->granted_head; h; h = h->list_next) + { + tidesdb_trx_t *holder = h->trx; + if (!holder) continue; + if (tdb_lock_modes_compatible(h->mode, cur_mode)) continue; + if (holder == requestor) + { + mysql_mutex_unlock(&part->mutex); + return true; + } + tdb_row_lock_t *next_lock = holder->waiting_on_lock.load(std::memory_order_acquire); + if (!next_lock) continue; + if (top >= FRONTIER_CAP) + { + mysql_mutex_unlock(&part->mutex); + return true; + } + frontier[top++] = {next_lock, holder->waiting_on_mode}; + } + mysql_mutex_unlock(&part->mutex); + } + return false; +} + +/* + Acquire a row lock in the given mode. Returns 0 on success, an + HA_ERR_* code on failure. Re-entrant for same/weaker mode; rejects + S->X upgrades that would self-deadlock as HA_ERR_LOCK_DEADLOCK. 
+*/ +static int row_lock_acquire(tidesdb_trx_t *trx, const uchar *key, uint len, THD *thd, + tdb_lock_mode_t mode) +{ + if (!lock_partitions || !trx) return 0; + + uint part_idx = tdb_lock_part(key, len); + tdb_lock_partition_t *part = &lock_partitions[part_idx]; + + mysql_mutex_lock(&part->mutex); + + tdb_row_lock_t *lock = tdb_lock_find_or_create(part, part_idx, key, len); + if (!lock) + { + mysql_mutex_unlock(&part->mutex); + return HA_ERR_OUT_OF_MEM; + } + + /* Re-entry, do we already hold this lock? */ + tdb_lock_request_t *self = tdb_lock_find_self_granted(lock, trx); + if (self) + { + if (self->mode == TDB_LOCK_MODE_X || self->mode == mode) + { + /* X subsumes S; same-mode is identity. */ + mysql_mutex_unlock(&part->mutex); + return 0; + } + /* self->mode == S, want X -- upgrade. Allowed only when we are the + sole granted holder AND no waiters are queued; otherwise we'd + block on ourselves indirectly through our own S-grant. */ + if (lock->granted_head == self && self->list_next == NULL && !lock->waiting_head) + { + self->mode = TDB_LOCK_MODE_X; + mysql_mutex_unlock(&part->mutex); + return 0; + } + mysql_mutex_unlock(&part->mutex); + srv_stat_lock_deadlocks.fetch_add(1, std::memory_order_relaxed); + return HA_ERR_LOCK_DEADLOCK; + } + + /* Fresh request from this trx. */ + if (tdb_lock_can_grant(lock, mode, trx)) + { + tdb_lock_request_t *req = tdb_lock_request_alloc(trx, lock, mode, true); + if (!req) + { + mysql_mutex_unlock(&part->mutex); + return HA_ERR_OUT_OF_MEM; + } + req->list_next = lock->granted_head; + lock->granted_head = req; + req->held_next = trx->held_locks_head; + trx->held_locks_head = req; + mysql_mutex_unlock(&part->mutex); + srv_stat_lock_held.fetch_add(1, std::memory_order_relaxed); + return 0; + } + + /* Need to wait. Append to the lock's FIFO waiting queue and publish + the lock and mode this trx is blocked on so the deadlock walker can + follow the wait-for edge without ever dereferencing a request + struct from another partition. 
*/ + tdb_lock_request_t *req = tdb_lock_request_alloc(trx, lock, mode, false); + if (!req) + { + mysql_mutex_unlock(&part->mutex); + return HA_ERR_OUT_OF_MEM; + } + tdb_lock_waiting_append(lock, req); + trx->waiting_on_mode = mode; + trx->waiting_on_lock.store(lock, std::memory_order_release); + mysql_mutex_unlock(&part->mutex); + + bool deadlock = tdb_lock_would_deadlock(trx, lock, mode); + + mysql_mutex_lock(&part->mutex); + + if (deadlock) + { + /* Between dropping the mutex for the wait-for walk and re-acquiring + it, another transaction's release path may have called + promote_waiters and moved our request from waiting_head onto + granted_head, flipping req->granted to true. In that case the + walker's verdict is based on stale state and the lock is already + ours. Taking the grant is correct and avoids a serious UAF -- + freeing the request while it sits on granted_head would leave a + dangling pointer that the next acquire walks into. */ + if (req->granted) + { + req->held_next = trx->held_locks_head; + trx->held_locks_head = req; + trx->waiting_on_lock.store(NULL, std::memory_order_relaxed); + mysql_mutex_unlock(&part->mutex); + srv_stat_lock_held.fetch_add(1, std::memory_order_relaxed); + return 0; + } + tdb_lock_waiting_remove(lock, req); + trx->waiting_on_lock.store(NULL, std::memory_order_relaxed); + tdb_lock_freelist_if_empty(part, lock); + mysql_mutex_unlock(&part->mutex); + my_free(req); + srv_stat_lock_deadlocks.fetch_add(1, std::memory_order_relaxed); + return HA_ERR_LOCK_DEADLOCK; + } + + /* Holders may have released while we were walking the wait-for graph. + Promote any newly-grantable waiters, then check whether we got our + grant in that pass. */ + tdb_lock_promote_waiters(lock); + + /* Bounded wait until our request is granted, the wait times out, or + the connection is killed. kill_query wakes us by broadcasting on + lock->cond. 
*/ + bool killed = false; + bool timed_out = false; + const ulong timeout_ms = tdb_lock_wait_timeout_ms(thd); + const bool bounded = (timeout_ms > 0); + struct timespec deadline; + if (bounded) set_timespec_nsec(deadline, (ulonglong)timeout_ms * TDB_NS_PER_MS); + + auto wait_t0 = std::chrono::steady_clock::now(); + srv_stat_lock_waits.fetch_add(1, std::memory_order_relaxed); + + while (!req->granted) + { + if (thd && thd_killed(thd)) + { + killed = true; + break; + } + if (bounded) + { + int wrc = mysql_cond_timedwait(&lock->cond, &part->mutex, &deadline); + if (wrc == ETIMEDOUT && !req->granted) + { + timed_out = true; + break; + } + } + else + { + mysql_cond_wait(&lock->cond, &part->mutex); + } + } + + auto wait_us = std::chrono::duration_cast( + std::chrono::steady_clock::now() - wait_t0) + .count(); + srv_stat_lock_wait_us.fetch_add(wait_us, std::memory_order_relaxed); + + if (killed || timed_out) + { + tdb_lock_waiting_remove(lock, req); + trx->waiting_on_lock.store(NULL, std::memory_order_relaxed); + /* Removing us may have unblocked an X behind a string of S + waiters. Re-evaluate and broadcast so any newly-granted + waiter wakes up. */ + tdb_lock_promote_waiters(lock); + bool wake = (lock->waiting_head != NULL) || (lock->granted_head != NULL); + tdb_lock_freelist_if_empty(part, lock); + mysql_mutex_unlock(&part->mutex); + if (wake) mysql_cond_broadcast(&lock->cond); + my_free(req); + if (timed_out) srv_stat_lock_timeouts.fetch_add(1, std::memory_order_relaxed); + return HA_ERR_LOCK_WAIT_TIMEOUT; + } + + /* Granted. tdb_lock_promote_waiters moved us onto granted_head; + link onto trx->held_locks_head and clear waiting_on_lock so the + walker no longer treats this trx as waiting. 
*/ + req->held_next = trx->held_locks_head; + trx->held_locks_head = req; + trx->waiting_on_lock.store(NULL, std::memory_order_relaxed); + mysql_mutex_unlock(&part->mutex); + srv_stat_lock_held.fetch_add(1, std::memory_order_relaxed); + return 0; +} + +/* + Release all row locks held by this transaction. Walks the trx's + held-list of requests, unlinks each from its lock's granted list, + promotes any waiters now compatible with the remaining granted set, + and broadcasts the lock's cond. Called from commit and rollback. +*/ +static void row_locks_release_all(tidesdb_trx_t *trx) +{ + if (!lock_partitions || !trx) return; + + long long released = 0; + tdb_lock_request_t *req = trx->held_locks_head; + while (req) + { + tdb_lock_request_t *next = req->held_next; + tdb_row_lock_t *lock = req->lock; + uint part_idx = lock->partition; + tdb_lock_partition_t *part = &lock_partitions[part_idx]; + + mysql_mutex_lock(&part->mutex); + + /* Unlink req from lock->granted_head. */ + tdb_lock_request_t **pp = &lock->granted_head; + while (*pp && *pp != req) pp = &(*pp)->list_next; + if (*pp == req) *pp = req->list_next; + + /* Promote any waiters now grantable, then wake them up. */ + bool had_waiters = (lock->waiting_head != NULL); + tdb_lock_promote_waiters(lock); + bool promoted_any = had_waiters && (lock->granted_head != NULL); + + /* If nothing references this slot any more, unlink it from the + hash chain and stash it on the partition freelist so the next + acquire can reuse it without growing the chain. Slot memory + is retained across reuse for the deadlock walker. 
*/ + tdb_lock_freelist_if_empty(part, lock); + + mysql_mutex_unlock(&part->mutex); + + if (had_waiters && (promoted_any || lock->waiting_head == NULL)) + mysql_cond_broadcast(&lock->cond); + + my_free(req); + released++; + req = next; + } + trx->held_locks_head = NULL; + trx->waiting_on_lock.store(NULL, std::memory_order_relaxed); + if (released > 0) srv_stat_lock_held.fetch_sub(released, std::memory_order_relaxed); +} + +/* Pick the lock mode for a row materialised on a read path, or report + that no lock is needed. + - write_intent ........ X (covers SELECT FOR UPDATE / UPDATE / DELETE) + - REPEATABLE_READ / SERIALIZABLE ... S (prevents concurrent modification + of read rows within the txn; phantom prevention is incomplete + because we have no range/gap locks, only row locks) + - READ_COMMITTED / SNAPSHOT ... no lock (MVCC snapshot suffices) */ +static inline bool tdb_lock_mode_for_read(THD *thd, bool write_intent, tdb_lock_mode_t *mode) +{ + if (write_intent) + { + *mode = TDB_LOCK_MODE_X; + return true; + } + int iso = thd ? thd_tx_isolation(thd) : ISO_READ_COMMITTED; + if (iso == ISO_REPEATABLE_READ || iso == ISO_SERIALIZABLE) + { + *mode = TDB_LOCK_MODE_S; + return true; + } + return false; +} + +static handler *tidesdb_create_handler(handlerton *hton, TABLE_SHARE *table, MEM_ROOT *mem_root); +static void tidesdb_refresh_status_vars(); + +/* Forward declarations for the tombstone aggregates so tidesdb_show_status + (defined earlier than the storage block) can read them. */ +static long long srv_stat_total_tombstones; +static double srv_stat_tombstone_ratio; +static double srv_stat_max_sst_density; +static long long srv_stat_max_sst_density_level; + +/* File extensions -- TidesDB manages its own files */ +static const char *ha_tidesdb_exts[] = {NullS}; + +/* ******************** Full-Text Search helpers ******************** */ + +/* MariaDB renamed HA_FULLTEXT -> HA_FULLTEXT_legacy after the 11.x series + (flag bit 128 unchanged). 
Detect via the flag: KEY::algorithm is only set + to HA_KEY_ALG_FULLTEXT on newer servers, and notably not in the ALTER + key_info_buffer on 11.4, so the algorithm-only check missed FULLTEXT adds. */ +#ifndef HA_FULLTEXT +#define HA_FULLTEXT HA_FULLTEXT_legacy +#endif + +static inline bool is_fts_index(const KEY *ki) +{ + return (ki->flags & HA_FULLTEXT) || ki->algorithm == HA_KEY_ALG_FULLTEXT; +} + +/* FTS result entry -- one per matching document */ +struct tdb_fts_result_t +{ + uchar *pk; /* heap-allocated comparable PK bytes */ + uint pk_len; + float rank; /* BM25 score */ +}; + +/* FTS search context returned by ft_init_ext as FT_INFO* */ +struct tdb_ft_info_t +{ + struct _ft_vft *please; /* required by MariaDB FT_INFO layout */ + struct _ft_vft_ext *could_you; /* extended FT API (HA_CAN_FULLTEXT_EXT) */ + ha_tidesdb *handler; /* back-pointer for row fetching */ + uint keynr; /* which FTS index */ + std::vector results; /* sorted by rank descending */ + size_t current_idx; /* iteration position */ + float current_rank; /* rank of last-returned row */ + ulonglong match_count; /* total matches for count_matches() */ +}; + +/* Forward declarations of FT_INFO vtable callbacks */ +static int tdb_fts_read_next(FT_INFO *, char *); +static float tdb_fts_find_relevance(FT_INFO *, uchar *, uint); +static void tdb_fts_close_search(FT_INFO *); +static float tdb_fts_get_relevance(FT_INFO *); +static void tdb_fts_reinit_search(FT_INFO *); + +static const struct _ft_vft tdb_ft_vft = {tdb_fts_read_next, tdb_fts_find_relevance, + tdb_fts_close_search, tdb_fts_get_relevance, + tdb_fts_reinit_search}; + +/* Extended FT API callbacks for HA_CAN_FULLTEXT_EXT */ +static uint tdb_fts_get_version() +{ + return 2; +} + +static ulonglong tdb_fts_get_flags() +{ + return FTS_ORDERED_RESULT; +} + +static ulonglong tdb_fts_get_docid(FT_INFO_EXT *fts) +{ + tdb_ft_info_t *info = reinterpret_cast(fts); + if (info->current_idx > 0 && info->current_idx <= info->results.size()) + return 
(ulonglong)(info->current_idx); /* 1-based doc ID */ + return 0; +} + +static ulonglong tdb_fts_count_matches(FT_INFO_EXT *fts) +{ + tdb_ft_info_t *info = reinterpret_cast(fts); + return info->match_count; +} + +static struct _ft_vft_ext tdb_ft_vft_ext = {tdb_fts_get_version, tdb_fts_get_flags, + tdb_fts_get_docid, tdb_fts_count_matches}; + +/* FT_INFO vtable callback implementations */ +static int tdb_fts_read_next(FT_INFO *, char *) +{ + return HA_ERR_END_OF_FILE; /* not used -- ft_read() is the entry point */ +} + +static float tdb_fts_find_relevance(FT_INFO *fts, uchar *, uint) +{ + tdb_ft_info_t *info = reinterpret_cast(fts); + return info->current_rank; +} + +static float tdb_fts_get_relevance(FT_INFO *fts) +{ + tdb_ft_info_t *info = reinterpret_cast(fts); + return info->current_rank; +} + +static void tdb_fts_close_search(FT_INFO *fts) +{ + tdb_ft_info_t *info = reinterpret_cast(fts); + for (auto &r : info->results) my_free(r.pk); + delete info; +} + +static void tdb_fts_reinit_search(FT_INFO *fts) +{ + tdb_ft_info_t *info = reinterpret_cast(fts); + info->current_idx = 0; +} + +/* Maximum term byte length in the FTS index. Terms longer than this + are truncated. 512 bytes accommodates even long CJK compound words + (170+ 3-byte UTF-8 characters). */ +static constexpr uint FTS_MAX_TERM_BYTES = 512; + +/* Size of the leading 2-byte little-endian term-length field on every + FTS inverted-index entry key. */ +static constexpr uint FTS_TERM_LEN_PREFIX = 2; + +/* Worst-case FTS entry key buffer-- [2B term_len][term bytes][PK]. */ +static constexpr uint FTS_KEY_BUF_LEN = FTS_TERM_LEN_PREFIX + FTS_MAX_TERM_BYTES + MAX_KEY_LENGTH; + +/* FTS entry value layout-- [2B tf LE][4B doc_len LE] = 6 bytes. 
*/ +static constexpr uint FTS_VALUE_TF_LEN = 2; +static constexpr uint FTS_VALUE_DOC_LEN_OFFSET = FTS_VALUE_TF_LEN; +static constexpr uint FTS_VALUE_DOC_LEN_LEN = 4; +static constexpr uint FTS_VALUE_LEN = FTS_VALUE_TF_LEN + FTS_VALUE_DOC_LEN_LEN; + +/* FTS per-index meta key layout: + [KEY_NS_META(1B)][FTS tag(4B incl NUL)][keynr(1B)] = 6 bytes. + Meta value layout-- [8B total_docs][8B total_words] = 16 bytes. */ +static constexpr const char FTS_META_KEY_TAG[] = "FTS\x00"; +static constexpr uint FTS_META_KEY_TAG_LEN = 4; /* 3 letters + trailing NUL */ +static constexpr uint FTS_META_KEY_TAG_OFFSET = KEY_NAMESPACE_LEN; +static constexpr uint FTS_META_KEY_KEYNR_OFFSET = FTS_META_KEY_TAG_OFFSET + FTS_META_KEY_TAG_LEN; +static constexpr uint FTS_META_KEY_LEN = FTS_META_KEY_KEYNR_OFFSET + 1; +static constexpr uint FTS_META_VALUE_DOCS_LEN = 8; +static constexpr uint FTS_META_VALUE_WORDS_OFFSET = FTS_META_VALUE_DOCS_LEN; +static constexpr uint FTS_META_VALUE_WORDS_LEN = 8; +static constexpr uint FTS_META_VALUE_LEN = FTS_META_VALUE_DOCS_LEN + FTS_META_VALUE_WORDS_LEN; + +/* Build an FTS inverted index key: + [2-byte term_len LE][lowercased term bytes][comparable PK bytes] + Returns total key length. Term is silently truncated to FTS_MAX_TERM_BYTES. */ +static uint fts_build_key(const char *term, uint term_len, const uchar *pk, uint pk_len, uchar *out) +{ + if (term_len > FTS_MAX_TERM_BYTES) term_len = FTS_MAX_TERM_BYTES; + uint pos = 0; + int2store(out + pos, (uint16)term_len); + pos += FTS_TERM_LEN_PREFIX; + memcpy(out + pos, term, term_len); + pos += term_len; + memcpy(out + pos, pk, pk_len); + pos += pk_len; + return pos; +} + +/* Build FTS value ( [2-byte tf LE][4-byte doc_len LE] ) = FTS_VALUE_LEN bytes */ +static uint fts_build_value(uint16 tf, uint32 doc_len, uchar *out) +{ + int2store(out, tf); + int4store(out + FTS_VALUE_DOC_LEN_OFFSET, doc_len); + return FTS_VALUE_LEN; +} + +/* Read or initialize FTS metadata counters from the data CF. 
+ Key format-- [KEY_NS_META][FTS tag][keynr]. + Returns TDB_SUCCESS on a found row, TDB_ERR_NOT_FOUND for a fresh index + (totals zeroed), or the library's error code otherwise. Callers must + not write back a zeroed total derived from a transient read failure -- + that would clobber the real counters and degrade BM25 IDF. */ +static int fts_load_meta(tidesdb_txn_t *txn, tidesdb_column_family_t *data_cf, uint keynr, + int64_t *total_docs, int64_t *total_words) +{ + uchar mk[FTS_META_KEY_LEN]; + mk[0] = KEY_NS_META; + memcpy(mk + FTS_META_KEY_TAG_OFFSET, FTS_META_KEY_TAG, FTS_META_KEY_TAG_LEN); + mk[FTS_META_KEY_KEYNR_OFFSET] = (uchar)keynr; + + uint8_t *val = NULL; + size_t vlen = 0; + *total_docs = 0; + *total_words = 0; + + int rc = tidesdb_txn_get(txn, data_cf, mk, FTS_META_KEY_LEN, &val, &vlen); + if (rc == TDB_SUCCESS && vlen >= FTS_META_VALUE_LEN) + { + *total_docs = sint8korr(val); + *total_words = sint8korr(val + FTS_META_VALUE_WORDS_OFFSET); + tidesdb_free(val); + return TDB_SUCCESS; + } + if (val) tidesdb_free(val); + /* TDB_ERR_NOT_FOUND is the legitimate empty-index case. */ + return rc; +} + +/* Update FTS metadata counters atomically within the current transaction. + thd may be NULL for paths where no session is available (e.g. recovery); + in that case the back-pressure block falls through to the unwrapped put. */ +static int fts_update_meta(THD *thd, tidesdb_txn_t *txn, tidesdb_column_family_t *data_cf, + uint keynr, int64_t delta_docs, int64_t delta_words) +{ + int64_t total_docs = 0, total_words = 0; + /* A transient read failure must not be turned into a zero-based + write-back: zero - delta clamped at 0 would persist garbage and + only manual rebuild would fix BM25. Fresh index (NOT_FOUND) is + the only "no prior value" case we treat as zero. 
*/ + int rrc = fts_load_meta(txn, data_cf, keynr, &total_docs, &total_words); + if (rrc != TDB_SUCCESS && rrc != TDB_ERR_NOT_FOUND) + { + sql_print_error( + "[TIDESDB] fts_update_meta: skipping meta write for keynr=%u " + "because fts_load_meta failed (rc=%d); BM25 totals are unchanged", + keynr, rrc); + return rrc; + } + + total_docs += delta_docs; + total_words += delta_words; + if (total_docs < 0) total_docs = 0; + if (total_words < 0) total_words = 0; + + uchar mk[FTS_META_KEY_LEN]; + mk[0] = KEY_NS_META; + memcpy(mk + FTS_META_KEY_TAG_OFFSET, FTS_META_KEY_TAG, FTS_META_KEY_TAG_LEN); + mk[FTS_META_KEY_KEYNR_OFFSET] = (uchar)keynr; + + uchar mv[FTS_META_VALUE_LEN]; + int8store(mv, total_docs); + int8store(mv + FTS_META_VALUE_WORDS_OFFSET, total_words); + return tdb_txn_put_blocking(thd, txn, data_cf, mk, FTS_META_KEY_LEN, mv, FTS_META_VALUE_LEN, + TIDESDB_TTL_NONE); +} + +/* Fold a per-row FTS meta delta into the txn-level accumulator. Find the + matching (data_cf, keynr) entry and combine, or append a new one. The + list is typically tiny (one or two FTS indexes per touched table), so + linear scan beats a hash. */ +static inline void trx_fts_meta_accumulate(tidesdb_trx_t *trx, tidesdb_column_family_t *cf, + uint keynr, int64_t doc_delta, int64_t word_delta) +{ + if (!trx) return; + for (auto &e : trx->fts_meta_pending) + { + if (e.data_cf == cf && e.keynr == keynr) + { + e.doc_delta += doc_delta; + e.word_delta += word_delta; + trx->fts_meta_dirty = true; + return; + } + } + trx->fts_meta_pending.push_back({cf, keynr, doc_delta, word_delta}); + trx->fts_meta_dirty = true; +} + +/* Apply every accumulated FTS meta delta to its index's meta key inside + the current txn. Called before tidesdb_commit hands the txn to the + library and before maybe_bulk_commit's mid-statement commit so the meta + update is part of the same commit as the row puts that produced it. 
+ Returns a TDB_* error code on the first failure; the accumulator is + cleared in every case since the txn it tracks is about to commit or be + rolled back. */ +static int flush_trx_fts_meta_pending(THD *thd, tidesdb_trx_t *trx) +{ + if (!trx) return TDB_SUCCESS; + if (!trx->fts_meta_dirty || trx->fts_meta_pending.empty() || !trx->txn) + { + trx->fts_meta_pending.clear(); + trx->fts_meta_dirty = false; + return TDB_SUCCESS; + } + int rc = TDB_SUCCESS; + for (const auto &e : trx->fts_meta_pending) + { + rc = fts_update_meta(thd, trx->txn, e.data_cf, e.keynr, e.doc_delta, e.word_delta); + if (rc != TDB_SUCCESS) break; + } + trx->fts_meta_pending.clear(); + trx->fts_meta_dirty = false; + return rc; +} + +/* Tokenize a text string using MariaDB's default FT parser. + Returns lowercased tokens suitable for FTS indexing. */ +struct fts_token_t +{ + std::string word; +}; + +/* Minimum and maximum word length for FTS indexing (in characters). + These mirror InnoDB's innodb_ft_min_token_size / innodb_ft_max_token_size + defaults. Exposed as session variables below for tuning. */ +static ulong srv_fts_min_word_len = 3; +static ulong srv_fts_max_word_len = 84; + +/* Blend characters -- characters that are indexed as both separators and valid + word characters. When a blend char appears inside a token, the tokenizer + emits three tokens -- the full blended form, and the two parts on each side. + For example, with blend_chars="'" and input "l'aria": + -- "l'aria" (full blended token) + -- "l" (left part, may be filtered by min_word_len) + -- "aria" (right part) + This allows Italian/French elision (dell'aria, l'homme) and Irish/Scottish + names (O'Malley) to be searchable by any component or the full form. + Default is empty (no blend characters). Set to "'" for Romance languages. */ +static char *srv_fts_blend_chars = NULL; + +/* Fast lookup table for blend characters, indexed by raw byte value + (covers the full 8-bit range). Rebuilt when the sysvar changes. 
*/ +static constexpr uint TDB_BLEND_MAP_SIZE = 256; +static bool tdb_blend_char_map[TDB_BLEND_MAP_SIZE] = {false}; +static mysql_rwlock_t tdb_blend_lock; +static PSI_rwlock_key tdb_blend_lock_key; + +static void tdb_rebuild_blend_map(const char *chars) +{ + memset(tdb_blend_char_map, 0, sizeof(tdb_blend_char_map)); + if (!chars) return; + for (const char *p = chars; *p; p++) tdb_blend_char_map[(unsigned char)*p] = true; +} + +static void tdb_fts_blend_chars_update(MYSQL_THD thd, struct st_mysql_sys_var *var, void *var_ptr, + const void *save) +{ + const char *new_val = *static_cast(save); + mysql_rwlock_wrlock(&tdb_blend_lock); + tdb_rebuild_blend_map(new_val); + mysql_rwlock_unlock(&tdb_blend_lock); + *static_cast(var_ptr) = new_val; + if (new_val && new_val[0]) + sql_print_information("[TIDESDB] FTS blend_chars set to '%s'", new_val); + else + sql_print_information("[TIDESDB] FTS blend_chars cleared"); +} + +/* Stop word support + Mirrors InnoDB's innodb_ft_server_stopword_table. When NULL, we use the + 36-word default list from information_schema.INNODB_FT_DEFAULT_STOPWORD. + When set to "db/table", we read the 'value' column at next FTS rebuild. + The stop word set is stored in a global unordered_set protected by a + read-mostly rwlock (writes are rare -- only on SET GLOBAL or plugin init). 
*/ +static char *srv_ft_stopword_table = NULL; /* db/table or NULL for defaults */ + +/* InnoDB's default 36 stop words, matching INNODB_FT_DEFAULT_STOPWORD */ +static const char *tdb_default_stopwords[] = { + "a", "about", "an", "are", "as", "at", "be", "by", "com", "de", + "en", "for", "from", "how", "i", "in", "is", "it", "la", "of", + "on", "or", "that", "the", "this", "to", "was", "what", "when", "where", + "who", "will", "with", "und", "the", "www", NULL}; + +static std::unordered_set tdb_stopwords; +static mysql_rwlock_t tdb_stopword_lock; +static PSI_rwlock_key tdb_stopword_lock_key; + +/* Load stop words from the default list */ +static void tdb_load_default_stopwords() +{ + tdb_stopwords.clear(); + for (const char **w = tdb_default_stopwords; *w; w++) tdb_stopwords.insert(*w); +} + +/* Check if a lowercased token is a stop word. + PRECONDITION caller holds tdb_stopword_lock for reading (taken once per + fts_tokenize call to avoid N lock pairs per document). */ +static inline bool tdb_is_stopword_locked(const std::string &word) +{ + return tdb_stopwords.count(word) > 0; +} + +/* Load stop words from a user table specified as "db_name/table_name". + Must be called with tdb_stopword_lock held for writing. + Uses TidesDB's own CF to read the table if it's a TidesDB table, + or falls back to an empty set with a warning for other engines. + For simplicity, the table must store one word per row in a column named 'value' + and be accessible as a TidesDB CF named "db_name__table_name". 
*/ +static bool tdb_load_stopwords_from_table_spec(const char *table_spec) +{ + if (!table_spec || !table_spec[0]) return false; + + const char *slash = strchr(table_spec, '/'); + if (!slash) + { + sql_print_warning( + "[TIDESDB] ft_stopword_table format must be 'db_name/table_name', got '%s'", + table_spec); + return false; + } + + std::string db_name(table_spec, slash - table_spec); + std::string tbl_name(slash + 1); + + /* CF names join the database and table with CF_DB_TABLE_SEP, the same + way path_to_cf_name builds them, so the lookup has to use that + separator rather than the slash from the user-facing spec. */ + std::string cf_name = db_name + CF_DB_TABLE_SEP + tbl_name; + tidesdb_column_family_t *sw_cf = + tdb_global ? tidesdb_get_column_family(tdb_global, cf_name.c_str()) : NULL; + + if (!sw_cf) + { + sql_print_warning( + "[TIDESDB] Stop word table '%s' not found as TidesDB CF '%s'. " + "The table must be a TidesDB ENGINE table. Keeping current stop words.", + table_spec, cf_name.c_str()); + return false; + } + + /* We scan the CF for all keys with DATA namespace prefix. + Each row should have a 'value' field which we extract via full table scan. */ + tidesdb_txn_t *txn = NULL; + if (tidesdb_txn_begin(tdb_global, &txn) != TDB_SUCCESS) return false; + + tidesdb_iter_t *iter = NULL; + if (tdb_iter_new_blocking(current_thd, txn, sw_cf, &iter) != TDB_SUCCESS) + { + tidesdb_txn_free(txn); + return false; + } + + tidesdb_iter_seek_to_first(iter); + tdb_stopwords.clear(); + + while (tidesdb_iter_valid(iter)) + { + uint8_t *val = NULL; + size_t val_size = 0; + if (tidesdb_iter_value(iter, &val, &val_size) == TDB_SUCCESS && val && + val_size > ROW_HEADER_SIZE && val[0] == ROW_HEADER_MAGIC) + { + /* The row carries the self-describing header written by + serialize_row, so the null bitmap width is read from the + header rather than assumed. After the header and the bitmap + a single-column table holds just the one packed VARCHAR. 
*/ + uint stored_null_bytes = uint2korr(val + 1); + size_t off = (size_t)ROW_HEADER_SIZE + stored_null_bytes; + if (off < val_size) + { + const uint8_t *data = val + off; + size_t data_len = val_size - off; + + /* Field::pack stores a VARCHAR with a one-byte length prefix + when the column is at most 255 chars wide and a two-byte + prefix otherwise. The packed field of a single-column row + spans the whole remaining buffer, so the prefix width is + the one whose recorded length consumes exactly the rest. */ + uint prefix = 0; + size_t str_len = 0; + if (data_len >= 1 && (size_t)data[0] + 1 == data_len) + { + prefix = 1; + str_len = data[0]; + } + else if (data_len >= FIELD_VARCHAR_LEN_PREFIX && + (size_t)uint2korr(data) + FIELD_VARCHAR_LEN_PREFIX == data_len) + { + prefix = FIELD_VARCHAR_LEN_PREFIX; + str_len = uint2korr(data); + } + if (prefix && str_len > 0) + { + std::string word((const char *)(data + prefix), str_len); + std::transform(word.begin(), word.end(), word.begin(), ::tolower); + tdb_stopwords.insert(std::move(word)); + } + } + } + tidesdb_iter_next(iter); + } + + tidesdb_iter_free(iter); + tidesdb_txn_free(txn); + + sql_print_information("[TIDESDB] Loaded %zu stop words from table '%s'", tdb_stopwords.size(), + table_spec); + return true; +} + +/* Sysvar update callback for tidesdb_ft_stopword_table */ +static void tdb_ft_stopword_table_update(MYSQL_THD thd, struct st_mysql_sys_var *var, void *var_ptr, + const void *save) +{ + const char *new_val = *static_cast(save); + mysql_rwlock_wrlock(&tdb_stopword_lock); + + if (!new_val || !new_val[0]) + { + /* NULL or empty string -- we reset to defaults */ + tdb_load_default_stopwords(); + sql_print_information("[TIDESDB] Stop words reset to defaults (%zu words)", + tdb_stopwords.size()); + } + else + { + if (!tdb_load_stopwords_from_table_spec(new_val)) + { + sql_print_warning("[TIDESDB] Failed to load stop words from '%s', keeping current set", + new_val); + } + } + + *static_cast(var_ptr) = new_val; + 
mysql_rwlock_unlock(&tdb_stopword_lock); +} + +/* BM25 tuning parameters. k1 controls term-frequency saturation + (higher = more weight to repeated terms). b controls document-length + normalization (0 = no normalization, 1 = full normalization). */ +static double srv_fts_bm25_k1 = 1.2; +static double srv_fts_bm25_b = 0.75; + +/* Helper to lowercase, check stop words, length filter, and emit a token */ +static inline void fts_emit_token(const char *word_start, size_t byte_len, uint char_count, + CHARSET_INFO *cs, std::vector &out) +{ + if (char_count < srv_fts_min_word_len || char_count > srv_fts_max_word_len) return; + + fts_token_t tok; + tok.word.assign(word_start, byte_len); + size_t lowered_len = + cs->cset->casedn(cs, &tok.word[0], tok.word.size(), &tok.word[0], tok.word.size()); + tok.word.resize(lowered_len); + + if (tdb_is_stopword_locked(tok.word)) return; + out.push_back(std::move(tok)); +} + +/* Charset-aware tokenizer with blend character support. + Uses MariaDB's charset API to correctly handle multi-byte characters + (UTF-8, UTF-16, CJK character sets, etc.). Splits on word boundaries + using the charset's ctype classification, lowercases using the charset's + case-folding tables, and filters by configurable word length bounds. + + Blend characters (configured via tidesdb_fts_blend_chars) are treated as + both word characters and separators. When a blend char appears inside a + token, the tokenizer emits three forms-- the full blended token, and the + two parts on each side of the blend char. This enables Romance language + elision (l'aria -> l'aria + aria) and names (O'Malley -> o'malley + malley) + to be searchable by any component or the full form. 
*/ +static void fts_tokenize(const char *text, size_t text_len, CHARSET_INFO *cs, + std::vector &out) +{ + const char *p = text; + const char *end = text + text_len; + uint mblen; + + /* We snapshot blend chars under read lock once per tokenize call */ + bool has_blend = false; + bool blend_map_copy[TDB_BLEND_MAP_SIZE]; + { + mysql_rwlock_rdlock(&tdb_blend_lock); + memcpy(blend_map_copy, tdb_blend_char_map, sizeof(blend_map_copy)); + mysql_rwlock_unlock(&tdb_blend_lock); + for (uint i = 0; i < TDB_BLEND_MAP_SIZE && !has_blend; i++) + if (blend_map_copy[i]) has_blend = true; + } + + /* We hold the stopword rdlock once for the whole tokenize pass. + fts_emit_token calls tdb_is_stopword_locked which assumes the read + lock is held -- this avoids the N lock-pair cost the previous + per-token acquisition incurred (1000-word doc = 1000 lock pairs). */ + mysql_rwlock_rdlock(&tdb_stopword_lock); + + while (p < end) + { + while (p < end) + { + mblen = my_ismbchar(cs, p, end); + if (mblen) break; /* multi-byte = word char */ + if (my_isalnum(cs, (uchar)*p)) break; + if (has_blend && blend_map_copy[(uchar)*p]) break; + p++; + } + if (p >= end) break; + + const char *word_start = p; + uint char_count = 0; + bool contains_blend = false; + + while (p < end) + { + mblen = my_ismbchar(cs, p, end); + if (mblen) + { + p += mblen; + char_count++; + continue; + } + if (my_isalnum(cs, (uchar)*p)) + { + p++; + char_count++; + continue; + } + if (has_blend && blend_map_copy[(uchar)*p]) + { + contains_blend = true; + p++; + char_count++; + continue; + } + break; + } + size_t byte_len = (size_t)(p - word_start); + + if (!contains_blend) + { + fts_emit_token(word_start, byte_len, char_count, cs, out); + } + else + { + /* Blend char found -- emit full blended token plus sub-parts. + We split on blend chars and emit each sub-part that meets + the minimum length requirement. 
*/ + fts_emit_token(word_start, byte_len, char_count, cs, out); + + const char *sub_start = word_start; + uint sub_chars = 0; + for (const char *s = word_start; s < word_start + byte_len; s++) + { + if (blend_map_copy[(uchar)*s]) + { + size_t sub_len = (size_t)(s - sub_start); + if (sub_len > 0) fts_emit_token(sub_start, sub_len, sub_chars, cs, out); + sub_start = s + 1; + sub_chars = 0; + } + else + { + sub_chars++; + } + } + size_t sub_len = (size_t)((word_start + byte_len) - sub_start); + if (sub_len > 0) fts_emit_token(sub_start, sub_len, sub_chars, cs, out); + } + } + + mysql_rwlock_unlock(&tdb_stopword_lock); +} + +/* Extract and tokenize the document from all FULLTEXT key_part fields. + Returns the token list and word count. */ +static void fts_extract_and_tokenize(TABLE *table, const KEY *key_info, const uchar *record, + CHARSET_INFO *cs, std::vector &out_tokens) +{ + std::string doc; + my_ptrdiff_t ptrdiff = (my_ptrdiff_t)(record - table->record[0]); + + for (uint p = 0; p < key_info->user_defined_key_parts; p++) + { + Field *f = key_info->key_part[p].field; + if (ptrdiff) f->move_field_offset(ptrdiff); + if (!f->is_null()) + { + String val; + f->val_str(&val); + if (!doc.empty()) doc += ' '; + doc.append(val.ptr(), val.length()); + } + if (ptrdiff) f->move_field_offset(-ptrdiff); + } + + fts_tokenize(doc.data(), doc.size(), cs, out_tokens); +} + +/* Boolean query term with yesno/trunc/phrase flags from the parser */ +struct fts_query_term_t +{ + std::string term; + int yesno; /* FTS_TERM_REQUIRED / FTS_TERM_EXCLUDED / FTS_TERM_NEUTRAL */ + bool trunc; /* prefix match (wildcard) */ + bool is_phrase; + std::vector phrase_words; +}; + +/* Boolean query parser. + Handles +required -excluded word* (truncated), "exact phrase", plain terms. + Charset-aware uses multi-byte character scanning for word boundaries. 
*/
static void fts_parse_boolean(const char *query, size_t len, CHARSET_INFO *cs,
                              std::vector<fts_query_term_t> &out)
{
    const char *p = query;
    const char *end = query + len;

    while (p < end)
    {
        while (p < end && *p == ' ') p++;
        if (p >= end) break;

        /* Optional +/- operator in front of the term or phrase. */
        int yesno = FTS_TERM_NEUTRAL;
        if (*p == FTS_BOOL_OP_REQUIRED)
        {
            yesno = FTS_TERM_REQUIRED;
            p++;
        }
        else if (*p == FTS_BOOL_OP_EXCLUDED)
        {
            yesno = FTS_TERM_EXCLUDED;
            p++;
        }

        while (p < end && *p == ' ') p++;
        if (p >= end) break;

        /* "word1 word2 word3" */
        if (*p == FTS_BOOL_OP_PHRASE)
        {
            p++; /* skip opening quote */
            const char *phrase_start = p;
            while (p < end && *p != FTS_BOOL_OP_PHRASE) p++;
            size_t phrase_len = (size_t)(p - phrase_start);
            if (p < end) p++; /* skip closing quote */

            if (phrase_len == 0) continue;

            std::vector<fts_token_t> phrase_tokens;
            fts_tokenize(phrase_start, phrase_len, cs, phrase_tokens);
            if (phrase_tokens.empty()) continue;

            /* The first word carries the phrase metadata for verification.
               Phrases are implicitly required unless explicitly excluded. */
            const bool excluded = (yesno == FTS_TERM_EXCLUDED);
            fts_query_term_t qt;
            qt.term = phrase_tokens[0].word;
            qt.yesno = excluded ? FTS_TERM_EXCLUDED : FTS_TERM_REQUIRED;
            qt.trunc = false;
            qt.is_phrase = true;
            for (auto &tok : phrase_tokens)
                qt.phrase_words.push_back(tok.word); /* copy, don't move */
            out.push_back(std::move(qt));

            /* For a required phrase, also add the remaining words as
               required terms so the candidate set is narrowed before phrase
               verification. An excluded phrase must not do this: requiring
               its words would drop every document that lacks them, which is
               the opposite of what -"..." asks for. */
            if (!excluded)
            {
                for (size_t i = 1; i < phrase_tokens.size(); i++)
                {
                    fts_query_term_t wt;
                    wt.term = phrase_tokens[i].word;
                    wt.yesno = FTS_TERM_REQUIRED;
                    wt.trunc = false;
                    wt.is_phrase = false;
                    out.push_back(std::move(wt));
                }
            }
            continue;
        }

        /* Skip anything that cannot start a word. */
        while (p < end && !my_isalnum(cs, (uchar)*p) && !my_ismbchar(cs, p, end) &&
               *p != FTS_BOOL_OP_TRUNC)
            p++;
        if (p >= end) break;

        const char *word_start = p;
        while (p < end)
        {
            uint mblen = my_ismbchar(cs, p, end);
            if (mblen)
            {
                p += mblen;
                continue;
            }
            if (my_isalnum(cs, (uchar)*p) || *p == FTS_BOOL_OP_TRUNC)
            {
                p++;
                continue;
            }
            break;
        }
        size_t wlen = (size_t)(p - word_start);
        if (wlen == 0) continue;

        /* A trailing truncation operator turns the term into a prefix match. */
        bool trunc = false;
        if (word_start[wlen - 1] == FTS_BOOL_OP_TRUNC)
        {
            trunc = true;
            wlen--;
        }
        if (wlen == 0) continue;

        fts_query_term_t qt;
        qt.term.assign(word_start, wlen);
        size_t lowered =
            cs->cset->casedn(cs, &qt.term[0], qt.term.size(), &qt.term[0], qt.term.size());
        qt.term.resize(lowered);
        qt.yesno = yesno;
        qt.trunc = trunc;
        qt.is_phrase = false;
        out.push_back(std::move(qt));
    }
}

/* Verify that a phrase appears as a consecutive subsequence within an
   already-tokenized document. Callers tokenize a candidate once and check
   many phrases against the same token vector.
*/ +static bool fts_phrase_in_tokens(const std::vector &doc_tokens, + const std::vector &phrase_words) +{ + if (phrase_words.empty()) return true; + if (doc_tokens.size() < phrase_words.size()) return false; + + size_t limit = doc_tokens.size() - phrase_words.size(); + for (size_t i = 0; i <= limit; i++) + { + bool match = true; + for (size_t j = 0; j < phrase_words.size(); j++) + { + if (doc_tokens[i + j].word != phrase_words[j]) + { + match = false; + break; + } + } + if (match) return true; + } + return false; +} + +/* ******************** Spatial Index helpers ******************** */ + +/* MariaDB renamed HA_SPATIAL -> HA_SPATIAL_legacy after the 11.x series; the + key-flag bit (1024) is unchanged. Spatial keys reliably carry this flag on + every version, whereas KEY::algorithm is only set to HA_KEY_ALG_RTREE on + newer servers (it is HA_KEY_ALG_UNDEF on 11.4), so detect via the flag. */ +#ifndef HA_SPATIAL +#define HA_SPATIAL HA_SPATIAL_legacy +#endif + +static inline bool is_spatial_index(const KEY *ki) +{ + return (ki->flags & HA_SPATIAL) || ki->algorithm == HA_KEY_ALG_RTREE; +} + +/* MBR (Minimum Bounding Rectangle) for spatial predicates */ +struct tdb_mbr_t +{ + double xmin, ymin, xmax, ymax; +}; + +/* Hilbert curve constants */ +static constexpr uint HILBERT_ORDER = 32; /* bits per axis */ +static constexpr uint HILBERT_DIM = 2; /* 2D curve (x, y) */ +static constexpr uint64_t HILBERT_N = (uint64_t)1 << HILBERT_ORDER; /* 2^32 */ +static constexpr uint SPATIAL_HILBERT_KEY_LEN = 8; /* 64-bit Hilbert value */ +static constexpr uint SPATIAL_MBR_VALUE_LEN = 32; /* 4 doubles */ + +/* Convert IEEE 754 double to a uint32 that preserves sort order under + unsigned integer comparison. Handles negative values correctly by + flipping all bits (negative doubles have sign bit set in IEEE 754; + flipping makes them sort before positive values). 
*/ +static inline uint32_t double_to_lex_uint32(double val) +{ + uint64_t bits; + memcpy(&bits, &val, sizeof(bits)); + if (bits & IEEE754_DOUBLE_SIGN_MASK) + bits = ~bits; /* negative, flip all bits */ + else + bits ^= IEEE754_DOUBLE_SIGN_MASK; /* positive, flip sign bit only */ + return (uint32_t)(bits >> LEX_UINT32_HI_SHIFT); /* top 32 bits for precision */ +} + +/* Hilbert curve, rotate quadrant coordinates. Implements the inner + rotation step of the iterative xy2d transform from Skilling 2004 + ("Programming the Hilbert curve") -- see also the canonical + Wikipedia pseudocode at https://en.wikipedia.org/wiki/Hilbert_curve. + The literal (n - 1) is the standard reflection around the centre + of an n-cell axis; rx and ry carry the binary quadrant flags from + the caller. */ +static inline void hilbert_rot(uint32_t n, uint32_t *x, uint32_t *y, uint32_t rx, uint32_t ry) +{ + if (ry == 0) + { + if (rx == 1) + { + *x = n - 1 - *x; + *y = n - 1 - *y; + } + uint32_t t = *x; + *x = *y; + *y = t; + } +} + +/* Convert 2D coordinates (x, y) to a 64-bit Hilbert curve value. Order + 32, each axis 32-bit precision, output 64-bit. Iterative algorithm + per Skilling 2004 / Wikipedia, O(32) loop, no recursion. The literal + `3` and the XOR encode the four-quadrant visit order of the Hilbert + d-value (rx, ry) = (0,0)->0, (0,1)->1, (1,1)->2, (1,0)->3. The + `s << 1` doubles s so hilbert_rot receives the full sub-grid size + for this level, not the half-size step. */ +static uint64_t hilbert_xy2d_64(uint32_t x, uint32_t y) +{ + uint64_t d = 0; + for (uint64_t s = HILBERT_N >> 1; s > 0; s >>= 1) + { + uint32_t rx = (x & s) > 0 ? 1 : 0; + uint32_t ry = (y & s) > 0 ? 1 : 0; + d += s * s * (uint64_t)((3 * rx) ^ ry); + hilbert_rot((uint32_t)s << 1, &x, &y, rx, ry); + } + return d; +} + +/* Store uint64 as 8-byte big-endian (for lexicographic ordering in LSM). + Most significant byte first so that memcmp on the encoded bytes matches + the natural numeric ordering of the Hilbert value. 
*/ +static inline void encode_hilbert_be(uint64_t h, uchar *out) +{ + for (uint i = 0; i < SPATIAL_HILBERT_KEY_LEN; i++) + out[i] = (uchar)(h >> ((SPATIAL_HILBERT_KEY_LEN - 1 - i) * BITS_PER_BYTE)); +} + +/* Decode 8-byte big-endian uint64 */ +static inline uint64_t decode_hilbert_be(const uchar *in) +{ + uint64_t h = 0; + for (uint i = 0; i < SPATIAL_HILBERT_KEY_LEN; i++) h = (h << BITS_PER_BYTE) | (uint64_t)in[i]; + return h; +} + +/* WKB geometry type constants */ +static constexpr uint32_t WKB_POINT = 1; +static constexpr uint32_t WKB_LINESTRING = 2; +static constexpr uint32_t WKB_POLYGON = 3; +static constexpr uint32_t WKB_MULTIPOINT = 4; +static constexpr uint32_t WKB_MULTILINESTRING = 5; +static constexpr uint32_t WKB_MULTIPOLYGON = 6; +static constexpr uint32_t WKB_GEOMETRYCOLLECTION = 7; + +/* Limits to reject malformed WKB data */ +static constexpr uint32_t WKB_MAX_POINTS = 1000000; +static constexpr uint32_t WKB_MAX_RINGS = 10000; +static constexpr uint32_t WKB_MAX_GEOMS = 100000; +static constexpr uint SPATIAL_SRID_SIZE = 4; +static constexpr uint SPATIAL_WKB_HEADER_SIZE = 5; /* 1 byte_order + 4 type */ +static constexpr uint SPATIAL_POINT_DATA_SIZE = 16; /* 2 doubles (x, y) */ + +/* WKB encodes its count fields (point/ring/geometry counts) as uint32_t. */ +static constexpr uint WKB_COUNT_SIZE = sizeof(uint32_t); + +/* Parts-of-MBR encoding. spatial_build_value writes [xmin,ymin,xmax,ymax] + as native doubles, and spatial_parse_query_mbr reads MariaDB's + [xmin,xmax,ymin,ymax] layout. Sentinels for offset arithmetic. */ +static constexpr uint MBR_DOUBLE_SIZE = sizeof(double); +static constexpr uint MBR_OFFSET_SECOND = 1 * MBR_DOUBLE_SIZE; +static constexpr uint MBR_OFFSET_THIRD = 2 * MBR_DOUBLE_SIZE; +static constexpr uint MBR_OFFSET_FOURTH = 3 * MBR_DOUBLE_SIZE; + +/* Read a coordinate pair from WKB and expand MBR. + Advances pp by SPATIAL_POINT_DATA_SIZE bytes. + Skips NaN/Inf coordinates. 
*/ +static inline bool wkb_read_point(const uchar *&pp, const uchar *ee, double &mn_x, double &mn_y, + double &mx_x, double &mx_y) +{ + if (pp + SPATIAL_POINT_DATA_SIZE > ee) return false; + double x, y; + float8get(x, pp); + float8get(y, pp + MBR_DOUBLE_SIZE); + pp += SPATIAL_POINT_DATA_SIZE; + if (std::isfinite(x) && std::isfinite(y)) + { + if (x < mn_x) mn_x = x; + if (x > mx_x) mx_x = x; + if (y < mn_y) mn_y = y; + if (y > mx_y) mx_y = y; + } + return true; +} + +/* Read a point sequence ([num_points 4B][x,y pairs...]) and expand MBR. + Used by LINESTRING and each POLYGON ring. */ +static inline bool wkb_read_point_sequence(const uchar *&pp, const uchar *ee, double &mn_x, + double &mn_y, double &mx_x, double &mx_y) +{ + if (pp + WKB_COUNT_SIZE > ee) return false; + uint32_t n_pts; + memcpy(&n_pts, pp, WKB_COUNT_SIZE); + pp += WKB_COUNT_SIZE; + if (n_pts > WKB_MAX_POINTS) return false; + for (uint32_t i = 0; i < n_pts; i++) + { + if (!wkb_read_point(pp, ee, mn_x, mn_y, mx_x, mx_y)) return false; + } + return true; +} + +/* Maximum nesting depth for a GEOMETRYCOLLECTION (or any of the MULTI + types). Stops a pathologically nested geometry from blowing the stack + through wkb_parse_geometry's recursion; far above any real geometry + the server would actually accept. */ +static constexpr int WKB_MAX_RECURSION_DEPTH = 32; + +/* Recursive WKB geometry parser. Reads one geometry object from pp, + expanding the MBR to include all coordinate pairs. Advances pp past + the consumed bytes. Supports all 7 OGC geometry types. The depth + argument bounds recursive descent into GEOMETRYCOLLECTION children. */ +static bool wkb_parse_geometry(const uchar *&pp, const uchar *ee, double &mn_x, double &mn_y, + double &mx_x, double &mx_y, int depth) +{ + if (depth > WKB_MAX_RECURSION_DEPTH) return false; + if (pp + SPATIAL_WKB_HEADER_SIZE > ee) return false; + /* MariaDB stores WKB in native byte order, so the leading byte is the + native endianness marker (0 = big, 1 = little). 
We rely on native + order for the memcpy reads of the geometry type and coordinates + below; if MariaDB ever changed to store non-native WKB, this assert + would fire instead of silently returning garbage MBRs. Release + builds simply trust the convention. */ +#ifndef DBUG_OFF + { + const uint32_t endian_probe = 1; + uchar native_byte_order = *(const uchar *)&endian_probe; /* 1 on LE, 0 on BE */ + DBUG_ASSERT(*pp == native_byte_order); + } +#endif + pp++; /* we skip byte_order (MariaDB stores in native order) */ + uint32_t gt; + memcpy(>, pp, WKB_COUNT_SIZE); + pp += WKB_COUNT_SIZE; + + switch (gt) + { + case WKB_POINT: + return wkb_read_point(pp, ee, mn_x, mn_y, mx_x, mx_y); + + case WKB_LINESTRING: + return wkb_read_point_sequence(pp, ee, mn_x, mn_y, mx_x, mx_y); + + case WKB_POLYGON: + { + if (pp + WKB_COUNT_SIZE > ee) return false; + uint32_t n_rings; + memcpy(&n_rings, pp, WKB_COUNT_SIZE); + pp += WKB_COUNT_SIZE; + if (n_rings > WKB_MAX_RINGS) return false; + for (uint32_t r = 0; r < n_rings; r++) + { + if (!wkb_read_point_sequence(pp, ee, mn_x, mn_y, mx_x, mx_y)) return false; + } + return true; + } + + case WKB_MULTIPOINT: + case WKB_MULTILINESTRING: + case WKB_MULTIPOLYGON: + case WKB_GEOMETRYCOLLECTION: + { + if (pp + WKB_COUNT_SIZE > ee) return false; + uint32_t n_geoms; + memcpy(&n_geoms, pp, WKB_COUNT_SIZE); + pp += WKB_COUNT_SIZE; + if (n_geoms > WKB_MAX_GEOMS) return false; + for (uint32_t i = 0; i < n_geoms; i++) + { + if (!wkb_parse_geometry(pp, ee, mn_x, mn_y, mx_x, mx_y, depth + 1)) return false; + } + return true; + } + + default: + return false; + } +} + +/* Extract MBR from a GEOMETRY field's raw data (SRID prefix + WKB). + Supports all OGC geometry types. Rejects malformed data and + coordinates with NaN/Inf values. + Returns true on success, false on malformed data. 
*/ +static bool spatial_compute_mbr(const uchar *data, size_t len, double *xmin, double *ymin, + double *xmax, double *ymax) +{ + if (len < SPATIAL_SRID_SIZE + SPATIAL_WKB_HEADER_SIZE) return false; + + const uchar *p = data + SPATIAL_SRID_SIZE; + const uchar *end = data + len; + + *xmin = *ymin = DBL_MAX; + *xmax = *ymax = -DBL_MAX; + + if (!wkb_parse_geometry(p, end, *xmin, *ymin, *xmax, *ymax, 0)) return false; + + return *xmin <= *xmax && *ymin <= *ymax; +} + +/* Build spatial index key ( [hilbert_value 8B BE][pk_bytes] ) + Returns total key length. */ +static uint spatial_build_key(double cx, double cy, const uchar *pk, uint pk_len, uchar *out) +{ + uint32_t qx = double_to_lex_uint32(cx); + uint32_t qy = double_to_lex_uint32(cy); + uint64_t h = hilbert_xy2d_64(qx, qy); + encode_hilbert_be(h, out); + memcpy(out + SPATIAL_HILBERT_KEY_LEN, pk, pk_len); + return SPATIAL_HILBERT_KEY_LEN + pk_len; +} + +/* Build spatial index value( [xmin 8B][ymin 8B][xmax 8B][ymax 8B] ) = 32 bytes. + Stored as native doubles (little-endian on x86). */ +static void spatial_build_value(double xmin, double ymin, double xmax, double ymax, uchar *out) +{ + memcpy(out, &xmin, MBR_DOUBLE_SIZE); + memcpy(out + MBR_OFFSET_SECOND, &ymin, MBR_DOUBLE_SIZE); + memcpy(out + MBR_OFFSET_THIRD, &xmax, MBR_DOUBLE_SIZE); + memcpy(out + MBR_OFFSET_FOURTH, &ymax, MBR_DOUBLE_SIZE); +} + +/* Parse MBR from MariaDB's spatial key buffer. + MariaDB format( [xmin 8B][xmax 8B][ymin 8B][ymax 8B] ). A malformed + key whose stored min exceeds its max would underflow the grid-cell + subtraction in spatial_decompose_ranges and ask reserve for a billion + slots, so the corners are normalised here at the parse boundary. 
*/ +static void spatial_parse_query_mbr(const uchar *key, tdb_mbr_t *mbr) +{ + float8get(mbr->xmin, key); + float8get(mbr->xmax, key + MBR_OFFSET_SECOND); + float8get(mbr->ymin, key + MBR_OFFSET_THIRD); + float8get(mbr->ymax, key + MBR_OFFSET_FOURTH); + if (mbr->xmin > mbr->xmax) std::swap(mbr->xmin, mbr->xmax); + if (mbr->ymin > mbr->ymax) std::swap(mbr->ymin, mbr->ymax); +} + +/* MBR spatial predicates -- match MariaDB MBR class semantics exactly */ +static inline bool mbr_intersects(const tdb_mbr_t *a, const tdb_mbr_t *b) +{ + return !(a->xmax < b->xmin || a->xmin > b->xmax || a->ymax < b->ymin || a->ymin > b->ymax); +} + +static inline bool mbr_within(const tdb_mbr_t *a, const tdb_mbr_t *b) +{ + return a->xmin >= b->xmin && a->xmax <= b->xmax && a->ymin >= b->ymin && a->ymax <= b->ymax; +} + +static inline bool mbr_equals(const tdb_mbr_t *a, const tdb_mbr_t *b) +{ + return a->xmin == b->xmin && a->xmax == b->xmax && a->ymin == b->ymin && a->ymax == b->ymax; +} + +static inline bool mbr_disjoint(const tdb_mbr_t *a, const tdb_mbr_t *b) +{ + return !mbr_intersects(a, b); +} + +/* Dispatch MBR predicate based on ha_rkey_function spatial mode. + Returns true if the entry MBR matches the query predicate. */ +static bool spatial_mbr_predicate(enum ha_rkey_function mode, const tdb_mbr_t *query, + const tdb_mbr_t *entry) +{ + /* MariaDB's CONTAIN and WITHIN both reduce to "row MBR is within the + query MBR" once the SQL-layer argument order is normalised, so they + map to the same mbr_within(entry, query) call below. Intersect is + symmetric. */ + switch (mode) + { + case HA_READ_MBR_INTERSECT: + return mbr_intersects(entry, query); + case HA_READ_MBR_CONTAIN: + return mbr_within(entry, query); + case HA_READ_MBR_WITHIN: + return mbr_within(entry, query); + case HA_READ_MBR_EQUAL: + return mbr_equals(entry, query); + case HA_READ_MBR_DISJOINT: + return mbr_disjoint(entry, query); + default: + return false; + } +} + +/* Hilbert range decomposition resolution. 
At SPATIAL_DECOMP_BITS bits per + axis, the coordinate space is divided into a 2^N x 2^N grid. Higher + values produce tighter ranges (fewer false positives) but more ranges + to scan (more seeks). 8 bits = 256x256 grid, at most 65536 cells but + typically 10-50 merged ranges for a small query box. */ +static constexpr uint SPATIAL_DECOMP_BITS = 8; +static constexpr uint SPATIAL_DECOMP_N = 1u << SPATIAL_DECOMP_BITS; +static_assert(SPATIAL_DECOMP_BITS < HILBERT_ORDER, + "SPATIAL_DECOMP_BITS must be < HILBERT_ORDER or shift underflows"); + +/* Cell-count cap. Above this we fall back to a single full-range scan; + the MBR post-filter rejects non-overlapping rows on the read side, so + the only cost is reading more keys, not returning wrong rows. Without + this cap a query MBR covering the whole universe allocates ~512 KB of + uint64_t for the cells vector and then sorts it -- the full-scan path + does that work for free. */ +static constexpr uint SPATIAL_DECOMP_FULL_SCAN_THRESHOLD = SPATIAL_DECOMP_N * 16; + +/* Compute the Hilbert ranges that cover a quantized bounding box. + Enumerates grid cells at SPATIAL_DECOMP_BITS resolution, computes + the Hilbert value for each, sorts, and merges contiguous values + into non-overlapping ranges. Each range maps back to the full + 32-bit Hilbert space by shifting. */ +static void spatial_decompose_ranges(uint32_t qx_min, uint32_t qy_min, uint32_t qx_max, + uint32_t qy_max, + std::vector> &out) +{ + out.clear(); + + uint shift = HILBERT_ORDER - SPATIAL_DECOMP_BITS; + uint gx0 = qx_min >> shift; + uint gy0 = qy_min >> shift; + uint gx1 = qx_max >> shift; + uint gy1 = qy_max >> shift; + + if (gx1 >= SPATIAL_DECOMP_N) gx1 = SPATIAL_DECOMP_N - 1; + if (gy1 >= SPATIAL_DECOMP_N) gy1 = SPATIAL_DECOMP_N - 1; + + /* Wide query box. Falling back to a single full-range scan beats + enumerating + sorting 65k cells when the post-filter is going to + reject most of them anyway. 
*/ + const uint64_t cell_count = (uint64_t)(gx1 - gx0 + 1) * (uint64_t)(gy1 - gy0 + 1); + if (cell_count > SPATIAL_DECOMP_FULL_SCAN_THRESHOLD) + { + out.push_back({HILBERT_RANGE_FULL_LO, HILBERT_RANGE_FULL_HI}); + return; + } + + std::vector cells; + cells.reserve((size_t)cell_count); + for (uint gx = gx0; gx <= gx1; gx++) + { + for (uint gy = gy0; gy <= gy1; gy++) + { + /* We compute coarse hilbert value and scale to full 64-bit space. + The coarse cell (gx, gy) at SPATIAL_DECOMP_BITS resolution + maps to hilbert values in [h_coarse << (2*shift), (h_coarse+1) << (2*shift) - 1] */ + uint64_t h = hilbert_xy2d_64(gx << shift, gy << shift); + cells.push_back(h); + } + } + + if (cells.empty()) + { + /* Degenerate query box -- fall back to a full scan. */ + out.push_back({HILBERT_RANGE_FULL_LO, HILBERT_RANGE_FULL_HI}); + return; + } + + std::sort(cells.begin(), cells.end()); + + /* Each coarse cell covers a range of 2^(HILBERT_DIM*shift) fine hilbert + values, er shift bits per axis times HILBERT_DIM axes. 
*/ + uint64_t cell_span = (uint64_t)1 << (HILBERT_DIM * shift); + + uint64_t range_lo = cells[0]; + uint64_t range_hi = cells[0] + cell_span - 1; + + for (size_t i = 1; i < cells.size(); i++) + { + uint64_t lo = cells[i]; + uint64_t hi = cells[i] + cell_span - 1; + + if (lo <= range_hi + 1) + { + if (hi > range_hi) range_hi = hi; + } + else + { + out.push_back({range_lo, range_hi}); + range_lo = lo; + range_hi = hi; + } + } + out.push_back({range_lo, range_hi}); +} + +/* ******************** System variables (global DB config) ******************** */ + +static ulong srv_flush_threads = 4; +static ulong srv_max_concurrent_flushes = 0; /* 0 = align with srv_flush_threads */ +static ulong srv_compaction_threads = 4; +static ulong srv_log_level = 0; /* TDB_LOG_DEBUG */ +static ulonglong srv_block_cache_size = TIDESDB_DEFAULT_BLOCK_CACHE; /* 256M */ +static ulong srv_max_open_sstables = 256; +static ulonglong srv_max_memory_usage = 0; /* 0 = auto (library decides) */ +static my_bool srv_log_to_file = 1; /* write TidesDB logs to file (default is yes) */ +static ulonglong srv_log_truncation_at = 24ULL * 1024 * 1024; /* log file truncation size (24MB) */ +static my_bool srv_unified_memtable = 1; /* 1 = unified WAL+memtable (default), 0 = per-CF */ +static ulonglong srv_unified_memtable_write_buffer_size = 256ULL * 1024 * 1024; /* 256MB */ + +/* Per-session TTL override (seconds). 0 = use table default. */ +static MYSQL_THDVAR_ULONGLONG(ttl, PLUGIN_VAR_RQCMDARG, + "Per-session TTL in seconds applied to INSERT/UPDATE; " + "0 means use the table-level TTL option; " + "can be set with SET [SESSION] tidesdb_ttl=N or " + "SET STATEMENT tidesdb_ttl=N FOR INSERT", + NULL, NULL, 0, 0, ULONGLONG_MAX, 0); + +/* Per-session skip unique check (for bulk loads where PK duplicates + are known impossible). Same pattern as MyRocks rocksdb_skip_unique_check. 
*/ +static MYSQL_THDVAR_BOOL(skip_unique_check, PLUGIN_VAR_RQCMDARG, + "Skip uniqueness check on primary key and unique secondary indexes " + "during INSERT. Only safe when the application guarantees no " + "duplicates (e.g. bulk loads with monotonic PKs). " + "SET SESSION tidesdb_skip_unique_check=1", + NULL, NULL, 0); + +/* Per-session row-count threshold for the post-delete range compaction + trigger. Zero disables the feature. When non-zero, the engine tracks + the comparable min/max PK bytes touched by a single multi-row DELETE + statement (the start_bulk_delete / end_bulk_delete envelope around + range deletes) and, if the deleted row count is at least the threshold, + calls tidesdb_compact_range on the primary CF over the touched range + at end-of-statement to physically reclaim the freshly-tombstoned range + without waiting for a structural compaction trigger. Threshold avoids + making small DELETEs pay synchronous compaction cost. */ +static MYSQL_THDVAR_ULONGLONG( + compact_after_range_delete_min_rows, PLUGIN_VAR_RQCMDARG, + "If non-zero, after a multi-row DELETE statement that touches at least " + "this many rows, call tidesdb_compact_range over the touched primary-key " + "range to physically reclaim tombstoned space. Default 0 disables the " + "feature; set to 0 to keep the post-DELETE behavior unchanged", + NULL, NULL, 0, 0, ULONGLONG_MAX, 1); + +/* Per-session opt-in for single-delete semantics on the primary row CF. + Secondary-index deletes always use tidesdb_txn_single_delete because + each (col_values, pk) / (term, pk) / (hilbert, pk) composite is written + exactly once per row lifetime and deleted exactly once -- the + single-delete contract holds unconditionally for those. + For the primary CF the contract is narrower, UPDATE ... SET non_pk_col + writes tidesdb_txn_put(share->cf, data_key(pk), ...) with the same PK, + producing a put-over-put, and REPLACE INTO / INSERT ... 
ON DUPLICATE + KEY UPDATE on tables with no secondary indexes does the same via a + silent overwrite. Under either pattern, dropping a put+single-delete + pair at compaction can re-expose an older put. Enabling this variable + is the caller's promise that the session does none of the above -- + typical insert-then-delete, log-style, append-only workloads. */ +static MYSQL_THDVAR_BOOL(single_delete_primary, PLUGIN_VAR_RQCMDARG, + "Use single-delete semantics for the primary row CF on DELETE. " + "Caller promises no UPDATE on non-PK columns, no REPLACE INTO, " + "and no INSERT ... ON DUPLICATE KEY UPDATE on tables without " + "secondary indexes for this session. Violating the contract may " + "re-expose older row versions after compaction. Safe choice: " + "leave OFF unless the session is INSERT-and-DELETE only. " + "SET SESSION tidesdb_single_delete_primary=1", + NULL, NULL, 0); + +static MYSQL_THDVAR_ULONG(backpressure_wait_timeout_ms, PLUGIN_VAR_RQCMDARG, + "Milliseconds the plugin will block a writer on TidesDB " + "back-pressure (memtable/flush queue/L0 backlog at soft cap) " + "before surfacing it to the SQL layer as a lock-wait-timeout. " + "0 disables blocking and returns the timeout immediately", + NULL, NULL, TDB_BACKPRESSURE_DEFAULT_TIMEOUT_MS, + TDB_BACKPRESSURE_MIN_TIMEOUT_MS, TDB_BACKPRESSURE_MAX_TIMEOUT_MS, 0); + +static MYSQL_THDVAR_ULONG(lock_wait_timeout_ms, PLUGIN_VAR_RQCMDARG, + "Milliseconds a pessimistic row-lock acquire will wait " + "before returning HA_ERR_LOCK_WAIT_TIMEOUT. Mirrors " + "innodb_lock_wait_timeout (default 50000 = 50 s). " + "0 disables the timeout (wait bounded only by KILL QUERY)", + NULL, NULL, TDB_LOCK_WAIT_DEFAULT_TIMEOUT_MS, + TDB_LOCK_WAIT_MIN_TIMEOUT_MS, TDB_LOCK_WAIT_MAX_TIMEOUT_MS, 0); + +/* Definitions for the forward decls near tidesdb_txn_delete_cf -- placed here + so they have the THDVAR macros in scope. 
Each returns the configured wait + budget for the session, or the compile-time default when called without a + THD (e.g. background paths). */ +static ulong tdb_backpressure_timeout_ms(THD *thd) +{ + if (!thd) return TDB_BACKPRESSURE_DEFAULT_TIMEOUT_MS; + return THDVAR(thd, backpressure_wait_timeout_ms); +} + +static ulong tdb_lock_wait_timeout_ms(THD *thd) +{ + if (!thd) return TDB_LOCK_WAIT_DEFAULT_TIMEOUT_MS; + return THDVAR(thd, lock_wait_timeout_ms); +} + +/* Session-level defaults for table options. + These are used by HA_TOPTION_SYSVAR so that CREATE TABLE without + explicit options inherits the session/global default. Dynamic and + session-scoped, matching InnoDB's innodb_default_* pattern. */ + +static const char *compression_names[] = {"NONE", "SNAPPY", "LZ4", "ZSTD", "LZ4_FAST", NullS}; +static TYPELIB compression_typelib = {array_elements(compression_names) - 1, "compression_typelib", + compression_names, NULL, NULL}; + +static MYSQL_THDVAR_ENUM(default_compression, PLUGIN_VAR_RQCMDARG, + "Default compression algorithm for new tables " + "(NONE, SNAPPY, LZ4, ZSTD, LZ4_FAST)", + NULL, NULL, 2 /* LZ4 */, &compression_typelib); + +static MYSQL_THDVAR_ULONGLONG(default_write_buffer_size, PLUGIN_VAR_RQCMDARG, + "Default write buffer size in bytes for new tables", NULL, NULL, + TIDESQL_DEFAULT_WRITE_BUFFER_SIZE, 1024, ULONGLONG_MAX, 1024); + +static MYSQL_THDVAR_BOOL(default_bloom_filter, PLUGIN_VAR_RQCMDARG, + "Default bloom filter setting for new tables", NULL, NULL, 1); + +static MYSQL_THDVAR_BOOL(default_use_btree, PLUGIN_VAR_RQCMDARG, + "Default USE_BTREE setting for new tables (0=LSM, 1=B-tree)", NULL, NULL, + 0); + +static MYSQL_THDVAR_BOOL(default_block_indexes, PLUGIN_VAR_RQCMDARG, + "Default block indexes setting for new tables", NULL, NULL, 1); + +static const char *sync_mode_names[] = {"NONE", "INTERVAL", "FULL", NullS}; +static TYPELIB sync_mode_typelib = {array_elements(sync_mode_names) - 1, "sync_mode_typelib", + sync_mode_names, NULL, NULL}; + 
+static MYSQL_THDVAR_ENUM(default_sync_mode, PLUGIN_VAR_RQCMDARG, + "Default sync mode for new tables. Governs SSTable file sync " + "(klog and vlog). Under tidesdb_unified_memtable=ON the shared " + "WAL is fsynced according to tidesdb_unified_memtable_sync_mode " + "instead, so this option does not control WAL durability for " + "new tables. Choose NONE, INTERVAL or FULL", + NULL, NULL, 2 /* FULL */, &sync_mode_typelib); + +static MYSQL_THDVAR_ULONGLONG(default_sync_interval_us, PLUGIN_VAR_RQCMDARG, + "Default sync interval in microseconds for new tables " + "(used when SYNC_MODE=INTERVAL)", + NULL, NULL, TIDESQL_DEFAULT_SYNC_INTERVAL_US, 0, ULONGLONG_MAX, 1); + +static MYSQL_THDVAR_ULONGLONG(default_bloom_fpr, PLUGIN_VAR_RQCMDARG, + "Default bloom filter false positive rate for new tables " + "(parts per 10000; 100 = 1%%)", + NULL, NULL, 100, 1, 10000, 1); + +static MYSQL_THDVAR_ULONGLONG(default_klog_value_threshold, PLUGIN_VAR_RQCMDARG, + "Default klog value threshold in bytes for new tables " + "(values >= this go to vlog)", + NULL, NULL, TIDESQL_DEFAULT_KLOG_VALUE_THRESHOLD, 0, ULONGLONG_MAX, + 1); + +static MYSQL_THDVAR_ULONGLONG(default_l0_queue_stall_threshold, PLUGIN_VAR_RQCMDARG, + "Default L0 queue stall threshold for new tables", NULL, NULL, 10, 1, + 1024, 1); + +static MYSQL_THDVAR_ULONGLONG(default_l1_file_count_trigger, PLUGIN_VAR_RQCMDARG, + "Default L1 file count compaction trigger for new tables", NULL, NULL, + 4, 1, 1024, 1); + +static MYSQL_THDVAR_ULONGLONG(default_level_size_ratio, PLUGIN_VAR_RQCMDARG, + "Default level size ratio for new tables", NULL, NULL, + TIDESQL_DEFAULT_LEVEL_SIZE_RATIO, 2, 100, 1); + +static MYSQL_THDVAR_ULONGLONG(default_min_levels, PLUGIN_VAR_RQCMDARG, + "Default minimum LSM-tree levels for new tables. 
Matches " + "TIDESQL_DEFAULT_MIN_LEVELS in the TidesDB library", + NULL, NULL, TIDESQL_DEFAULT_MIN_LEVELS, 1, 64, 1); + +static MYSQL_THDVAR_ULONGLONG(default_dividing_level_offset, PLUGIN_VAR_RQCMDARG, + "Default dividing level offset for new tables. Matches " + "TIDESQL_DEFAULT_DIVIDING_LEVEL_OFFSET in the TidesDB library", + NULL, NULL, TIDESQL_DEFAULT_DIVIDING_LEVEL_OFFSET, 0, 64, 1); + +static MYSQL_THDVAR_ULONGLONG(default_skip_list_max_level, PLUGIN_VAR_RQCMDARG, + "Default skip list max level for new tables", NULL, NULL, 12, 1, 64, + 1); + +static MYSQL_THDVAR_ULONGLONG( + default_skip_list_probability, PLUGIN_VAR_RQCMDARG, + "Default skip list probability for new tables (percentage; 25 = 0.25)", NULL, NULL, 25, 1, 100, + 1); + +static MYSQL_THDVAR_ULONGLONG(default_index_sample_ratio, PLUGIN_VAR_RQCMDARG, + "Default block index sample ratio for new tables", NULL, NULL, + TIDESQL_DEFAULT_INDEX_SAMPLE_RATIO, 1, 1024, 1); + +static MYSQL_THDVAR_ULONGLONG(default_block_index_prefix_len, PLUGIN_VAR_RQCMDARG, + "Default block index prefix length for new tables", NULL, NULL, + TIDESQL_DEFAULT_BLOCK_INDEX_PREFIX_LEN, 1, 256, 1); + +static MYSQL_THDVAR_ULONGLONG(default_min_disk_space, PLUGIN_VAR_RQCMDARG, + "Default minimum disk space in bytes for new tables", NULL, NULL, + TIDESQL_DEFAULT_MIN_DISK_SPACE, 0, ULONGLONG_MAX, 1024); + +static MYSQL_THDVAR_BOOL(default_object_lazy_compaction, PLUGIN_VAR_RQCMDARG, + "Default object store lazy compaction for new tables. " + "When enabled, doubles the L1 file count compaction trigger " + "to reduce remote I/O at the cost of higher read amplification", + NULL, NULL, 0); + +static MYSQL_THDVAR_BOOL(default_object_prefetch_compaction, PLUGIN_VAR_RQCMDARG, + "Default object store prefetch compaction for new tables. " + "When enabled, downloads all input SSTables in parallel " + "before compaction merge begins", + NULL, NULL, 1); + +/* Tombstone-density compaction trigger (parts per 10000 -- 5000 = 0.50 ratio). 
+ When non-zero, after each flush the engine inspects level-1 SSTables and + escalates compaction for any single SST whose tombstone count divided by + entry count exceeds this ratio while having at least + tombstone_density_min_entries entries. Default 0 keeps the existing + structural-trigger behavior. */ +static MYSQL_THDVAR_ULONGLONG(default_tombstone_density_trigger, PLUGIN_VAR_RQCMDARG, + "Default tombstone-density compaction trigger ratio for new tables, " + "expressed as parts per 10000 (5000 = 0.50, 0 disables). When set, " + "compaction is escalated for any level-1 SSTable whose tombstone " + "count divided by entry count exceeds the ratio", + NULL, NULL, 0, 0, 10000, 1); + +static MYSQL_THDVAR_ULONGLONG(default_tombstone_density_min_entries, PLUGIN_VAR_RQCMDARG, + "Minimum entry count for an SSTable to be considered by the " + "tombstone-density trigger; smaller SSTables are ignored", + NULL, NULL, 1024, 0, ULONGLONG_MAX, 1); + +static const char *isolation_level_names[] = { + "READ_UNCOMMITTED", "READ_COMMITTED", "REPEATABLE_READ", "SNAPSHOT", "SERIALIZABLE", NullS}; +static TYPELIB isolation_level_typelib = {array_elements(isolation_level_names) - 1, + "isolation_level_typelib", isolation_level_names, NULL, + NULL}; + +static MYSQL_THDVAR_ENUM(default_isolation_level, PLUGIN_VAR_RQCMDARG, + "Default isolation level for new tables " + "(READ_UNCOMMITTED, READ_COMMITTED, REPEATABLE_READ, SNAPSHOT, " + "SERIALIZABLE)", + NULL, NULL, 2 /* REPEATABLE_READ */, &isolation_level_typelib); + +static const char *log_level_names[] = {"DEBUG", "INFO", "WARN", "ERROR", "FATAL", "NONE", NullS}; +static TYPELIB log_level_typelib = {array_elements(log_level_names) - 1, "log_level_typelib", + log_level_names, NULL, NULL}; + +static MYSQL_SYSVAR_ULONG(flush_threads, srv_flush_threads, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of TidesDB flush threads", NULL, NULL, 4, 1, 64, 0); + +static MYSQL_SYSVAR_ULONG(max_concurrent_flushes, srv_max_concurrent_flushes, 
+ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Global cap on in-flight memtable flushes. 0 (default) " + "aligns the cap with tidesdb_flush_threads so every " + "configured flush worker can run. Setting a cap below " + "tidesdb_flush_threads leaves workers idle and logs a " + "startup warning", + NULL, NULL, 0, 0, 1024, 0); + +static MYSQL_SYSVAR_ULONG(compaction_threads, srv_compaction_threads, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of TidesDB compaction threads", NULL, NULL, 4, 1, 64, 0); + +static MYSQL_SYSVAR_ENUM(log_level, srv_log_level, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "TidesDB log level (DEBUG, INFO, WARN, ERROR, FATAL, NONE)", NULL, NULL, 0, + &log_level_typelib); + +/* Conflict information logging. + Similar to innodb_print_all_deadlocks -- logs all TDB_ERR_CONFLICT + events to the error log with transaction and table details. + (srv_print_all_conflicts, last_conflict_mutex, last_conflict_info + are forward-declared near tdb_rc_to_ha().) */ +static MYSQL_SYSVAR_BOOL(print_all_conflicts, srv_print_all_conflicts, PLUGIN_VAR_RQCMDARG, + "Log all TidesDB conflict errors to the error log " + "(similar to innodb_print_all_deadlocks)", + NULL, NULL, 0); + +static MYSQL_SYSVAR_BOOL(pessimistic_locking, srv_pessimistic_locking, PLUGIN_VAR_RQCMDARG, + "Enable plugin-level row locks for SELECT ... FOR UPDATE, " + "UPDATE, DELETE, and INSERT on user-defined primary keys. " + "ON (default): write-intent statements acquire per-row X locks " + "and plain reads under REPEATABLE_READ / SERIALIZABLE acquire " + "S locks; multiple S holders coexist, S blocks while an X is " + "waiting (writer fairness). Deadlock detection via wait-for " + "graph traversal; bounded by tidesdb_lock_wait_timeout_ms. " + "Locks held until COMMIT or ROLLBACK. Both explicit and " + "autocommit transactions participate. Locks can be acquired " + "on non-existing keys (e.g. SFU on a missing row blocks INSERT " + "of that key). 
" + "OFF: pure optimistic MVCC -- concurrent writers on the same " + "row are detected at COMMIT time (TDB_ERR_CONFLICT) and the " + "application must retry", + NULL, NULL, 1); + +static MYSQL_SYSVAR_ULONG(fts_min_word_len, srv_fts_min_word_len, PLUGIN_VAR_RQCMDARG, + "Minimum word length (in characters) for full-text indexing. " + "Shorter words are excluded from the index and search queries", + NULL, NULL, 3, 1, 84, 0); + +static MYSQL_SYSVAR_ULONG(fts_max_word_len, srv_fts_max_word_len, PLUGIN_VAR_RQCMDARG, + "Maximum word length (in characters) for full-text indexing. " + "Longer words are excluded from the index and search queries", + NULL, NULL, 84, 1, 512, 0); + +static MYSQL_SYSVAR_DOUBLE(fts_bm25_k1, srv_fts_bm25_k1, PLUGIN_VAR_RQCMDARG, + "BM25 k1 parameter controlling term-frequency saturation. " + "Higher values give more weight to repeated terms. " + "Standard default is 1.2", + NULL, NULL, 1.2, 0.0, 10.0, 0); + +static MYSQL_SYSVAR_DOUBLE(fts_bm25_b, srv_fts_bm25_b, PLUGIN_VAR_RQCMDARG, + "BM25 b parameter controlling document-length normalization. " + "0 = no normalization, 1 = full normalization. " + "Standard default is 0.75", + NULL, NULL, 0.75, 0.0, 1.0, 0); + +static MYSQL_SYSVAR_STR(fts_blend_chars, srv_fts_blend_chars, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, + "Characters treated as both separators and valid word characters " + "in full-text indexing. When a blend character appears inside a " + "token, the tokenizer emits the full blended form plus each " + "sub-part on either side. For example, with blend_chars=\"'\" " + "the input \"l'aria\" produces three tokens (l'aria, l, aria) " + "and the single-character \"l\" is then dropped by the default " + "tidesdb_fts_min_word_len=3. Set to \"'\" for Italian/French " + "elision support. 
Default is empty (no blend characters)", + NULL, tdb_fts_blend_chars_update, NULL); + +static MYSQL_SYSVAR_STR(ft_stopword_table, srv_ft_stopword_table, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, + "User-defined stop word table in 'db_name/table_name' format. " + "The table must have a VARCHAR column named 'value'. " + "When NULL (default), uses the same 36 default stop words as " + "information_schema.INNODB_FT_DEFAULT_STOPWORD. " + "Set to empty string to disable stop word filtering entirely", + NULL, tdb_ft_stopword_table_update, NULL); + +static MYSQL_SYSVAR_ULONGLONG(block_cache_size, srv_block_cache_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "TidesDB global block cache size in bytes", NULL, NULL, + TIDESDB_DEFAULT_BLOCK_CACHE, 0, ULONGLONG_MAX, 0); + +static MYSQL_SYSVAR_ULONG(max_open_sstables, srv_max_open_sstables, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Max cached SSTable structures in LRU cache", NULL, NULL, 256, 1, 65536, + 0); + +static MYSQL_SYSVAR_ULONGLONG(max_memory_usage, srv_max_memory_usage, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "TidesDB global memory limit in bytes " + "(0 = auto, 50% of system RAM; minimum 5% of system RAM)", + NULL, NULL, 0, 0, ULONGLONG_MAX, 0); + +static MYSQL_SYSVAR_BOOL(log_to_file, srv_log_to_file, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Write TidesDB logs to a LOG file in the data directory " + "instead of stderr (default: ON)", + NULL, NULL, 1); + +static MYSQL_SYSVAR_ULONGLONG(log_truncation_at, srv_log_truncation_at, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "TidesDB log file truncation size in bytes " + "(0 disables truncation)", + NULL, NULL, 24ULL * 1024 * 1024, 0, ULONGLONG_MAX, 0); + +static MYSQL_SYSVAR_BOOL(unified_memtable, srv_unified_memtable, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Use a single unified WAL and memtable across all column families. " + "Reduces WAL fsync overhead from O(num_tables) to O(1) and provides " + "atomic cross-CF commits. 
Best for multi-table OLTP workloads. " + "Requires all CFs to use the same comparator (default: ON)", + NULL, NULL, 1); + +static MYSQL_SYSVAR_ULONGLONG(unified_memtable_write_buffer_size, + srv_unified_memtable_write_buffer_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Write buffer size in bytes for the unified memtable. " + "0 = automatic (library default). Only meaningful when " + "tidesdb_unified_memtable=ON", + NULL, NULL, 256ULL * 1024 * 1024, 0, ULONGLONG_MAX, 0); + +static ulong srv_unified_memtable_sync_mode = 2; /* FULL */ + +static MYSQL_SYSVAR_ENUM(unified_memtable_sync_mode, srv_unified_memtable_sync_mode, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Sync mode for the unified WAL when tidesdb_unified_memtable=ON. " + "NONE relies on the OS page cache and is the fastest. INTERVAL " + "syncs periodically every unified_memtable_sync_interval_us. FULL " + "fsyncs on every commit and is the most durable. This setting " + "governs WAL durability for every table under unified mode " + "regardless of any per-table SYNC_MODE option, which only " + "controls SSTable file sync", + NULL, NULL, 2 /* FULL */, &sync_mode_typelib); + +static ulonglong srv_unified_memtable_sync_interval = 128000; + +static MYSQL_SYSVAR_ULONGLONG(unified_memtable_sync_interval, srv_unified_memtable_sync_interval, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Sync interval in microseconds for the unified WAL " + "(only used when unified_memtable_sync_mode=INTERVAL)", + NULL, NULL, 128000, 0, ULONGLONG_MAX, 0); + +/* Skip-list tuning for the unified memtable. Per-CF equivalents + (skip_list_max_level, skip_list_probability) exist as table options; + the unified-mode memtable uses a single skiplist for the whole DB so + it needs its own global knob. Default 0 / 0.0 keeps the library + default. 
*/ +static ulong srv_unified_memtable_skip_list_max_level = 0; +static MYSQL_SYSVAR_ULONG( + unified_memtable_skip_list_max_level, srv_unified_memtable_skip_list_max_level, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Skip-list max level for the unified memtable; 0 keeps the library default", NULL, NULL, 0, 0, + 32, 0); + +static double srv_unified_memtable_skip_list_probability = 0.0; +static MYSQL_SYSVAR_DOUBLE( + unified_memtable_skip_list_probability, srv_unified_memtable_skip_list_probability, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Skip-list level promotion probability for the unified memtable; 0.0 keeps the library default", + NULL, NULL, 0.0, 0.0, 1.0, 0); + +/* Configurable data directory. + Defaults to NULL which means the plugin computes a sibling directory + of mysql_real_data_home. Setting this overrides the auto-computed path. */ +static char *srv_data_home_dir = NULL; + +static MYSQL_SYSVAR_STR(data_home_dir, srv_data_home_dir, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Directory where TidesDB stores its data files; " + "defaults to /../tidesdb_data; " + "must be set before server startup (read-only)", + NULL, NULL, NULL); + +/* ******************** Object Store Configuration ******************** */ + +/* Object store backend (0=LOCAL (no object store), 1=S3) */ +static ulong srv_object_store_backend = 0; +static const char *object_store_backend_names[] = {"LOCAL", "S3", NullS}; +static TYPELIB object_store_backend_typelib = {array_elements(object_store_backend_names) - 1, + "object_store_backend_typelib", + object_store_backend_names, NULL, NULL}; +static MYSQL_SYSVAR_ENUM(object_store_backend, srv_object_store_backend, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Object store backend (LOCAL=disabled, S3=S3-compatible)", NULL, NULL, 0, + &object_store_backend_typelib); + +static char *srv_s3_endpoint = NULL; +static MYSQL_SYSVAR_STR(s3_endpoint, srv_s3_endpoint, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "S3 endpoint (e.g. 
s3.amazonaws.com or minio.local:9000)", NULL, NULL, + NULL); + +static char *srv_s3_bucket = NULL; +static MYSQL_SYSVAR_STR(s3_bucket, srv_s3_bucket, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "S3 bucket name", NULL, NULL, NULL); + +static char *srv_s3_prefix = NULL; +static MYSQL_SYSVAR_STR(s3_prefix, srv_s3_prefix, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "S3 key prefix (e.g. production/db1/)", NULL, NULL, NULL); + +static char *srv_s3_access_key = NULL; +static MYSQL_SYSVAR_STR(s3_access_key, srv_s3_access_key, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "S3 access key ID", NULL, NULL, NULL); + +static char *srv_s3_secret_key = NULL; +static MYSQL_SYSVAR_STR(s3_secret_key, srv_s3_secret_key, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "S3 secret access key", NULL, NULL, NULL); + +static char *srv_s3_region = NULL; +static MYSQL_SYSVAR_STR(s3_region, srv_s3_region, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "S3 region (e.g. us-east-1, NULL for MinIO)", NULL, NULL, NULL); + +static my_bool srv_s3_use_ssl = 1; +static MYSQL_SYSVAR_BOOL(s3_use_ssl, srv_s3_use_ssl, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Use HTTPS for S3 connections (default ON)", NULL, NULL, 1); + +static my_bool srv_s3_path_style = 0; +static MYSQL_SYSVAR_BOOL(s3_path_style, srv_s3_path_style, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Use path-style S3 URLs (required for MinIO, default OFF)", NULL, NULL, 0); + +static char *srv_s3_tls_ca_path = NULL; +static MYSQL_SYSVAR_STR( + s3_tls_ca_path, srv_s3_tls_ca_path, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to a custom CA bundle for the S3 TLS handshake, or empty to use the system bundle", NULL, + NULL, NULL); + +static my_bool srv_s3_tls_insecure_skip_verify = 0; +static MYSQL_SYSVAR_BOOL( + s3_tls_insecure_skip_verify, srv_s3_tls_insecure_skip_verify, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Disable S3 TLS peer/host verification. 
INSECURE, intended for test endpoints only.", NULL, + NULL, 0); + +static ulonglong srv_s3_multipart_threshold = 0; +static MYSQL_SYSVAR_ULONGLONG( + s3_multipart_threshold, srv_s3_multipart_threshold, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Object size in bytes at which S3 multipart upload activates. 0 keeps the library default.", + NULL, NULL, 0, 0, ULONGLONG_MAX, 0); + +static ulonglong srv_s3_multipart_part_size = 0; +static MYSQL_SYSVAR_ULONGLONG(s3_multipart_part_size, srv_s3_multipart_part_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "S3 multipart chunk size in bytes. 0 keeps the library default.", + NULL, NULL, 0, 0, ULONGLONG_MAX, 0); + +static ulonglong srv_objstore_local_cache_max = 0; +static MYSQL_SYSVAR_ULONGLONG( + objstore_local_cache_max, srv_objstore_local_cache_max, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Maximum local cache size in bytes for object store mode (0=unlimited)", NULL, NULL, 0, 0, + ULONGLONG_MAX, 0); + +static ulonglong srv_objstore_wal_sync_threshold = 1048576; +static MYSQL_SYSVAR_ULONGLONG( + objstore_wal_sync_threshold, srv_objstore_wal_sync_threshold, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Sync active WAL to object store when it grows by this many bytes (default 1MB, 0=disable)", + NULL, NULL, 1048576, 0, ULONGLONG_MAX, 0); + +static my_bool srv_objstore_wal_sync_on_commit = 0; +static MYSQL_SYSVAR_BOOL(objstore_wal_sync_on_commit, srv_objstore_wal_sync_on_commit, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Upload WAL after every commit for RPO=0 replication (default OFF)", NULL, + NULL, 0); + +static my_bool srv_replica_mode = 0; +static MYSQL_SYSVAR_BOOL(replica_mode, srv_replica_mode, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Enable read-only replica mode (default OFF)", NULL, NULL, 0); + +/* When ON, deinit calls tidesdb_cancel_background_work before tidesdb_close + so in-flight compactions bail at their next checkpoint (uncommitted output + discarded, inputs intact) and shutdown 
returns quickly even with a multi-GB + compaction backlog. Default OFF restores pre-4.5.4 behaviour where + tidesdb_close drains background work naturally; this is the safer setting + for object-store / replica setups where a mid-compaction cancel can leave + S3 in an inconsistent state that confuses a syncing replica. */ +static my_bool srv_fast_shutdown = 0; +static MYSQL_SYSVAR_BOOL(fast_shutdown, srv_fast_shutdown, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Call tidesdb_cancel_background_work at deinit so shutdown does not " + "wait for in-flight compactions to drain. Default OFF; turn ON only " + "when shutdown latency on a large compaction backlog matters more " + "than clean handoff to replicas reading the object store", + NULL, NULL, 0); + +static my_bool srv_objstore_cache_on_read = 1; +static MYSQL_SYSVAR_BOOL(objstore_cache_on_read, srv_objstore_cache_on_read, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Cache downloaded objects in the local cache (default ON)", NULL, NULL, 1); + +static my_bool srv_objstore_cache_on_write = 1; +static MYSQL_SYSVAR_BOOL(objstore_cache_on_write, srv_objstore_cache_on_write, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Cache uploaded objects in the local cache (default ON)", NULL, NULL, 1); + +static ulong srv_objstore_max_concurrent_uploads = 0; +static MYSQL_SYSVAR_ULONG(objstore_max_concurrent_uploads, srv_objstore_max_concurrent_uploads, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Concurrent upload threads; 0 uses the library default", NULL, NULL, 0, 0, + 1024, 0); + +static ulong srv_objstore_max_concurrent_downloads = 0; +static MYSQL_SYSVAR_ULONG(objstore_max_concurrent_downloads, srv_objstore_max_concurrent_downloads, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Concurrent download threads; 0 uses the library default", NULL, NULL, 0, + 0, 1024, 0); + +static ulonglong srv_objstore_multipart_threshold = 0; +static MYSQL_SYSVAR_ULONGLONG( + objstore_multipart_threshold, 
srv_objstore_multipart_threshold, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Object size in bytes that triggers multipart upload; 0 keeps the library default", NULL, NULL, + 0, 0, ULONGLONG_MAX, 0); + +static ulonglong srv_objstore_multipart_part_size = 0; +static MYSQL_SYSVAR_ULONGLONG(objstore_multipart_part_size, srv_objstore_multipart_part_size, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Multipart upload chunk size in bytes; 0 keeps the library default", + NULL, NULL, 0, 0, ULONGLONG_MAX, 0); + +static my_bool srv_objstore_sync_manifest_to_object = 1; +static MYSQL_SYSVAR_BOOL(objstore_sync_manifest_to_object, srv_objstore_sync_manifest_to_object, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Upload MANIFEST after each compaction (default ON)", NULL, NULL, 1); + +static my_bool srv_objstore_wal_upload_sync = 0; +static MYSQL_SYSVAR_BOOL( + objstore_wal_upload_sync, srv_objstore_wal_upload_sync, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Block memtable flush on WAL upload (default OFF for background WAL upload)", NULL, NULL, 0); + +static my_bool srv_objstore_replicate_wal = 1; +static MYSQL_SYSVAR_BOOL(objstore_replicate_wal, srv_objstore_replicate_wal, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Upload WAL segments for replica recovery (default ON)", NULL, NULL, 1); + +static my_bool srv_objstore_replica_replay_wal = 1; +static MYSQL_SYSVAR_BOOL(objstore_replica_replay_wal, srv_objstore_replica_replay_wal, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Replay WAL on replicas for near-real-time visibility (default ON)", NULL, + NULL, 1); + +static ulonglong srv_replica_sync_interval = 5000000; +static MYSQL_SYSVAR_ULONGLONG( + replica_sync_interval, srv_replica_sync_interval, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "MANIFEST poll interval for replica sync in microseconds (default 5s)", NULL, NULL, 5000000, + 100000, ULONGLONG_MAX, 0); + +/* Promote replica to primary -- trigger variable (like backup_dir) */ +static my_bool 
srv_promote_primary = 0; +static void tidesdb_promote_primary_update(THD *thd, struct st_mysql_sys_var *, void *var_ptr, + const void *save) +{ + my_bool val = *static_cast(save); + if (!val) return; /* only act on SET ... = ON */ + + if (!tdb_global) + { + my_error(ER_UNKNOWN_ERROR, MYF(0)); + return; + } + + int rc = tidesdb_promote_to_primary(tdb_global); + if (rc == TDB_SUCCESS) + { + sql_print_information("[TIDESDB] Replica promoted to primary successfully"); + } + else + { + sql_print_error("[TIDESDB] Failed to promote replica (err=%d)", rc); + } + + /* reset to OFF so it can be triggered again */ + *static_cast(var_ptr) = 0; +} + +static MYSQL_SYSVAR_BOOL(promote_primary, srv_promote_primary, PLUGIN_VAR_RQCMDARG, + "Set to ON to promote this replica to primary (trigger, resets to OFF)", + NULL, tidesdb_promote_primary_update, 0); + +/* ******************** Online backup via system variable ******************** */ + +static char *srv_backup_dir = NULL; + +static void tidesdb_backup_dir_update(THD *thd, struct st_mysql_sys_var *, void *var_ptr, + const void *save) +{ + const char *new_dir = *static_cast(save); + + if (!new_dir || !new_dir[0]) + { + /* Empty string -- we just clear the variable */ + *static_cast(var_ptr) = NULL; + return; + } + + if (!tdb_global) + { + my_error(ER_UNKNOWN_ERROR, MYF(0), "TidesDB is not open"); + return; + } + + /* Free the calling connection's TidesDB transaction before backup. + tidesdb_backup() waits for all open transactions to drain. The + connection may still hold an open txn (created in external_lock + but not yet committed). If we don't free it here, the backup + self-deadlocks waiting for our own txn. 
*/ + { + tidesdb_trx_t *trx = (tidesdb_trx_t *)thd_get_ha_data(thd, tidesdb_hton); + if (trx && trx->txn) + { + tidesdb_txn_rollback(trx->txn); + tidesdb_txn_free(trx->txn); + trx->txn = NULL; + trx->dirty = false; + trx->txn_generation++; + trx->fts_meta_pending.clear(); + trx->fts_meta_dirty = false; + } + } + + /* We copy the path before releasing the sysvar lock -- the save pointer + is only valid while LOCK_global_system_variables is held. */ + std::string backup_path(new_dir); + + /* tidesdb_backup() spins waiting for all CF flushes to complete. + The library's flush threads call sql_print_information() which + internally acquires LOCK_global_system_variables. This sysvar + update callback is called WITH that mutex held, so tidesdb_backup() + deadlocks (flush thread waits for lock, we wait for flush thread). + Release the mutex around the blocking backup call. */ + mysql_mutex_unlock(&LOCK_global_system_variables); + + /* Backup started -- no log (user-triggered, success/failure reported via return code) */ + + char *backup_path_c = const_cast(backup_path.c_str()); + int rc = tidesdb_backup(tdb_global, backup_path_c); + + mysql_mutex_lock(&LOCK_global_system_variables); + + if (rc != TDB_SUCCESS) + { + sql_print_error("[TIDESDB] Backup to '%s' failed (err=%d)", backup_path.c_str(), rc); + my_printf_error(ER_UNKNOWN_ERROR, "[TIDESDB] Backup to '%s' failed (err=%d)", MYF(0), + backup_path.c_str(), rc); + return; + } + + /* For PLUGIN_VAR_MEMALLOC strings, the framework manages memory. + We set var_ptr to the save value so the framework copies it. */ + *static_cast(var_ptr) = new_dir; +} + +static MYSQL_SYSVAR_STR(backup_dir, srv_backup_dir, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, + "Set to a directory path to trigger an online TidesDB backup. " + "The directory must not exist or be empty. 
" + "Example: SET GLOBAL tidesdb_backup_dir = '/path/to/backup'", + NULL, tidesdb_backup_dir_update, NULL); + +/* Checkpoint (hard-link snapshot) via system variable */ + +static char *srv_checkpoint_dir = NULL; + +static void tidesdb_checkpoint_dir_update(THD *thd, struct st_mysql_sys_var *, void *var_ptr, + const void *save) +{ + const char *new_dir = *static_cast(save); + + if (!new_dir || !new_dir[0]) + { + *static_cast(var_ptr) = NULL; + return; + } + + if (!tdb_global) + { + my_error(ER_UNKNOWN_ERROR, MYF(0), "TidesDB is not open"); + return; + } + + /* Checkpoint started -- no log */ + + int rc = tidesdb_checkpoint(tdb_global, new_dir); + + if (rc != TDB_SUCCESS) + { + sql_print_error("[TIDESDB] Checkpoint to '%s' failed (err=%d)", new_dir, rc); + my_printf_error(ER_UNKNOWN_ERROR, "[TIDESDB] Checkpoint to '%s' failed (err=%d)", MYF(0), + new_dir, rc); + return; + } + + /* Checkpoint completed -- no log */ + *static_cast(var_ptr) = new_dir; +} + +static MYSQL_SYSVAR_STR(checkpoint_dir, srv_checkpoint_dir, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, + "Set to a directory path to trigger a TidesDB checkpoint " + "(hard-link snapshot, near-instant). " + "The directory must not exist or be empty. 
" + "Example: SET GLOBAL tidesdb_checkpoint_dir = '/path/to/checkpoint'", + NULL, tidesdb_checkpoint_dir_update, NULL); + +static struct st_mysql_sys_var *tidesdb_system_variables[] = { + MYSQL_SYSVAR(flush_threads), + MYSQL_SYSVAR(max_concurrent_flushes), + MYSQL_SYSVAR(compaction_threads), + MYSQL_SYSVAR(log_level), + MYSQL_SYSVAR(block_cache_size), + MYSQL_SYSVAR(max_open_sstables), + MYSQL_SYSVAR(max_memory_usage), + MYSQL_SYSVAR(backup_dir), + MYSQL_SYSVAR(checkpoint_dir), + MYSQL_SYSVAR(print_all_conflicts), + MYSQL_SYSVAR(pessimistic_locking), + MYSQL_SYSVAR(fts_min_word_len), + MYSQL_SYSVAR(fts_max_word_len), + MYSQL_SYSVAR(fts_bm25_k1), + MYSQL_SYSVAR(fts_bm25_b), + MYSQL_SYSVAR(ft_stopword_table), + MYSQL_SYSVAR(fts_blend_chars), + MYSQL_SYSVAR(data_home_dir), + MYSQL_SYSVAR(ttl), + MYSQL_SYSVAR(skip_unique_check), + MYSQL_SYSVAR(single_delete_primary), + MYSQL_SYSVAR(backpressure_wait_timeout_ms), + MYSQL_SYSVAR(lock_wait_timeout_ms), + MYSQL_SYSVAR(compact_after_range_delete_min_rows), + MYSQL_SYSVAR(default_compression), + MYSQL_SYSVAR(default_write_buffer_size), + MYSQL_SYSVAR(default_bloom_filter), + MYSQL_SYSVAR(default_use_btree), + MYSQL_SYSVAR(default_block_indexes), + MYSQL_SYSVAR(default_sync_mode), + MYSQL_SYSVAR(default_sync_interval_us), + MYSQL_SYSVAR(default_bloom_fpr), + MYSQL_SYSVAR(default_klog_value_threshold), + MYSQL_SYSVAR(default_l0_queue_stall_threshold), + MYSQL_SYSVAR(default_l1_file_count_trigger), + MYSQL_SYSVAR(default_level_size_ratio), + MYSQL_SYSVAR(default_min_levels), + MYSQL_SYSVAR(default_dividing_level_offset), + MYSQL_SYSVAR(default_skip_list_max_level), + MYSQL_SYSVAR(default_skip_list_probability), + MYSQL_SYSVAR(default_index_sample_ratio), + MYSQL_SYSVAR(default_block_index_prefix_len), + MYSQL_SYSVAR(default_min_disk_space), + MYSQL_SYSVAR(default_isolation_level), + MYSQL_SYSVAR(log_to_file), + MYSQL_SYSVAR(log_truncation_at), + MYSQL_SYSVAR(unified_memtable), + MYSQL_SYSVAR(unified_memtable_write_buffer_size), 
+ MYSQL_SYSVAR(unified_memtable_sync_mode), + MYSQL_SYSVAR(unified_memtable_sync_interval), + MYSQL_SYSVAR(unified_memtable_skip_list_max_level), + MYSQL_SYSVAR(unified_memtable_skip_list_probability), + MYSQL_SYSVAR(object_store_backend), + MYSQL_SYSVAR(s3_endpoint), + MYSQL_SYSVAR(s3_bucket), + MYSQL_SYSVAR(s3_prefix), + MYSQL_SYSVAR(s3_access_key), + MYSQL_SYSVAR(s3_secret_key), + MYSQL_SYSVAR(s3_region), + MYSQL_SYSVAR(s3_use_ssl), + MYSQL_SYSVAR(s3_path_style), + MYSQL_SYSVAR(s3_tls_ca_path), + MYSQL_SYSVAR(s3_tls_insecure_skip_verify), + MYSQL_SYSVAR(s3_multipart_threshold), + MYSQL_SYSVAR(s3_multipart_part_size), + MYSQL_SYSVAR(objstore_local_cache_max), + MYSQL_SYSVAR(objstore_wal_sync_threshold), + MYSQL_SYSVAR(objstore_wal_sync_on_commit), + MYSQL_SYSVAR(objstore_cache_on_read), + MYSQL_SYSVAR(objstore_cache_on_write), + MYSQL_SYSVAR(objstore_max_concurrent_uploads), + MYSQL_SYSVAR(objstore_max_concurrent_downloads), + MYSQL_SYSVAR(objstore_multipart_threshold), + MYSQL_SYSVAR(objstore_multipart_part_size), + MYSQL_SYSVAR(objstore_sync_manifest_to_object), + MYSQL_SYSVAR(objstore_wal_upload_sync), + MYSQL_SYSVAR(objstore_replicate_wal), + MYSQL_SYSVAR(objstore_replica_replay_wal), + MYSQL_SYSVAR(replica_mode), + MYSQL_SYSVAR(fast_shutdown), + MYSQL_SYSVAR(replica_sync_interval), + MYSQL_SYSVAR(promote_primary), + MYSQL_SYSVAR(default_object_lazy_compaction), + MYSQL_SYSVAR(default_object_prefetch_compaction), + MYSQL_SYSVAR(default_tombstone_density_trigger), + MYSQL_SYSVAR(default_tombstone_density_min_entries), + NULL}; + +/* ******************** Table options (per-table CF config) ******************** */ + +struct ha_table_option_struct +{ + ulonglong write_buffer_size; + ulonglong min_disk_space; + ulonglong klog_value_threshold; + ulonglong sync_interval_us; + ulonglong index_sample_ratio; + ulonglong block_index_prefix_len; + ulonglong level_size_ratio; + ulonglong min_levels; + ulonglong dividing_level_offset; + ulonglong skip_list_max_level; + 
ulonglong skip_list_probability; /* percentage -- 25 = 0.25 */ + ulonglong bloom_fpr; /* parts per 10000 -- 100 = 1% */ + ulonglong l1_file_count_trigger; + ulonglong l0_queue_stall_threshold; + uint compression; + uint sync_mode; + uint isolation_level; + bool bloom_filter; + bool block_indexes; + bool use_btree; + bool object_lazy_compaction; /* double L1 file count trigger in object store mode */ + bool object_prefetch_compaction; /* prefetch input SSTables before compaction merge */ + ulonglong ttl; /* default TTL in seconds (0 = no expiration) */ + bool encrypted; /* ENCRYPTED=YES enables data-at-rest encryption */ + ulonglong encryption_key_id; /* ENCRYPTION_KEY_ID (default 1) */ + /* Tombstone-density compaction trigger. Stored as parts-per-10000 + (e.g. 5000 = 0.50 ratio) so the option list can use integer storage; + converted to a double at build_cf_config time. */ + ulonglong tombstone_density_trigger; + ulonglong tombstone_density_min_entries; +}; + +ha_create_table_option tidesdb_table_option_list[] = { + /* Options with SYSVAR defaults inherit from session variables + (e.g. SET SESSION tidesdb_default_write_buffer_size=64*1024*1024). + When not explicitly set in CREATE TABLE, the session default is used. 
*/ + HA_TOPTION_SYSVAR("WRITE_BUFFER_SIZE", write_buffer_size, default_write_buffer_size), + HA_TOPTION_SYSVAR("MIN_DISK_SPACE", min_disk_space, default_min_disk_space), + HA_TOPTION_SYSVAR("KLOG_VALUE_THRESHOLD", klog_value_threshold, default_klog_value_threshold), + HA_TOPTION_SYSVAR("SYNC_INTERVAL_US", sync_interval_us, default_sync_interval_us), + HA_TOPTION_SYSVAR("INDEX_SAMPLE_RATIO", index_sample_ratio, default_index_sample_ratio), + HA_TOPTION_SYSVAR("BLOCK_INDEX_PREFIX_LEN", block_index_prefix_len, + default_block_index_prefix_len), + HA_TOPTION_SYSVAR("LEVEL_SIZE_RATIO", level_size_ratio, default_level_size_ratio), + HA_TOPTION_SYSVAR("MIN_LEVELS", min_levels, default_min_levels), + HA_TOPTION_SYSVAR("DIVIDING_LEVEL_OFFSET", dividing_level_offset, + default_dividing_level_offset), + HA_TOPTION_SYSVAR("SKIP_LIST_MAX_LEVEL", skip_list_max_level, default_skip_list_max_level), + HA_TOPTION_SYSVAR("SKIP_LIST_PROBABILITY", skip_list_probability, + default_skip_list_probability), + HA_TOPTION_SYSVAR("BLOOM_FPR", bloom_fpr, default_bloom_fpr), + HA_TOPTION_SYSVAR("L1_FILE_COUNT_TRIGGER", l1_file_count_trigger, + default_l1_file_count_trigger), + HA_TOPTION_SYSVAR("L0_QUEUE_STALL_THRESHOLD", l0_queue_stall_threshold, + default_l0_queue_stall_threshold), + HA_TOPTION_SYSVAR("COMPRESSION", compression, default_compression), + HA_TOPTION_SYSVAR("SYNC_MODE", sync_mode, default_sync_mode), + HA_TOPTION_SYSVAR("ISOLATION_LEVEL", isolation_level, default_isolation_level), + HA_TOPTION_SYSVAR("BLOOM_FILTER", bloom_filter, default_bloom_filter), + HA_TOPTION_SYSVAR("BLOCK_INDEXES", block_indexes, default_block_indexes), + HA_TOPTION_SYSVAR("USE_BTREE", use_btree, default_use_btree), + HA_TOPTION_SYSVAR("OBJECT_LAZY_COMPACTION", object_lazy_compaction, + default_object_lazy_compaction), + HA_TOPTION_SYSVAR("OBJECT_PREFETCH_COMPACTION", object_prefetch_compaction, + default_object_prefetch_compaction), + HA_TOPTION_SYSVAR("TOMBSTONE_DENSITY_TRIGGER", 
tombstone_density_trigger, + default_tombstone_density_trigger), + HA_TOPTION_SYSVAR("TOMBSTONE_DENSITY_MIN_ENTRIES", tombstone_density_min_entries, + default_tombstone_density_min_entries), + HA_TOPTION_NUMBER("TTL", ttl, 0, 0, ULONGLONG_MAX, 1), + HA_TOPTION_BOOL("ENCRYPTED", encrypted, 0), + HA_TOPTION_NUMBER("ENCRYPTION_KEY_ID", encryption_key_id, 1, 1, 255, 1), + HA_TOPTION_END}; + +/* ******************** Field options (per-column) ******************** */ + +struct ha_field_option_struct +{ + bool ttl; /* marks this column as the per-row TTL source (seconds) */ +}; + +ha_create_table_option tidesdb_field_option_list[] = {HA_FOPTION_BOOL("TTL", ttl, 0), + HA_FOPTION_END}; + +/* ******************** Index options (per-index) ******************** */ + +struct ha_index_option_struct +{ + bool use_btree; /* per-index B-tree override */ +}; + +ha_create_table_option tidesdb_index_option_list[] = {HA_IOPTION_BOOL("USE_BTREE", use_btree, 0), + HA_IOPTION_END}; + +/* ******************** Big-endian helpers for hidden PK ******************** + Hidden-PK rows are keyed by an 8-byte big-endian uint64 so that memcmp + on the encoded bytes matches numeric ordering of the row id. */ + +static void encode_be64(uint64_t id, uint8_t *buf) +{ + for (uint i = 0; i < sizeof(uint64_t); i++) + buf[i] = (uint8_t)(id >> ((sizeof(uint64_t) - 1 - i) * BITS_PER_BYTE)); +} + +static uint64_t decode_be64(const uint8_t *buf) +{ + uint64_t id = 0; + for (uint i = 0; i < sizeof(uint64_t); i++) id = (id << BITS_PER_BYTE) | (uint64_t)buf[i]; + return id; +} + +/* + Return true if a TidesDB key is a data key (starts with KEY_NS_DATA). 
+*/ +static inline bool is_data_key(const uint8_t *key, size_t key_size) +{ + return key_size > 0 && key[0] == KEY_NS_DATA; +} + +/* Shared enum-to-constant maps (used by create, open, prepare_inplace) */ + +static const int tdb_compression_map[] = {TDB_COMPRESS_NONE, TDB_COMPRESS_SNAPPY, TDB_COMPRESS_LZ4, + TDB_COMPRESS_ZSTD, TDB_COMPRESS_LZ4_FAST}; + +static const int tdb_sync_mode_map[] = {TDB_SYNC_NONE, TDB_SYNC_INTERVAL, TDB_SYNC_FULL}; + +static const int tdb_isolation_map[] = {TDB_ISOLATION_READ_UNCOMMITTED, + TDB_ISOLATION_READ_COMMITTED, TDB_ISOLATION_REPEATABLE_READ, + TDB_ISOLATION_SNAPSHOT, TDB_ISOLATION_SERIALIZABLE}; + +/* + Map the MariaDB session isolation level (from SET TRANSACTION ISOLATION + LEVEL) to a TidesDB isolation level. An explicitly chosen session level + always wins. When the session is left at the SQL default of REPEATABLE + READ the table-level ISOLATION_LEVEL option decides, because that is the + signal that the client expressed no preference of its own. + + The MariaDB enum_tx_isolation values are ISO_READ_UNCOMMITTED 0, + ISO_READ_COMMITTED 1, ISO_REPEATABLE_READ 2 and ISO_SERIALIZABLE 3. + + TidesDB has a fifth level, SNAPSHOT, with no SQL equivalent. A table + that leaves ISOLATION_LEVEL at REPEATABLE READ resolves to SNAPSHOT for + InnoDB parity, since TidesDB's strict REPEATABLE_READ tracks the read + set and produces excessive TDB_ERR_CONFLICT under normal OLTP. A table + that sets SNAPSHOT, SERIALIZABLE, READ COMMITTED or READ UNCOMMITTED is + honored as written. +*/ +static tidesdb_isolation_level_t resolve_effective_isolation(THD *thd, + tidesdb_isolation_level_t table_iso) +{ + int session_iso = thd_tx_isolation(thd); + + switch (session_iso) + { + case ISO_READ_UNCOMMITTED: + return TDB_ISOLATION_READ_UNCOMMITTED; + case ISO_READ_COMMITTED: + return TDB_ISOLATION_READ_COMMITTED; + case ISO_REPEATABLE_READ: + /* The session is at the SQL default, so the table-level + ISOLATION_LEVEL option decides. 
A table left at REPEATABLE + READ maps to TidesDB SNAPSHOT for InnoDB parity, since + TidesDB's strict REPEATABLE_READ tracks the read set and + produces excessive TDB_ERR_CONFLICT under normal OLTP. An + explicit SNAPSHOT, SERIALIZABLE, READ COMMITTED or READ + UNCOMMITTED table option is honored as written. */ + return table_iso == TDB_ISOLATION_REPEATABLE_READ ? TDB_ISOLATION_SNAPSHOT : table_iso; + case ISO_SERIALIZABLE: + return TDB_ISOLATION_SERIALIZABLE; + default: + return TDB_ISOLATION_READ_COMMITTED; + } +} + +/* Single-byte placeholder value for secondary index entries (all info is in the key) */ +static const uint8_t tdb_empty_val = 0; + +/* + Build a tidesdb_column_family_config_t from table options. + Centralises the option-to-config mapping so create() and + prepare_inplace_alter_table() stay in sync. +*/ +static tidesdb_column_family_config_t build_cf_config(const ha_table_option_struct *opts) +{ + tidesdb_column_family_config_t cfg = tidesdb_default_column_family_config(); + if (!opts) return cfg; + + cfg.write_buffer_size = (size_t)opts->write_buffer_size; + cfg.compression_algorithm = (compression_algorithm)tdb_compression_map[opts->compression]; + cfg.enable_bloom_filter = opts->bloom_filter ? 1 : 0; + cfg.bloom_fpr = (double)opts->bloom_fpr / TIDESDB_BLOOM_FPR_DIVISOR; + cfg.enable_block_indexes = opts->block_indexes ? 
1 : 0; + cfg.index_sample_ratio = (int)opts->index_sample_ratio; + cfg.block_index_prefix_len = (int)opts->block_index_prefix_len; + cfg.sync_mode = tdb_sync_mode_map[opts->sync_mode]; + cfg.sync_interval_us = (uint64_t)opts->sync_interval_us; + cfg.klog_value_threshold = (size_t)opts->klog_value_threshold; + cfg.min_disk_space = (size_t)opts->min_disk_space; + cfg.default_isolation_level = + (tidesdb_isolation_level_t)tdb_isolation_map[opts->isolation_level]; + cfg.level_size_ratio = (int)opts->level_size_ratio; + cfg.min_levels = (int)opts->min_levels; + cfg.dividing_level_offset = (int)opts->dividing_level_offset; + cfg.skip_list_max_level = (int)opts->skip_list_max_level; + cfg.skip_list_probability = (float)opts->skip_list_probability / TIDESDB_SKIP_LIST_PROB_DIV; + cfg.l1_file_count_trigger = (int)opts->l1_file_count_trigger; + cfg.l0_queue_stall_threshold = (int)opts->l0_queue_stall_threshold; + cfg.use_btree = opts->use_btree ? 1 : 0; + cfg.object_lazy_compaction = opts->object_lazy_compaction ? 1 : 0; + cfg.object_prefetch_compaction = opts->object_prefetch_compaction ? 1 : 0; + cfg.tombstone_density_trigger = + (double)opts->tombstone_density_trigger / TIDESDB_TOMBSTONE_DENSITY_DIVISOR; + cfg.tombstone_density_min_entries = (uint64_t)opts->tombstone_density_min_entries; + return cfg; +} + +/* + Resolve a secondary index CF by name. + Returns the CF pointer (may be NULL if not found). + Writes the CF name into out_name. 
+*/ +static tidesdb_column_family_t *resolve_idx_cf(tidesdb_t *db, const std::string &table_cf, + const char *key_name, std::string &out_name) +{ + out_name = table_cf + CF_INDEX_INFIX + key_name; + return tidesdb_get_column_family(db, out_name.c_str()); +} + +/* ******************** TidesDB_share ******************** */ + +TidesDB_share::TidesDB_share() + : cf(NULL), + has_user_pk(false), + pk_index(0), + pk_key_len(0), + next_row_id(1), + isolation_level(TDB_ISOLATION_REPEATABLE_READ), + default_ttl(0), + ttl_field_idx(TIDESDB_TTL_FIELD_NONE), + encrypted(false), + encryption_key_id(TIDESDB_DEFAULT_ENCRYPTION_KEY_ID), + encryption_key_version(0), + has_blobs(false), + has_ttl(false), + num_secondary_indexes(0) +{ + memset(idx_comp_key_len, 0, sizeof(idx_comp_key_len)); + memset(idx_is_fts, 0, sizeof(idx_is_fts)); + memset(idx_is_spatial, 0, sizeof(idx_is_spatial)); + for (uint i = 0; i < MAX_KEY; i++) cached_rec_per_key[i].store(0, std::memory_order_relaxed); +} + +TidesDB_share::~TidesDB_share() +{ +} + +/* ******************** Per-connection transaction helpers ******************** */ + +/* + Get or create the per-connection TidesDB transaction context. + The txn lives for the entire BEGIN...COMMIT block (or single auto-commit + statement). All handler objects on the same connection share it. +*/ +static tidesdb_trx_t *get_or_create_trx(THD *thd, handlerton *hton, tidesdb_isolation_level_t iso) +{ + tidesdb_trx_t *trx = (tidesdb_trx_t *)thd_get_ha_data(thd, hton); + if (trx) + { + if (!trx->txn) + { + int rc = tidesdb_txn_begin_with_isolation(tdb_global, iso, &trx->txn); + if (rc != TDB_SUCCESS) + { + (void)tdb_rc_to_ha(rc, "get_or_create_trx txn_begin(reuse)"); + return NULL; + } + trx->dirty = false; + trx->isolation_level = iso; + trx->txn_generation++; + } + else if (trx->needs_reset) + { + /* Txn object kept alive from previous commit/rollback (see + tidesdb_commit). We reset it to get a fresh MVCC snapshot at + current-transaction-start. 
This avoids the expensive + free+begin cycle while ensuring we see the latest data. + The bulk-insert path already uses commit+reset successfully. + Only reset when needs_reset is true (set after real commit/ + rollback) to preserve snapshot within multi-statement txns. */ + int rrc = tidesdb_txn_reset(trx->txn, iso); + if (rrc != TDB_SUCCESS) + { + /* Reset failed -- we fall back to free + begin. Surface the + failure so we can spot regressions in txn recycling instead + of silently degrading to per-statement free+begin. */ + sql_print_warning( + "[TIDESDB] tidesdb_txn_reset failed (rc=%d), falling back to " + "free+begin -- expect higher per-statement overhead until " + "this is investigated", + rrc); + tidesdb_txn_free(trx->txn); + trx->txn = NULL; + int rc = tidesdb_txn_begin_with_isolation(tdb_global, iso, &trx->txn); + if (rc != TDB_SUCCESS) + { + (void)tdb_rc_to_ha(rc, "get_or_create_trx txn_begin(reset_fallback)"); + return NULL; + } + } + trx->needs_reset = false; + trx->isolation_level = iso; + trx->txn_generation++; + } + return trx; + } + + /* The trx struct owns a std::vector (fts_meta_pending), so it must be + constructed and destroyed properly. Switching from MY_ZEROFILL/my_free + to new/delete runs the std::vector's ctor/dtor and gives every field + its default value via the header's member initialisers. */ + trx = new tidesdb_trx_t{}; + if (!trx) return NULL; + + int rc = tidesdb_txn_begin_with_isolation(tdb_global, iso, &trx->txn); + if (rc != TDB_SUCCESS) + { + delete trx; + (void)tdb_rc_to_ha(rc, "get_or_create_trx txn_begin(new)"); + return NULL; + } + trx->isolation_level = iso; + trx->txn_generation = 1; + thd_set_ha_data(thd, hton, trx); + return trx; +} + +/* ******************** Handlerton transaction callbacks ******************** */ + +/* Maximum length of a TidesDB savepoint name, including the trailing NUL. 
+ Names are synthesized via TIDESDB_SAVEPOINT_NAME_FMT below; 32 bytes + fits the decoded pointer plus prefix on all supported platforms. */ +static constexpr uint TIDESDB_SAVEPOINT_NAME_MAX = 32; +/* Format used to synthesize a unique savepoint name for the TidesDB + transaction layer. The pointer to the SQL-layer savepoint slot is + the only handle we have that survives across the set/rollback/release + callbacks, so we encode it as the engine-level savepoint name. */ +static constexpr const char TIDESDB_SAVEPOINT_NAME_FMT[] = "sv_%p"; + +struct tidesdb_savepoint_t +{ + char name[TIDESDB_SAVEPOINT_NAME_MAX]; +}; + +#if MYSQL_VERSION_ID >= 110800 +static int tidesdb_savepoint_set(THD *thd, void *sv) +#else +static int tidesdb_savepoint_set(handlerton *, THD *thd, void *sv) +#endif +{ + tidesdb_trx_t *trx = (tidesdb_trx_t *)thd_get_ha_data(thd, tidesdb_hton); + if (!trx || !trx->txn || !sv) return 0; + + tidesdb_savepoint_t *sp = (tidesdb_savepoint_t *)sv; + snprintf(sp->name, sizeof(sp->name), TIDESDB_SAVEPOINT_NAME_FMT, sv); + + int rc = tidesdb_txn_savepoint(trx->txn, sp->name); + if (rc == TDB_SUCCESS) return 0; + return tdb_rc_to_ha(rc, "savepoint_set"); +} + +#if MYSQL_VERSION_ID >= 110800 +static int tidesdb_savepoint_rollback(THD *thd, void *sv) +#else +static int tidesdb_savepoint_rollback(handlerton *, THD *thd, void *sv) +#endif +{ + tidesdb_trx_t *trx = (tidesdb_trx_t *)thd_get_ha_data(thd, tidesdb_hton); + if (!trx || !trx->txn || !sv) return 0; + + tidesdb_savepoint_t *sp = (tidesdb_savepoint_t *)sv; + if (!sp->name[0]) snprintf(sp->name, sizeof(sp->name), TIDESDB_SAVEPOINT_NAME_FMT, sv); + + int rc = tidesdb_txn_rollback_to_savepoint(trx->txn, sp->name); + if (rc == TDB_SUCCESS) + { + /* The TidesDB library may drop the savepoint as part of the rollback. + SQL semantics require the savepoint to still exist after rollback, + so we re-create it here to allow RELEASE SAVEPOINT to succeed. 
*/ + (void)tidesdb_txn_savepoint(trx->txn, sp->name); + return 0; + } + if (rc == TDB_ERR_NOT_FOUND) return HA_ERR_NO_SAVEPOINT; + return tdb_rc_to_ha(rc, "savepoint_rollback"); +} + +#if MYSQL_VERSION_ID >= 110800 +static bool tidesdb_savepoint_rollback_can_release_mdl(THD *) +#else +static bool tidesdb_savepoint_rollback_can_release_mdl(handlerton *, THD *) +#endif +{ + return true; +} + +#if MYSQL_VERSION_ID >= 110800 +static int tidesdb_savepoint_release(THD *thd, void *sv) +#else +static int tidesdb_savepoint_release(handlerton *, THD *thd, void *sv) +#endif +{ + tidesdb_trx_t *trx = (tidesdb_trx_t *)thd_get_ha_data(thd, tidesdb_hton); + if (!trx || !trx->txn || !sv) return 0; + + tidesdb_savepoint_t *sp = (tidesdb_savepoint_t *)sv; + if (!sp->name[0]) snprintf(sp->name, sizeof(sp->name), TIDESDB_SAVEPOINT_NAME_FMT, sv); + + int rc = tidesdb_txn_release_savepoint(trx->txn, sp->name); + if (rc == TDB_SUCCESS) return 0; + if (rc == TDB_ERR_NOT_FOUND) return HA_ERR_NO_SAVEPOINT; + return tdb_rc_to_ha(rc, "savepoint_release"); +} + +#if MYSQL_VERSION_ID >= 110800 +static int tidesdb_commit(THD *thd, bool all) +#else +static int tidesdb_commit(handlerton *, THD *thd, bool all) +#endif +{ + tidesdb_trx_t *trx = (tidesdb_trx_t *)thd_get_ha_data(thd, tidesdb_hton); + if (!trx || !trx->txn) + { + return 0; + } + + /* We determine whether this is the final commit for the transaction. + all=true -> explicit COMMIT or transaction-level end + all=false -> statement-level; only a real commit when autocommit */ + bool is_real_commit = all || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN); + + if (!is_real_commit) + { + /* Statement-level commit inside a multi-statement transaction. + Defer the actual commit -- writes stay buffered in the txn, + avoiding expensive txn_begin + commit per statement. + + tidesdb_txn_savepoint() deep-copies the entire + write-set (malloc+memcpy for every key/value). 
For a txn + with N ops across S statements, total copy cost is + O(S * N * avg_kv_size) -- quadratic and devastating for + multi-statement OLTP transactions. + + We skip the per-statement savepoint entirely. This means + statement-level rollback inside BEGIN...COMMIT falls back to + full transaction rollback (same as many simple SE's). + The trade-off is a statement failure aborts the entire txn + instead of undoing just that statement. For OLTP this is + acceptable since the client will retry the whole transaction + anyway after a conflict/error. */ + return 0; + } + + /* We must release any active statement savepoint before final commit/rollback. + Savepoints must be explicitly released before txn_commit. */ + if (trx->stmt_savepoint_active) + { + tidesdb_txn_release_savepoint(trx->txn, "stmt"); + trx->stmt_savepoint_active = false; + } + + /* Real commit -- flush to storage. + After a successful commit, we keep the txn object alive and let + get_or_create_trx() call tidesdb_txn_reset() to get a fresh + snapshot. This avoids the expensive free+begin cycle on every + autocommit statement (saves malloc/free + internal buffer + reallocation). The bulk-insert path already uses commit+reset + successfully, so the pattern is proven safe. + If commit fails, fall back to rollback+free. */ + if (trx->dirty) + { + /* Fold the per-txn FTS meta deltas into this same txn before it + commits so the meta update is atomic with the row writes that + produced it. 
*/ + int frc = flush_trx_fts_meta_pending(thd, trx); + if (frc != TDB_SUCCESS) + { + sql_print_error( + "[TIDESDB] hton_commit: flush_trx_fts_meta_pending returned %d (gen=%lu)", frc, + (unsigned long)trx->txn_generation); + tidesdb_txn_rollback(trx->txn); + tidesdb_txn_free(trx->txn); + trx->txn = NULL; + trx->txn_generation++; + trx->dirty = false; + trx->stmt_savepoint_active = false; + row_locks_release_all(trx); + return tdb_rc_to_ha(frc, "hton_commit fts_meta_flush"); + } + + int rc = tdb_txn_commit_blocking(thd, trx->txn); + if (rc != TDB_SUCCESS) + { + /* Only log truly unexpected errors (not transient conflicts). */ + if (rc != TDB_ERR_CONFLICT && rc != TDB_ERR_LOCKED && rc != TDB_ERR_MEMORY_LIMIT && + rc != TDB_ERR_BUSY) + sql_print_error( + "[TIDESDB] hton_commit: tidesdb_txn_commit returned %d " + "(dirty=%d gen=%lu)", + rc, trx->dirty, (unsigned long)trx->txn_generation); + tidesdb_txn_rollback(trx->txn); + tidesdb_txn_free(trx->txn); + trx->txn = NULL; + trx->txn_generation++; + trx->dirty = false; + trx->stmt_savepoint_active = false; + row_locks_release_all(trx); + return tdb_rc_to_ha(rc, "hton_commit"); + } + /* We keep txn alive for reuse via txn_reset on next use. */ + trx->txn_generation++; + trx->needs_reset = true; + } + else + { + /* Read-only transaction -- we rollback, keep alive for reuse. 
*/ + trx->fts_meta_pending.clear(); + trx->fts_meta_dirty = false; + tidesdb_txn_rollback(trx->txn); + trx->txn_generation++; + trx->needs_reset = true; + } + trx->dirty = false; + trx->stmt_savepoint_active = false; + row_locks_release_all(trx); + return 0; +} + +#if MYSQL_VERSION_ID >= 110800 +static int tidesdb_rollback(THD *thd, bool all) +#else +static int tidesdb_rollback(handlerton *, THD *thd, bool all) +#endif +{ + tidesdb_trx_t *trx = (tidesdb_trx_t *)thd_get_ha_data(thd, tidesdb_hton); + if (!trx || !trx->txn) return 0; + + bool is_real_rollback = all || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN); + + if (!is_real_rollback) + { + /* Statement-level rollback inside a multi-statement transaction. + Without per-statement savepoints (see tidesdb_commit note), + we fall through to full transaction rollback. This is the + same behavior as many simple storage engines and is correct -- + OLTP clients retry the entire transaction after any error. */ + } + + if (trx->stmt_savepoint_active) + { + tidesdb_txn_release_savepoint(trx->txn, "stmt"); + trx->stmt_savepoint_active = false; + } + + /* The accumulated FTS meta deltas track the rows being rolled back, + so discard them along with the txn's other write state. */ + trx->fts_meta_pending.clear(); + trx->fts_meta_dirty = false; + + /* Full rollback -- we keep txn alive for reuse via reset on next use. 
*/ + tidesdb_txn_rollback(trx->txn); + trx->txn_generation++; + trx->needs_reset = true; + trx->dirty = false; + trx->stmt_savepoint_active = false; + row_locks_release_all(trx); + return 0; +} + +#if MYSQL_VERSION_ID >= 110800 +static int tidesdb_close_connection(THD *thd) +#else +static int tidesdb_close_connection(handlerton *, THD *thd) +#endif +{ + tidesdb_trx_t *trx = (tidesdb_trx_t *)thd_get_ha_data(thd, tidesdb_hton); + if (trx) + { + row_locks_release_all(trx); + if (trx->txn) + { + tidesdb_txn_rollback(trx->txn); + tidesdb_txn_free(trx->txn); + } + delete trx; + thd_set_ha_data(thd, tidesdb_hton, NULL); + } + return 0; +} + +/* + START TRANSACTION WITH CONSISTENT SNAPSHOT callback. + Eagerly creates a TidesDB transaction so the snapshot sequence number + is captured now, not lazily at first data access. Without this, rows + committed by other connections between START TRANSACTION and the first + SELECT would be visible. + + Uses the session's isolation level (SET TRANSACTION ISOLATION LEVEL) + rather than hard-coding REPEATABLE_READ. Falls back to RR if the + session is at the default. +*/ +#if MYSQL_VERSION_ID >= 110800 +static int tidesdb_start_consistent_snapshot(THD *thd) +#else +static int tidesdb_start_consistent_snapshot(handlerton *, THD *thd) +#endif +{ + /* START TRANSACTION WITH CONSISTENT SNAPSHOT explicitly requests a + point-in-time snapshot. Always use at least SNAPSHOT isolation + so the snapshot persists for the entire transaction, regardless of + the session's default isolation level (e.g. READ_COMMITTED would + refresh the snapshot on each read, violating CONSISTENT_SNAPSHOT + semantics). 
*/ + tidesdb_isolation_level_t iso = resolve_effective_isolation(thd, TDB_ISOLATION_REPEATABLE_READ); + if (iso < TDB_ISOLATION_SNAPSHOT) iso = TDB_ISOLATION_SNAPSHOT; + tidesdb_trx_t *trx = get_or_create_trx(thd, tidesdb_hton, iso); + if (!trx) return 1; + + /* We register at both statement and transaction level so the server + knows TidesDB is participating in this BEGIN block. */ + trans_register_ha(thd, false, tidesdb_hton, 0); + trans_register_ha(thd, true, tidesdb_hton, 0); + return 0; +} + +/* ******************** SHOW ENGINE TIDESDB STATUS ******************** */ + +static bool tidesdb_show_status(handlerton *hton, THD *thd, stat_print_fn *print, + enum ha_stat_type stat) +{ + if (stat != HA_ENGINE_STATUS) return false; + if (!tdb_global) return false; + + tidesdb_refresh_status_vars(); + + /* Database-level stats */ + tidesdb_db_stats_t db_st; + memset(&db_st, 0, sizeof(db_st)); + tidesdb_get_db_stats(tdb_global, &db_st); + + /* Cache stats */ + tidesdb_cache_stats_t cache_st; + memset(&cache_st, 0, sizeof(cache_st)); + tidesdb_get_cache_stats(tdb_global, &cache_st); + + /* Output buffer for SHOW ENGINE TIDESDB STATUS. 8 KiB is enough to + hold the fixed-format sections plus an optional object-store block + and the last-conflict line without truncating. */ + static constexpr uint TIDESDB_STATUS_BUF_LEN = 8192; + char buf[TIDESDB_STATUS_BUF_LEN]; + int pos = 0; + + pos += snprintf(buf + pos, sizeof(buf) - pos, + "================== TidesDB Engine Status ==================\n"); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Data directory: %s\n", tdb_path.c_str()); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Unified memtable: %s\n", + srv_unified_memtable ? 
"ON" : "OFF"); + pos += + snprintf(buf + pos, sizeof(buf) - pos, "Column families: %d\n", db_st.num_column_families); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Global sequence: %lu\n", + (unsigned long)db_st.global_seq); + pos += snprintf(buf + pos, sizeof(buf) - pos, "\n--- Memory ---\n"); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Total system memory: %lu MB\n", + (unsigned long)(db_st.total_memory / (1024 * 1024))); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Resolved memory limit: %lu MB\n", + (unsigned long)(db_st.resolved_memory_limit / (1024 * 1024))); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Memory pressure level: %d\n", + db_st.memory_pressure_level); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Total memtable bytes: %ld\n", + (long)db_st.total_memtable_bytes); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Transaction memory bytes: %ld\n", + (long)db_st.txn_memory_bytes); + pos += snprintf(buf + pos, sizeof(buf) - pos, "\n--- Storage ---\n"); + pos += + snprintf(buf + pos, sizeof(buf) - pos, "Total SSTables: %d\n", db_st.total_sstable_count); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Open SSTable handles: %d\n", + db_st.num_open_sstables); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Total data size: %lu bytes\n", + (unsigned long)db_st.total_data_size_bytes); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Immutable memtables: %d\n", + db_st.total_immutable_count); + pos += snprintf(buf + pos, sizeof(buf) - pos, "\n--- Background ---\n"); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Flush pending: %d\n", db_st.flush_pending_count); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Flush queue size: %lu\n", + (unsigned long)db_st.flush_queue_size); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Compaction queue size: %lu\n", + (unsigned long)db_st.compaction_queue_size); + pos += snprintf(buf + pos, sizeof(buf) - pos, "\n--- Block Cache ---\n"); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Enabled: 
%s\n", cache_st.enabled ? "YES" : "NO"); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Entries: %lu\n", + (unsigned long)cache_st.total_entries); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Size: %lu bytes\n", + (unsigned long)cache_st.total_bytes); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Hits: %lu\n", (unsigned long)cache_st.hits); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Misses: %lu\n", (unsigned long)cache_st.misses); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Hit rate: %.1f%%\n", + cache_st.hit_rate * PERCENT_SCALE); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Partitions: %lu\n", + (unsigned long)cache_st.num_partitions); + + /* Tombstone observability. Aggregates are populated by the + tidesdb_refresh_status_vars call at the top of this function, which + walks all CFs once. */ + pos += snprintf(buf + pos, sizeof(buf) - pos, "\n--- Tombstones ---\n"); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Total tombstones: %ld\n", + (long)srv_stat_total_tombstones); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Tombstone ratio: %.2f%%\n", + srv_stat_tombstone_ratio * PERCENT_SCALE); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Worst SSTable density: %.2f%% at level %ld\n", + srv_stat_max_sst_density * PERCENT_SCALE, (long)srv_stat_max_sst_density_level); + + /* Object store stats */ + if (db_st.object_store_enabled) + { + pos += snprintf(buf + pos, sizeof(buf) - pos, "\n--- Object Store ---\n"); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Connector: %s\n", + db_st.object_store_connector ? 
db_st.object_store_connector : "unknown"); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Total uploads: %lu\n", + (unsigned long)db_st.total_uploads); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Upload failures: %lu\n", + (unsigned long)db_st.total_upload_failures); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Upload queue depth: %lu\n", + (unsigned long)db_st.upload_queue_depth); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Local cache: %lu / %lu bytes (%d files)\n", + (unsigned long)db_st.local_cache_bytes_used, + (unsigned long)db_st.local_cache_bytes_max, db_st.local_cache_num_files); + pos += snprintf(buf + pos, sizeof(buf) - pos, "Replica mode: %s\n", + db_st.replica_mode ? "ON" : "OFF"); + } + + /* Last conflict info */ + mysql_mutex_lock(&last_conflict_mutex); + if (last_conflict_info[0]) + pos += + snprintf(buf + pos, sizeof(buf) - pos, "\n--- Conflicts ---\n%s\n", last_conflict_info); + mysql_mutex_unlock(&last_conflict_mutex); + + static constexpr const char TIDESDB_ENGINE_NAME[] = "TIDESDB"; + static constexpr uint TIDESDB_ENGINE_NAME_LEN = sizeof(TIDESDB_ENGINE_NAME) - 1; + return print(thd, TIDESDB_ENGINE_NAME, TIDESDB_ENGINE_NAME_LEN, "", 0, buf, (size_t)pos); +} + +/* ******************** Schema discovery (object store mode) ******************** */ +/* + The __tidesql_schema column family stores .frm binaries so that replicas + can discover table definitions via the handlerton discovery API. On + local-only mode schema_cf is NULL and all helpers are no-ops. +*/ + +/* + Build a schema CF key from db + table LEX_CSTRINGs. + Format-- "db_name\0table_name" (null byte separator, no trailing null). +*/ +static std::string schema_cf_key(const LEX_CSTRING &db, const LEX_CSTRING &tbl) +{ + std::string k; + k.reserve(db.length + sizeof(SCHEMA_CF_KEY_SEP) + tbl.length); + k.append(db.str, db.length); + k.push_back(SCHEMA_CF_KEY_SEP); + k.append(tbl.str, tbl.length); + return k; +} + +/* + Build a schema CF key from a MariaDB table path (e.g. 
"./db/table"). + Extracts the db and table components using the same logic as path_to_cf_name. +*/ +static std::string schema_cf_key_from_path(const char *path) +{ + std::string p(path); + + if (p.size() >= MARIADB_REL_PATH_PREFIX_LEN && + p.compare(0, MARIADB_REL_PATH_PREFIX_LEN, MARIADB_REL_PATH_PREFIX) == 0) + p = p.substr(MARIADB_REL_PATH_PREFIX_LEN); + + size_t last_slash = p.rfind('/'); + if (last_slash == std::string::npos) + { + /* No slashes -- we treat entire path as table name with empty db */ + std::string k; + k.push_back(SCHEMA_CF_KEY_SEP); + k.append(p); + return k; + } + + std::string tblname = p.substr(last_slash + 1); + + size_t prev_slash = (last_slash > 0) ? p.rfind('/', last_slash - 1) : std::string::npos; + std::string dbname; + if (prev_slash == std::string::npos) + dbname = p.substr(0, last_slash); + else + dbname = p.substr(prev_slash + 1, last_slash - prev_slash - 1); + + std::string k; + k.reserve(dbname.size() + sizeof(SCHEMA_CF_KEY_SEP) + tblname.size()); + k.append(dbname); + k.push_back(SCHEMA_CF_KEY_SEP); + k.append(tblname); + return k; +} + +/* + Store a .frm image in the schema CF. + + When frm_data/frm_len are provided the image is used directly (this is + the normal path during CREATE TABLE -- MariaDB skips writing .frm to + disk when discover_table is registered on the handlerton). + + When frm_data is NULL, the .frm is read from disk (ALTER TABLE path + where MariaDB writes the updated .frm before calling commit). + + No-op when schema_cf is NULL (local-only mode). +*/ +static int schema_cf_store_frm(const char *path, const uchar *frm_data = NULL, size_t frm_len = 0) +{ + /* Replica mode is read-only against the object store. Even a single + successful insert into the schema CF lands in the unified memtable + and gets flushed to a new SSTable when the bootstrap mariadbd + drains on shutdown, which then triggers a compaction whose + MANIFEST upload overwrites the primary's authoritative state. 
+ Refuse all schema writes here so the bucket stays clean. */ + if (srv_replica_mode) return 0; + if (!schema_cf) return 0; + + uchar *alloc_buf = NULL; + + if (!frm_data) + { + char frm_path[FN_REFLEN]; + fn_format(frm_path, path, "", reg_ext, MY_UNPACK_FILENAME | MY_APPEND_EXT); + + MY_STAT st; + if (!my_stat(frm_path, &st, MYF(0))) return 0; /* .frm not on disk -- not fatal */ + + File fd = my_open(frm_path, O_RDONLY, MYF(0)); + if (fd < 0) return 0; + + frm_len = (size_t)st.st_size; + alloc_buf = (uchar *)my_malloc(PSI_NOT_INSTRUMENTED, frm_len, MYF(0)); + if (!alloc_buf) + { + my_close(fd, MYF(0)); + return -1; + } + + if (my_read(fd, alloc_buf, frm_len, MYF(MY_NABP)) != 0) + { + my_free(alloc_buf); + my_close(fd, MYF(0)); + return 0; + } + my_close(fd, MYF(0)); + frm_data = alloc_buf; + } + + std::string key = schema_cf_key_from_path(path); + + tidesdb_txn_t *txn = NULL; + int rc = tidesdb_txn_begin(tdb_global, &txn); + if (rc == TDB_SUCCESS) + { + rc = tidesdb_txn_put(txn, schema_cf, (const uint8_t *)key.data(), key.size(), frm_data, + frm_len, TIDESDB_TTL_NONE); + if (rc == TDB_SUCCESS) + rc = tidesdb_txn_commit(txn); + else + tidesdb_txn_rollback(txn); + tidesdb_txn_free(txn); + } + + if (alloc_buf) my_free(alloc_buf); + return (rc == TDB_SUCCESS) ? 0 : -1; +} + +/* + Remove a table's .frm entry from the schema CF on DROP TABLE. +*/ +static void schema_cf_delete(const char *path) +{ + /* See the rationale in schema_cf_store_frm -- replica writes must not + reach the unified memtable or the bootstrap mariadbd's shutdown + drain will flush + compact + upload a MANIFEST that overwrites the + primary's. 
*/ + if (srv_replica_mode) return; + if (!schema_cf) return; + + std::string key = schema_cf_key_from_path(path); + tidesdb_txn_t *txn = NULL; + if (tidesdb_txn_begin(tdb_global, &txn) == TDB_SUCCESS) + { + tidesdb_txn_delete(txn, schema_cf, (const uint8_t *)key.data(), key.size()); + tidesdb_txn_commit(txn); + tidesdb_txn_free(txn); + } +} + +/* + Remove every schema CF entry belonging to a dropped database. + Keys are "db_name\0table_name" so we iterate the CF and delete entries + whose prefix matches. No-op in local-only mode (schema_cf is NULL). +*/ +static void schema_cf_delete_db(const std::string &db_name) +{ + /* Same rationale as schema_cf_store_frm -- never let a replica land + writes in the unified memtable. */ + if (srv_replica_mode) return; + if (!schema_cf || db_name.empty()) return; + + /* Match keys beginning with "db_name". */ + std::string prefix = db_name; + prefix.push_back(SCHEMA_CF_KEY_SEP); + + tidesdb_txn_t *txn = NULL; + if (tidesdb_txn_begin(tdb_global, &txn) != TDB_SUCCESS) return; + + tidesdb_iter_t *it = NULL; + if (tdb_iter_new_blocking(current_thd, txn, schema_cf, &it) != TDB_SUCCESS) + { + tidesdb_txn_rollback(txn); + tidesdb_txn_free(txn); + return; + } + + std::vector to_delete; + tidesdb_iter_seek(it, (const uint8_t *)prefix.data(), prefix.size()); + while (tidesdb_iter_valid(it)) + { + uint8_t *k = NULL; + size_t klen = 0; + if (tidesdb_iter_key(it, &k, &klen) != TDB_SUCCESS) break; + if (klen < prefix.size() || memcmp(k, prefix.data(), prefix.size()) != 0) break; + to_delete.emplace_back((const char *)k, klen); + tidesdb_iter_next(it); + } + tidesdb_iter_free(it); + + for (const auto &k : to_delete) + tidesdb_txn_delete(txn, schema_cf, (const uint8_t *)k.data(), k.size()); + + if (!to_delete.empty()) + tidesdb_txn_commit(txn); + else + tidesdb_txn_rollback(txn); + tidesdb_txn_free(txn); +} + +/* + Rename a table's schema CF entry (delete old key, insert under new key). + Called from rename_table(). 
+*/ +static void schema_cf_rename(const char *from, const char *to) +{ + /* Same rationale as schema_cf_store_frm -- never let a replica land + writes in the unified memtable. */ + if (srv_replica_mode) return; + if (!schema_cf) return; + + std::string old_key = schema_cf_key_from_path(from); + std::string new_key = schema_cf_key_from_path(to); + + tidesdb_txn_t *txn = NULL; + if (tidesdb_txn_begin(tdb_global, &txn) != TDB_SUCCESS) return; + + uint8_t *val = NULL; + size_t val_len = 0; + int rc = tidesdb_txn_get(txn, schema_cf, (const uint8_t *)old_key.data(), old_key.size(), &val, + &val_len); + if (rc == TDB_SUCCESS && val) + { + tidesdb_txn_put(txn, schema_cf, (const uint8_t *)new_key.data(), new_key.size(), val, + val_len, TIDESDB_TTL_NONE); + tidesdb_txn_delete(txn, schema_cf, (const uint8_t *)old_key.data(), old_key.size()); + tidesdb_txn_commit(txn); + tidesdb_free(val); + } + else + { + tidesdb_txn_rollback(txn); + if (val) tidesdb_free(val); + + /* We fallback, old key missing? we read .frm from disk at new path */ + schema_cf_store_frm(to); + } + + tidesdb_txn_free(txn); +} + +static void schema_cf_ensure_databases(); + +/* + Handlerton discover_table callback. + Called when MariaDB cannot find a .frm file on disk for a TidesDB table. + Reads the .frm binary from the schema CF and initializes the TABLE_SHARE. +*/ +static int tidesdb_discover_table(handlerton *, THD *thd, TABLE_SHARE *share) +{ + if (!schema_cf) return HA_ERR_NO_SUCH_TABLE; + + std::string key = schema_cf_key(share->db, share->table_name); + + tidesdb_txn_t *txn = NULL; + int rc = tidesdb_txn_begin(tdb_global, &txn); + if (rc != TDB_SUCCESS) return HA_ERR_NO_SUCH_TABLE; + + uint8_t *val = NULL; + size_t val_len = 0; + /* Wrap in the backpressure helper so reader-fd starvation or memtable + backpressure waits instead of immediately reporting "table missing". + Returning HA_ERR_NO_SUCH_TABLE for a transient BUSY puts MariaDB + in the discover loop the comment below warns about. 
*/ + rc = tdb_with_backpressure_wait(thd, + [&]() + { + return tidesdb_txn_get(txn, schema_cf, + (const uint8_t *)key.data(), + key.size(), &val, &val_len); + }); + tidesdb_txn_rollback(txn); /* read-only, no commit needed */ + tidesdb_txn_free(txn); + + if (rc == TDB_ERR_NOT_FOUND || !val) return HA_ERR_NO_SUCH_TABLE; + if (rc != TDB_SUCCESS) + { + /* IO / corruption / persistent BUSY surfaces as HA_ERR_CRASHED so + the operator sees the real cause instead of an opaque "table + not found". */ + if (val) tidesdb_free(val); + return tdb_rc_to_ha(rc, "tidesdb_discover_table"); + } + + /* We ensure the database directory exists. The primary may have created + this database after the replica started, and schema_cf_ensure_databases() + only runs at plugin init. A single stat() + conditional mkdir(). */ + { + char db_dir[FN_REFLEN]; + size_t dh_len = strlen(mysql_real_data_home); + snprintf(db_dir, sizeof(db_dir), "%s%s%.*s", mysql_real_data_home, + (dh_len > 0 && mysql_real_data_home[dh_len - 1] != '/') ? "/" : "", + (int)share->db.length, share->db.str); + MY_STAT st; + if (!my_stat(db_dir, &st, MYF(0))) my_mkdir(db_dir, TIDESDB_DB_DIR_MODE, MYF(0)); + } + + /* We verify the data CF actually exists before returning the .frm. + If the .frm is in the schema CF but the data CF hasn't been synced + yet (e.g. replica hasn't downloaded it from S3), returning the .frm + would cause handler::open() to fail with HA_ERR_NO_SUCH_TABLE. + MariaDB then retries discovery in an infinite loop (delete .frm -> + discover -> write .frm -> open fails -> delete .frm -> ...). */ + { + std::string cf_name = std::string(share->db.str, share->db.length) + CF_DB_TABLE_SEP + + std::string(share->table_name.str, share->table_name.length); + if (!tidesdb_get_column_family(tdb_global, cf_name.c_str())) + { + tidesdb_free(val); + return HA_ERR_NO_SUCH_TABLE; + } + } + + /* We parse .frm binary into TABLE_SHARE. 
+ write=true causes MariaDB to cache the .frm on disk so subsequent + opens skip discovery. */ + rc = share->init_from_binary_frm_image(thd, true, val, val_len); + + tidesdb_free(val); + return rc; +} + +/* + Handlerton discover_table_names callback. + Lists all TidesDB tables in a given database by scanning the schema CF + for keys with the matching "db\0" prefix. +*/ +static int tidesdb_discover_table_names(handlerton *, const LEX_CSTRING *db, MY_DIR *, + handlerton::discovered_list *result) +{ + if (!schema_cf) return 0; + + /* We ensure database directories are up-to-date. Picks up databases + created by the primary after this replica started. */ + schema_cf_ensure_databases(); + + std::string prefix; + prefix.reserve(db->length + sizeof(SCHEMA_CF_KEY_SEP)); + prefix.append(db->str, db->length); + prefix.push_back(SCHEMA_CF_KEY_SEP); + + tidesdb_txn_t *txn = NULL; + if (tidesdb_txn_begin(tdb_global, &txn) != TDB_SUCCESS) return 0; + + tidesdb_iter_t *iter = NULL; + if (tdb_iter_new_blocking(current_thd, txn, schema_cf, &iter) != TDB_SUCCESS || !iter) + { + tidesdb_txn_rollback(txn); + tidesdb_txn_free(txn); + return 0; + } + + tidesdb_iter_seek(iter, (const uint8_t *)prefix.data(), prefix.size()); + while (tidesdb_iter_valid(iter)) + { + uint8_t *kp = NULL; + size_t klen = 0; + if (tidesdb_iter_key(iter, &kp, &klen) != TDB_SUCCESS || !kp) break; + + if (klen < prefix.size() || memcmp(kp, prefix.data(), prefix.size()) != 0) break; + + /* Table name is everything after the "db\0" prefix */ + const char *tname = (const char *)kp + prefix.size(); + size_t tlen = klen - prefix.size(); + result->add_table(tname, tlen); + + tidesdb_iter_next(iter); + } + + tidesdb_iter_free(iter); + tidesdb_txn_rollback(txn); + tidesdb_txn_free(txn); + return 0; +} + +/* + Handlerton discover_table_existence callback. + Returns 1 if the table has an entry in the schema CF, 0 otherwise. 
+*/ +static int tidesdb_discover_table_existence(handlerton *, const char *db, const char *table_name) +{ + if (!schema_cf) return 0; + + /* Ensure database directories are up-to-date for replica discovery. */ + schema_cf_ensure_databases(); + + LEX_CSTRING db_lex = {db, strlen(db)}; + LEX_CSTRING tbl_lex = {table_name, strlen(table_name)}; + std::string key = schema_cf_key(db_lex, tbl_lex); + + tidesdb_txn_t *txn = NULL; + if (tidesdb_txn_begin(tdb_global, &txn) != TDB_SUCCESS) return 0; + + uint8_t *val = NULL; + size_t val_len = 0; + /* Same backpressure rationale as tidesdb_discover_table-- a transient + BUSY answer here lies to the SQL layer about the table's existence. */ + int rc = tdb_with_backpressure_wait(current_thd, + [&]() + { + return tidesdb_txn_get(txn, schema_cf, + (const uint8_t *)key.data(), + key.size(), &val, &val_len); + }); + tidesdb_txn_rollback(txn); + tidesdb_txn_free(txn); + if (val) tidesdb_free(val); + + return (rc == TDB_SUCCESS) ? 1 : 0; +} + +/* + Scan the schema CF for all unique database names and create any missing + database directories under mysql_real_data_home. This ensures that + replicas (which receive table definitions via S3) have the database + directory present so MariaDB will call discover_table_names for them. + Without the directory, MariaDB doesn't know the database exists and + never asks TidesDB about its tables. 
+*/ +static void schema_cf_ensure_databases() +{ + if (!schema_cf) return; + + tidesdb_txn_t *txn = NULL; + if (tidesdb_txn_begin(tdb_global, &txn) != TDB_SUCCESS) return; + + tidesdb_iter_t *iter = NULL; + if (tdb_iter_new_blocking(current_thd, txn, schema_cf, &iter) != TDB_SUCCESS || !iter) + { + tidesdb_txn_rollback(txn); + tidesdb_txn_free(txn); + return; + } + + std::unordered_set seen_dbs; + + tidesdb_iter_seek_to_first(iter); + while (tidesdb_iter_valid(iter)) + { + uint8_t *kp = NULL; + size_t klen = 0; + if (tidesdb_iter_key(iter, &kp, &klen) != TDB_SUCCESS || !kp) break; + + /* Key format-- "db_nametable_name" -- + we find the separator */ + const char *kstr = (const char *)kp; + size_t sep = 0; + for (; sep < klen; sep++) + { + if (kstr[sep] == SCHEMA_CF_KEY_SEP) break; + } + if (sep > 0 && sep < klen) + { + std::string dbname(kstr, sep); + if (seen_dbs.insert(dbname).second) + { + char db_dir[FN_REFLEN]; + size_t dh_len = strlen(mysql_real_data_home); + snprintf(db_dir, sizeof(db_dir), "%s%s%s", mysql_real_data_home, + (dh_len > 0 && mysql_real_data_home[dh_len - 1] != '/') ? 
"/" : "", + dbname.c_str()); + + MY_STAT st; + if (!my_stat(db_dir, &st, MYF(0))) + { + if (my_mkdir(db_dir, TIDESDB_DB_DIR_MODE, MYF(0)) == 0) + sql_print_information( + "[TIDESDB] Created database directory '%s' for schema discovery", + dbname.c_str()); + } + } + } + + tidesdb_iter_next(iter); + } + + tidesdb_iter_free(iter); + tidesdb_txn_rollback(txn); + tidesdb_txn_free(txn); +} + +/* ******************** Plugin init / deinit ******************** */ + +static int tidesdb_hton_drop_table(handlerton *, const char *path); +static void tidesdb_hton_drop_database(handlerton *, char *path); +static bool tidesdb_hton_flush_logs(handlerton *); +static int tidesdb_hton_panic(handlerton *, enum ha_panic_function flag); +static void tidesdb_hton_pre_shutdown(void); +static void tidesdb_hton_kill_query(handlerton *, THD *thd, enum thd_kill_levels level); + +static int tidesdb_init_func(void *p) +{ + DBUG_ENTER("tidesdb_init_func"); + + tidesdb_hton = (handlerton *)p; + tidesdb_hton->create = tidesdb_create_handler; + tidesdb_hton->flags = 0; + tidesdb_hton->savepoint_offset = sizeof(tidesdb_savepoint_t); + tidesdb_hton->tablefile_extensions = ha_tidesdb_exts; + tidesdb_hton->table_options = tidesdb_table_option_list; + tidesdb_hton->field_options = tidesdb_field_option_list; + tidesdb_hton->index_options = tidesdb_index_option_list; + tidesdb_hton->drop_table = tidesdb_hton_drop_table; + tidesdb_hton->drop_database = tidesdb_hton_drop_database; + + /* Handlerton transaction callbacks -- one TidesDB txn per BEGIN..COMMIT */ + tidesdb_hton->commit = tidesdb_commit; + tidesdb_hton->rollback = tidesdb_rollback; + tidesdb_hton->close_connection = tidesdb_close_connection; + + tidesdb_hton->savepoint_set = tidesdb_savepoint_set; + tidesdb_hton->savepoint_rollback = tidesdb_savepoint_rollback; + tidesdb_hton->savepoint_rollback_can_release_mdl = tidesdb_savepoint_rollback_can_release_mdl; + tidesdb_hton->savepoint_release = tidesdb_savepoint_release; + 
tidesdb_hton->start_consistent_snapshot = tidesdb_start_consistent_snapshot; + tidesdb_hton->show_status = tidesdb_show_status; + + /* Durability / lifecycle / cancellation hooks. */ + tidesdb_hton->flush_logs = tidesdb_hton_flush_logs; + tidesdb_hton->panic = tidesdb_hton_panic; + tidesdb_hton->pre_shutdown = tidesdb_hton_pre_shutdown; + tidesdb_hton->kill_query = tidesdb_hton_kill_query; + + mysql_mutex_init(0, &last_conflict_mutex, MY_MUTEX_INIT_FAST); + + /* Size the lock table to 8 * hardware threads, clamped into a + sensible range. Below the floor the hash collisions hurt; above + the ceiling we just burn memory without buying contention relief. */ + { + unsigned int hw = std::thread::hardware_concurrency(); + if (hw == 0) hw = 8; + ulong desired = (ulong)hw * 8; + if (desired < ROW_LOCK_PARTITIONS_MIN) desired = ROW_LOCK_PARTITIONS_MIN; + if (desired > ROW_LOCK_PARTITIONS_MAX) desired = ROW_LOCK_PARTITIONS_MAX; + row_lock_partitions = desired; + } + + /* my_malloc returns only malloc-default alignment (8 or 16 bytes), so + a struct declared alignas(64) can land misaligned in the array and + any 16-byte SSE store the compiler emits against it segfaults. + posix_memalign / _aligned_malloc give us the alignment the struct + actually requires; the matching free in deinit must use the same + allocator family. 
*/ + { + size_t sz = (size_t)row_lock_partitions * sizeof(tdb_lock_partition_t); + void *p = NULL; +#ifdef _WIN32 + p = _aligned_malloc(sz, alignof(tdb_lock_partition_t)); +#else + if (posix_memalign(&p, alignof(tdb_lock_partition_t), sz) != 0) p = NULL; +#endif + if (p) + { + memset(p, 0, sz); + lock_partitions = (tdb_lock_partition_t *)p; + } + else + { + lock_partitions = NULL; + } + } + if (lock_partitions) + { + for (ulong i = 0; i < row_lock_partitions; i++) + { + mysql_mutex_init(0, &lock_partitions[i].mutex, MY_MUTEX_INIT_FAST); + lock_partitions[i].chain = NULL; + lock_partitions[i].freelist = NULL; + } + } + + /* Initialize FTS stop word set with defaults */ + mysql_rwlock_init(tdb_stopword_lock_key, &tdb_stopword_lock); + tdb_load_default_stopwords(); + sql_print_information("[TIDESDB] Loaded %zu default stop words", tdb_stopwords.size()); + + /* Initialize FTS blend chars */ + mysql_rwlock_init(tdb_blend_lock_key, &tdb_blend_lock); + tdb_rebuild_blend_map(srv_fts_blend_chars); + + /* We use tidesdb_data_home_dir if set, otherwise compute + a sibling directory of the MariaDB data directory. 
*/ + if (srv_data_home_dir && srv_data_home_dir[0]) + { + tdb_path = srv_data_home_dir; + while (!tdb_path.empty() && tdb_path.back() == '/') tdb_path.pop_back(); + } + else + { + std::string data_home(mysql_real_data_home); + while (!data_home.empty() && data_home.back() == '/') data_home.pop_back(); + size_t slash_pos = data_home.rfind('/'); + if (slash_pos != std::string::npos) + tdb_path = data_home.substr(0, slash_pos + 1) + "tidesdb_data"; + else + tdb_path = "tidesdb_data"; + } + + static const int log_level_map[] = {TDB_LOG_DEBUG, TDB_LOG_INFO, TDB_LOG_WARN, + TDB_LOG_ERROR, TDB_LOG_FATAL, TDB_LOG_NONE}; + + tidesdb_config_t cfg = tidesdb_default_config(); + cfg.db_path = const_cast(tdb_path.c_str()); + cfg.num_flush_threads = (int)srv_flush_threads; + cfg.num_compaction_threads = (int)srv_compaction_threads; + cfg.log_level = (tidesdb_log_level_t)log_level_map[srv_log_level]; + /* The library caps concurrent flushes by config.max_concurrent_flushes + (default 4 in the library), independent of num_flush_threads, so + leaving the cap below the worker count would silently idle workers. + Default tidesdb_max_concurrent_flushes=0 means align the cap with + tidesdb_flush_threads so every worker can run. A non-zero user + value is honoured but warned when it leaves workers idle. */ + if (srv_max_concurrent_flushes == 0) + { + cfg.max_concurrent_flushes = (int)srv_flush_threads; + } + else + { + cfg.max_concurrent_flushes = (int)srv_max_concurrent_flushes; + if (srv_max_concurrent_flushes < srv_flush_threads) + sql_print_warning( + "[TIDESDB] tidesdb_max_concurrent_flushes=%lu is lower than " + "tidesdb_flush_threads=%lu, %lu flush worker(s) will remain idle. 
" + "Raise tidesdb_max_concurrent_flushes to at least %lu (or leave it " + "at 0 to align automatically) to use every configured worker", + srv_max_concurrent_flushes, srv_flush_threads, + srv_flush_threads - srv_max_concurrent_flushes, srv_flush_threads); + } + cfg.block_cache_size = (size_t)srv_block_cache_size; + cfg.max_open_sstables = (int)srv_max_open_sstables; + cfg.log_to_file = srv_log_to_file ? 1 : 0; + cfg.log_truncation_at = (size_t)srv_log_truncation_at; + cfg.max_memory_usage = (size_t)srv_max_memory_usage; + cfg.unified_memtable = srv_unified_memtable ? 1 : 0; + cfg.unified_memtable_write_buffer_size = (size_t)srv_unified_memtable_write_buffer_size; + cfg.unified_memtable_sync_mode = tdb_sync_mode_map[srv_unified_memtable_sync_mode]; + cfg.unified_memtable_sync_interval_us = (uint64_t)srv_unified_memtable_sync_interval; + cfg.unified_memtable_skip_list_max_level = (int)srv_unified_memtable_skip_list_max_level; + cfg.unified_memtable_skip_list_probability = (float)srv_unified_memtable_skip_list_probability; + + /* Object store connector setup */ + tidesdb_objstore_t *objstore_connector = NULL; + static tidesdb_objstore_config_t objstore_cfg; + + if (srv_object_store_backend == OBJSTORE_BACKEND_S3) + { +#ifdef TIDESDB_WITH_S3 + if (!srv_s3_endpoint || !srv_s3_bucket || !srv_s3_access_key || !srv_s3_secret_key) + { + sql_print_error( + "[TIDESDB] S3 backend requires s3_endpoint, s3_bucket, " + "s3_access_key, and s3_secret_key"); + DBUG_RETURN(1); + } + + /* Modern config-struct entry exposes the TLS + multipart knobs the + legacy positional create cannot. Zero-initialize then fill so any + field added by the library in the future stays at its + secure-default value until the plugin surfaces it. 
*/ + tidesdb_objstore_s3_config_t s3cfg; + memset(&s3cfg, 0, sizeof(s3cfg)); + s3cfg.endpoint = srv_s3_endpoint; + s3cfg.bucket = srv_s3_bucket; + s3cfg.prefix = srv_s3_prefix; + s3cfg.access_key = srv_s3_access_key; + s3cfg.secret_key = srv_s3_secret_key; + s3cfg.region = srv_s3_region; + s3cfg.use_ssl = srv_s3_use_ssl ? 1 : 0; + s3cfg.use_path_style = srv_s3_path_style ? 1 : 0; + s3cfg.tls_ca_path = + (srv_s3_tls_ca_path && srv_s3_tls_ca_path[0]) ? srv_s3_tls_ca_path : NULL; + s3cfg.tls_insecure_skip_verify = srv_s3_tls_insecure_skip_verify ? 1 : 0; + s3cfg.multipart_threshold = (size_t)srv_s3_multipart_threshold; + s3cfg.multipart_part_size = (size_t)srv_s3_multipart_part_size; + + if (s3cfg.tls_insecure_skip_verify) + { + sql_print_warning( + "[TIDESDB] s3_tls_insecure_skip_verify is ON; the S3 endpoint's " + "TLS certificate is not validated. Use only for trusted test endpoints."); + } + + objstore_connector = tidesdb_objstore_s3_create_config(&s3cfg); + + if (!objstore_connector) + { + sql_print_error("[TIDESDB] Failed to create S3 connector for %s/%s", srv_s3_endpoint, + srv_s3_bucket); + DBUG_RETURN(1); + } + + sql_print_information("[TIDESDB] S3 connector created (endpoint=%s, bucket=%s, ssl=%s)", + srv_s3_endpoint, srv_s3_bucket, srv_s3_use_ssl ? "yes" : "no"); +#else + sql_print_error( + "[TIDESDB] S3 backend requested but TidesDB was not built with " + "-DTIDESDB_WITH_S3=ON"); + DBUG_RETURN(1); +#endif + } + + if (objstore_connector) + { + objstore_cfg = tidesdb_objstore_default_config(); + objstore_cfg.local_cache_max_bytes = (size_t)srv_objstore_local_cache_max; + objstore_cfg.wal_sync_threshold_bytes = (size_t)srv_objstore_wal_sync_threshold; + objstore_cfg.wal_sync_on_commit = srv_objstore_wal_sync_on_commit ? 1 : 0; + objstore_cfg.cache_on_read = srv_objstore_cache_on_read ? 1 : 0; + objstore_cfg.cache_on_write = srv_objstore_cache_on_write ? 
1 : 0; + if (srv_objstore_max_concurrent_uploads > 0) + objstore_cfg.max_concurrent_uploads = (int)srv_objstore_max_concurrent_uploads; + if (srv_objstore_max_concurrent_downloads > 0) + objstore_cfg.max_concurrent_downloads = (int)srv_objstore_max_concurrent_downloads; + if (srv_objstore_multipart_threshold > 0) + objstore_cfg.multipart_threshold = (size_t)srv_objstore_multipart_threshold; + if (srv_objstore_multipart_part_size > 0) + objstore_cfg.multipart_part_size = (size_t)srv_objstore_multipart_part_size; + objstore_cfg.sync_manifest_to_object = srv_objstore_sync_manifest_to_object ? 1 : 0; + objstore_cfg.wal_upload_sync = srv_objstore_wal_upload_sync ? 1 : 0; + objstore_cfg.replicate_wal = srv_objstore_replicate_wal ? 1 : 0; + objstore_cfg.replica_mode = srv_replica_mode ? 1 : 0; + objstore_cfg.replica_sync_interval_us = (uint64_t)srv_replica_sync_interval; + objstore_cfg.replica_replay_wal = srv_objstore_replica_replay_wal ? 1 : 0; + + cfg.object_store = objstore_connector; + cfg.object_store_config = &objstore_cfg; + } + + int rc = tidesdb_open(&cfg, &tdb_global); + if (rc != TDB_SUCCESS) + { + sql_print_error("[TIDESDB] Failed to open TidesDB at %s (err=%d)", tdb_path.c_str(), rc); + DBUG_RETURN(1); + } + + sql_print_information("[TIDESDB] TidesDB opened at %s", tdb_path.c_str()); + + /* Schema discovery CF -- created when object store is active so that + replicas can discover table definitions from the shared storage. 
*/ + if (objstore_connector) + { + tidesdb_column_family_config_t schema_cfg = tidesdb_default_column_family_config(); + if (!tidesdb_get_column_family(tdb_global, SCHEMA_CF_NAME)) + tidesdb_create_column_family(tdb_global, SCHEMA_CF_NAME, &schema_cfg); + + schema_cf = tidesdb_get_column_family(tdb_global, SCHEMA_CF_NAME); + + if (schema_cf) + { + tidesdb_hton->discover_table = tidesdb_discover_table; + tidesdb_hton->discover_table_names = tidesdb_discover_table_names; + tidesdb_hton->discover_table_existence = tidesdb_discover_table_existence; + + /* We ensure database directories exist for all tables in the schema + CF so MariaDB discovers them (relevant for replicas). */ + schema_cf_ensure_databases(); + + sql_print_information("[TIDESDB] Schema discovery enabled (object store mode)"); + } + } + + DBUG_RETURN(0); +} + +/* + Handlerton-level FLUSH LOGS callback. Called on FLUSH LOGS and by + mariadb-backup before copying files so the on-disk WAL is a consistent + snapshot. With unified-memtable mode one sync covers all CFs. In + per-CF mode we sync the schema CF (always present in object-store + mode; otherwise we try the first registered CF). Returns false on + success (handlerton convention). 
+*/ +static bool tidesdb_hton_flush_logs(handlerton *) +{ + if (!tdb_global) return false; + + tidesdb_column_family_t *target = schema_cf; + if (!target) + { + char **names = NULL; + int count = 0; + if (tidesdb_list_column_families(tdb_global, &names, &count) == TDB_SUCCESS && names) + { + if (count > 0 && names[0]) target = tidesdb_get_column_family(tdb_global, names[0]); + for (int i = 0; i < count; i++) + if (names[i]) tidesdb_free(names[i]); + tidesdb_free(names); + } + } + if (!target) return false; /* empty database -- nothing to sync */ + + int rc = tidesdb_sync_wal(target); + if (rc != TDB_SUCCESS) + { + sql_print_warning("[TIDESDB] flush_logs: tidesdb_sync_wal failed (rc=%d)", rc); + return true; /* error */ + } + return false; +} + +/* + Handlerton-level panic callback. MariaDB calls this on signal-driven or + abnormal shutdown paths where tidesdb_deinit_func may not run. We only + react to HA_PANIC_CLOSE -- the other flags are legacy ISAM-era. +*/ +static int tidesdb_hton_panic(handlerton *, enum ha_panic_function flag) +{ + if (flag != HA_PANIC_CLOSE) return 0; + if (tdb_global) + { + tidesdb_close(tdb_global); + tdb_global = NULL; + schema_cf = NULL; + } + return 0; +} + +/* + Handlerton-level pre_shutdown callback. Runs before the deinit path so + background threads that still need a fully-functional server (compaction, + flush) get a clean signal to drain. We flush the unified WAL synchronously + and let tidesdb_close() in deinit finish the teardown. +*/ +static void tidesdb_hton_pre_shutdown(void) +{ + if (!tdb_global) return; + + /* Sync the unified WAL so durability is preserved if deinit is racing + a forced exit. The call is cheap when there's nothing to sync. */ + (void)tidesdb_hton_flush_logs(tidesdb_hton); +} + +/* + Handlerton-level kill_query callback. MariaDB calls this on KILL QUERY + and on connection shutdown. 
When the victim is blocked in + row_lock_acquire we wake it by broadcasting on the lock entry's cond, + and the wait loop sees thd_killed() on the next pass and bails out. + Spurious wake-ups are harmless because the wait loop re-checks + req->granted before exiting. + + trx->waiting_on_lock points directly at the lock entry, which is never + freed at runtime, so dereferencing it here is always safe. +*/ +static void tidesdb_hton_kill_query(handlerton *, THD *thd, enum thd_kill_levels) +{ + if (!thd) return; + tidesdb_trx_t *trx = (tidesdb_trx_t *)thd_get_ha_data(thd, tidesdb_hton); + if (!trx) return; + + tdb_row_lock_t *wait = trx->waiting_on_lock.load(std::memory_order_acquire); + if (!wait) return; + + /* We broadcast under the owning partition's mutex so the wake-up is + serialized against the holder's release path. Partition index is + cached on the lock entry so we don't have to recompute the hash. */ + if (lock_partitions && wait->partition < row_lock_partitions) + { + tdb_lock_partition_t *part = &lock_partitions[wait->partition]; + mysql_mutex_lock(&part->mutex); + mysql_cond_broadcast(&wait->cond); + mysql_mutex_unlock(&part->mutex); + } +} + +static int tidesdb_deinit_func(void *p) +{ + DBUG_ENTER("tidesdb_deinit_func"); + + schema_cf = NULL; + + if (tdb_global) + { + /* Opt-in fast-shutdown: cancel in-flight compactions and refuse new + background work so tidesdb_close does not block for minutes on a + multi-GB compaction backlog. Uncommitted compaction output is + discarded (inputs intact -- recovery is safe), but a mid-compaction + cancel can leave the object-store side with referenced-but-orphan + SSTables that confuse a syncing replica, so this is OFF by default + and tidesdb_close drains naturally. 
*/ + if (srv_fast_shutdown) + { + int crc = tidesdb_cancel_background_work(tdb_global); + if (crc != TDB_SUCCESS) + sql_print_warning( + "[TIDESDB] tidesdb_cancel_background_work returned rc=%d at " + "shutdown; tidesdb_close may block waiting for in-flight work", + crc); + } + tidesdb_close(tdb_global); + tdb_global = NULL; + } + + mysql_mutex_destroy(&last_conflict_mutex); + mysql_rwlock_destroy(&tdb_stopword_lock); + mysql_rwlock_destroy(&tdb_blend_lock); + tdb_stopwords.clear(); + + if (lock_partitions) + { + for (ulong i = 0; i < row_lock_partitions; i++) + { + /* Free everything on the active chain and the freelist; both + lists thread through hash_next and the freelist holds slots + that were unlinked from the chain at release time. */ + for (tdb_row_lock_t *e = lock_partitions[i].chain; e;) + { + tdb_row_lock_t *next = e->hash_next; + mysql_cond_destroy(&e->cond); + my_free(e->pk); + my_free(e); + e = next; + } + for (tdb_row_lock_t *e = lock_partitions[i].freelist; e;) + { + tdb_row_lock_t *next = e->hash_next; + mysql_cond_destroy(&e->cond); + my_free(e->pk); + my_free(e); + e = next; + } + mysql_mutex_destroy(&lock_partitions[i].mutex); + } + /* Allocated via posix_memalign / _aligned_malloc in tidesdb_init_func; + pair with the matching free. */ +#ifdef _WIN32 + _aligned_free(lock_partitions); +#else + free(lock_partitions); +#endif + lock_partitions = NULL; + } + + sql_print_information("[TIDESDB] TidesDB closed"); + DBUG_RETURN(0); +} + +/* ******************** path_to_cf_name ******************** */ + +std::string ha_tidesdb::path_to_cf_name(const char *path) +{ + std::string p(path); + + if (p.size() >= MARIADB_REL_PATH_PREFIX_LEN && + p.compare(0, MARIADB_REL_PATH_PREFIX_LEN, MARIADB_REL_PATH_PREFIX) == 0) + p = p.substr(MARIADB_REL_PATH_PREFIX_LEN); + + size_t last_slash = p.rfind('/'); + if (last_slash == std::string::npos) return p; + + std::string tblname = p.substr(last_slash + 1); + + size_t prev_slash = (last_slash > 0) ? 
p.rfind('/', last_slash - 1) : std::string::npos; + std::string dbname; + if (prev_slash == std::string::npos) + dbname = p.substr(0, last_slash); + else + dbname = p.substr(prev_slash + 1, last_slash - prev_slash - 1); + + std::string result = dbname + CF_DB_TABLE_SEP + tblname; + + /* MariaDB temp table names embed '#'; substitute so the CF name + remains a valid identifier in the underlying TidesDB layer. */ + for (size_t i = 0; i < result.size(); i++) + if (result[i] == MARIADB_TEMP_NAME_MARKER) result[i] = MARIADB_TEMP_NAME_REPLACEMENT; + + return result; +} + +/* ******************** Factory / Constructor ******************** */ + +static handler *tidesdb_create_handler(handlerton *hton, TABLE_SHARE *table, MEM_ROOT *mem_root) +{ + return new (mem_root) ha_tidesdb(hton, table); +} + +ha_tidesdb::ha_tidesdb(handlerton *hton, TABLE_SHARE *table_arg) + : handler(hton, table_arg), + share(NULL), + stmt_txn(NULL), + stmt_txn_dirty(false), + scan_txn(NULL), + scan_iter(NULL), + scan_cf_(NULL), + scan_iter_cf_(NULL), + scan_iter_txn_(NULL), + scan_iter_txn_gen_(0), + idx_pk_exact_done_(false), + scan_dir_(DIR_NONE), + current_pk_len_(0), + idx_search_comp_len_(0), + dup_iter_count_(0), + cached_enc_key_ver_(0), + enc_key_ver_valid_(false), + cached_time_(0), + cached_time_valid_(false), + cached_sess_ttl_(0), + cached_skip_unique_(false), + cached_single_delete_primary_(false), + cached_thdvars_valid_(false), + stmt_has_write_lock_(false), + is_pk_(false), + scan_iter_last_err_(0), + scan_iter_last_err_cf_(NULL), + scan_iter_last_err_txn_(NULL), + has_blobs_(false), + encrypted_(false), + record1_lo_(NULL), + record1_hi_(NULL), + cached_sql_cmd_(0), + cached_is_autocommit_(false), + cached_stmt_shape_valid_(false), + cached_thd_(NULL), + cached_trx_(NULL), + in_bulk_insert_(false), + in_bulk_update_(false), + in_bulk_delete_(false), + bulk_insert_ops_(0), + cached_compact_after_range_delete_min_rows_(0), + bulk_delete_rows_(0), + mrr_custom_active_(false), + 
mrr_no_assoc_(false), + mrr_keyno_(MAX_KEY), + mrr_next_idx_(0), + keyread_only_(false), + write_can_replace_(false) +{ + memset(dup_iter_cache_, 0, sizeof(dup_iter_cache_)); + memset(dup_iter_txn_, 0, sizeof(dup_iter_txn_)); + memset(dup_iter_txn_gen_, 0, sizeof(dup_iter_txn_gen_)); +} + +/* ******************** free_dup_iter_cache ******************** */ + +void ha_tidesdb::free_dup_iter_cache() +{ + for (uint i = 0; i < MAX_KEY; i++) + { + if (dup_iter_cache_[i]) + { + tidesdb_iter_free(dup_iter_cache_[i]); + dup_iter_cache_[i] = NULL; + dup_iter_txn_[i] = NULL; + dup_iter_txn_gen_[i] = 0; + } + } + dup_iter_count_ = 0; +} + +/* ******************** get_share ******************** */ + +TidesDB_share *ha_tidesdb::get_share() +{ + TidesDB_share *tmp_share; + DBUG_ENTER("ha_tidesdb::get_share"); + + lock_shared_ha_data(); + if (!(tmp_share = static_cast(get_ha_share_ptr()))) + { + tmp_share = new TidesDB_share; + if (!tmp_share) goto err; + set_ha_share_ptr(static_cast(tmp_share)); + } +err: + unlock_shared_ha_data(); + DBUG_RETURN(tmp_share); +} + +/* ******************** PK / Index key helpers ******************** */ + +/* + We build memcmp-comparable key bytes from record fields for a given KEY. + Uses Field::make_sort_key_part() so that big-endian, sign-bit-flipped encoding + is produced for numeric types -- which sorts correctly under memcmp. + + The record may point to record[0] or record[1]; we adjust field pointers + via move_field_offset to read from the correct buffer. 
+*/ +uint ha_tidesdb::make_comparable_key(KEY *key_info, const uchar *record, uint num_parts, uchar *out) +{ + uint pos = 0; + my_ptrdiff_t ptrdiff = (my_ptrdiff_t)(record - table->record[0]); + + for (uint p = 0; p < num_parts && p < key_info->user_defined_key_parts; p++) + { + KEY_PART_INFO *kp = &key_info->key_part[p]; + Field *field = kp->field; + + /* We handle the null indicator ourselves using real_maybe_null() + (which checks field-level nullability only) instead of relying on + make_sort_key_part() which uses maybe_null() (includes + table->maybe_null). For inner tables of outer joins, + table->maybe_null is true, causing make_sort_key_part to write + a spurious null indicator byte even for NOT NULL PK fields. + Using make_sort_key() directly avoids this mismatch. */ + field->move_field_offset(ptrdiff); + if (field->real_maybe_null()) + { + if (field->is_null()) + { + out[pos++] = SORT_KEY_NULL; + bzero(out + pos, kp->length); + pos += kp->length; + field->move_field_offset(-ptrdiff); + continue; + } + out[pos++] = SORT_KEY_NOT_NULL; + } + /* For VARBINARY (binary charset variable-length fields), sort_string() + stores the value length in the last length_bytes of the output, + truncating trailing data bytes when the value fills the field. + This causes false duplicate detection on UNIQUE indexes because + different values produce identical sort keys. + Thus for binary charset varstrings, write all data bytes zero-padded + followed by the length, so the full value is preserved. 
*/ + if (field->type() == MYSQL_TYPE_VARCHAR && field->charset() == &my_charset_bin) + { + Field_varstring *fvs = static_cast(field); + String buf; + fvs->val_str(&buf, &buf); + uint data_len = (uint)buf.length(); + uint len_bytes = fvs->length_bytes; + uint data_space = kp->length - len_bytes; + + uint copy_len = MY_MIN(data_len, data_space); + memcpy(out + pos, buf.ptr(), copy_len); + if (copy_len < data_space) bzero(out + pos + copy_len, data_space - copy_len); + pos += data_space; + + /* For values that overflow data_space (value is exactly field_length + bytes), write the overflow bytes into the length area first */ + if (data_len > data_space) + { + uint overflow = MY_MIN(data_len - data_space, len_bytes); + memcpy(out + pos, buf.ptr() + data_space, overflow); + pos += len_bytes; + } + else + { + /* Length suffix in high-byte order (preserves sort order) */ + if (len_bytes == 1) + out[pos] = (uchar)data_len; + else + mi_int2store(out + pos, data_len); + pos += len_bytes; + } + + field->move_field_offset(-ptrdiff); + continue; + } + + field->sort_string(out + pos, kp->length); + field->move_field_offset(-ptrdiff); + pos += kp->length; + } + + return pos; +} + +/* + Convert a key_copy-format search key (as passed to index_read_map) + into the comparable format that we store in TidesDB. + Uses key_restore to unpack into record[1], then make_comparable_key. +*/ +uint ha_tidesdb::key_copy_to_comparable(KEY *key_info, const uchar *key_buf, uint key_len, + uchar *out) +{ + key_restore(table->record[1], key_buf, key_info, key_len); + + uint parts = 0; + uint len = 0; + for (parts = 0; parts < key_info->user_defined_key_parts; parts++) + { + uint part_len = key_info->key_part[parts].store_length; + if (len + part_len > key_len) break; + len += part_len; + } + if (parts == 0) parts = 1; + + return make_comparable_key(key_info, table->record[1], parts, out); +} + +/* + Build PK bytes from a record. + -- With user PK use make_comparable_key for memcmp-correct ordering. 
+ -- Without PK not applicable for NEW rows (caller generates hidden id); + for EXISTING rows current_pk already holds the key. +*/ +uint ha_tidesdb::pk_from_record(const uchar *record, uchar *out) +{ + if (share->has_user_pk) + { + return make_comparable_key(&table->key_info[share->pk_index], record, + table->key_info[share->pk_index].user_defined_key_parts, out); + } + else + { + /* Hidden PK -- we copy current_pk (must have been set by a prior read) */ + memcpy(out, current_pk_buf_, current_pk_len_); + return current_pk_len_; + } +} + +/* + Compute the comparable key byte length for a KEY. + Matches what make_comparable_key() actually produces: + sum of (nullable ? 1 : 0) + kp->length for each key part. + + NOTE -- ki->key_length includes store_length overhead (e.g. 2 bytes + per VARCHAR part for length prefix in key_copy format) which is + not present in the comparable key output. +*/ +static uint comparable_key_length(const KEY *ki) +{ + /* Spatial indexes use a fixed 8-byte Hilbert value as the comparable key.. */ + if (is_spatial_index(ki)) return SPATIAL_HILBERT_KEY_LEN; + + uint len = 0; + for (uint p = 0; p < ki->user_defined_key_parts; p++) + { + if (ki->key_part[p].field->real_maybe_null()) len++; + len += ki->key_part[p].length; + } + return len; +} + +/* + Build a secondary index CF entry key: + [comparable index-column bytes] + [comparable PK bytes] +*/ +uint ha_tidesdb::sec_idx_key(uint idx, const uchar *record, uchar *out) +{ + KEY *key_info = &table->key_info[idx]; + uint pos = make_comparable_key(key_info, record, key_info->user_defined_key_parts, out); + pos += pk_from_record(record, out + pos); + return pos; +} + +/* + Try to fill record buf with column values decoded from the secondary + index key, avoiding the expensive PK point-lookup. Used when + keyread_only_ is true (covering index scan). 
+ + The secondary index key layout is: + [comparable_idx_cols | comparable_pk] + + Uses decode_sort_key_part() which supports integers, DATE, DATETIME, + TIMESTAMP, YEAR, and fixed-length CHAR/BINARY (binary/latin1). + Returns true on success. +*/ +bool ha_tidesdb::try_keyread_from_index(const uint8_t *ik, size_t iks, uint idx, uchar *buf) +{ + if (!share->has_user_pk) return false; + + KEY *pk_key = &table->key_info[share->pk_index]; + KEY *idx_key = &table->key_info[idx]; + uint idx_col_len = share->idx_comp_key_len[idx]; + + /* We check every column in read_set against the precomputed coverage + bitmap for this index. O(read_set set-bits) instead of the prior + O(set-bits * (pk_parts + idx_parts)) nested scan. */ + if (idx < share->idx_cover.size()) + { + const std::vector &cover = share->idx_cover[idx]; + for (uint c = bitmap_get_first_set(table->read_set); c != MY_BIT_NONE; + c = bitmap_get_next_set(table->read_set, c)) + { + if (c >= cover.size() || !cover[c]) return false; + } + } + else + { + /* Share not populated for this index (shouldn't happen). 
*/ + return false; + } + + const uint8_t *pos = ik; + for (uint p = 0; p < idx_key->user_defined_key_parts; p++) + { + KEY_PART_INFO *kp = &idx_key->key_part[p]; + Field *f = kp->field; + if (f->real_maybe_null()) + { + if (pos >= ik + iks) return false; + if (*pos == 0) + { + f->set_null(); + pos++; + continue; + } + f->set_notnull(); + pos++; + } + if (pos + kp->length > ik + iks) return false; + if (bitmap_is_set(table->read_set, kp->fieldnr - 1)) + { + if (!decode_sort_key_part(pos, kp->length, f, buf)) return false; + } + pos += kp->length; + } + + const uint8_t *pk_start = ik + idx_col_len; + pos = pk_start; + for (uint p = 0; p < pk_key->user_defined_key_parts; p++) + { + KEY_PART_INFO *kp = &pk_key->key_part[p]; + Field *f = kp->field; + if (f->real_maybe_null()) + { + if (pos >= ik + iks) return false; + if (*pos == 0) + { + f->set_null(); + pos++; + continue; + } + f->set_notnull(); + pos++; + } + if (pos + kp->length > ik + iks) return false; + if (bitmap_is_set(table->read_set, kp->fieldnr - 1)) + { + if (!decode_sort_key_part(pos, kp->length, f, buf)) return false; + } + pos += kp->length; + } + + uint pk_bytes = (uint)(iks - idx_col_len); + memcpy(current_pk_buf_, pk_start, pk_bytes); + current_pk_len_ = pk_bytes; + + return true; +} + +/* ******************** ICP (Index Condition Pushdown) helpers ******************** */ + +/* + Reverse a single integer sort-key part (big-endian, sign-bit-flipped) + back to native little-endian at `to`. Caller precomputes `to` so we + don't re-walk f->ptr/f->table->record[0] on every decode. + + MariaDB integer pack widths are TINY=1, SHORT=2, INT24=3, LONG=4, + LONGLONG=8 -- any other width is rejected. The decode is a plain + byte-reverse, with the most-significant byte XORed with the sign + flip mask for signed types so the original native value is recovered. 
+*/
+bool ha_tidesdb::decode_int_sort_key(const uint8_t *src, uint sort_len, bool is_signed, uchar *to)
+{
+    /* Accept widths 1..4 and 8 only. */
+    if (sort_len == 0 || (sort_len > 4 && sort_len != 8)) return false;
+
+    for (uint i = 0; i < sort_len; i++) to[i] = src[sort_len - 1 - i];
+    /* After the reversal the most-significant byte is the last one. */
+    if (is_signed) to[sort_len - 1] ^= INT_SORT_SIGN_FLIP_MASK;
+    return true;
+}
+
+/*
+  Extended sort-key decoder.  Writes the native column image for field f
+  into the record buffer buf (at f's offset within the record) from the
+  sort_len sort-key bytes at src.
+
+  Supported types:
+    -- integers: delegated to decode_int_sort_key (endian reversal plus
+       sign-bit flip).
+    -- YEAR: single byte, copied as-is.
+    -- DATE: DATE_PACK_LEN bytes, big-endian sort key reversed to the
+       little-endian stored form.
+    -- DATETIME / TIMESTAMP without fractional seconds (old format):
+       stored little-endian, so the sort key is byte-reversed.
+    -- DATETIME2 / TIMESTAMP2, and old-format DATETIME / TIMESTAMP with
+       fractional seconds: MariaDB stores these big-endian and their
+       sort_string() is a plain copy of the stored bytes, so the sort key
+       is copied back unchanged.  Reversing it would corrupt the value.
+    -- CHAR/BINARY with the binary charset, full column length only: the
+       sort key equals the stored bytes.
+
+  Not supported (returns false):
+    -- any CHAR with a non-binary collation.  That includes
+       my_charset_latin1 (latin1_swedish_ci), whose sort weights fold
+       case and accents and therefore cannot be turned back into the
+       stored text.
+    -- CHAR/BINARY key parts shorter than the column (prefix keys): the
+       full value cannot be reconstructed from a prefix.
+    -- every other type.
+
+  Returns true on success, false for anything it cannot decode.
+*/
+bool ha_tidesdb::decode_sort_key_part(const uint8_t *src, uint sort_len, Field *f, uchar *buf)
+{
+    /* Destination of this column inside buf, computed once per call. */
+    uchar *to = buf + (uintptr_t)(f->ptr - f->table->record[0]);
+
+    switch (f->real_type())
+    {
+        case MYSQL_TYPE_TINY:
+        case MYSQL_TYPE_SHORT:
+        case MYSQL_TYPE_INT24:
+        case MYSQL_TYPE_LONG:
+        case MYSQL_TYPE_LONGLONG:
+            return decode_int_sort_key(src, sort_len, !f->is_unsigned(), to);
+
+        case MYSQL_TYPE_YEAR:
+            /* YEAR is 1 byte unsigned, sort key is identity */
+            to[0] = src[0];
+            return true;
+
+        case MYSQL_TYPE_DATE:
+        case MYSQL_TYPE_NEWDATE:
+            /* DATE is DATE_PACK_LEN bytes, sort key is big-endian unsigned.
+               Reverse to native little-endian. */
+            if (sort_len == DATE_PACK_LEN)
+            {
+                for (uint b = 0; b < sort_len; b++) to[b] = src[sort_len - 1 - b];
+                return true;
+            }
+            return false;
+
+        case MYSQL_TYPE_DATETIME:
+        case MYSQL_TYPE_TIMESTAMP:
+            if (sort_len > DATETIME_MAX_PACK_LEN) return false;
+            if (f->decimals() != 0)
+            {
+                /* Old-format fractional variants are stored big-endian and
+                   sorted with a plain copy: restore with a plain copy. */
+                memcpy(to, src, sort_len);
+                return true;
+            }
+            /* Whole-second old format is stored little-endian; the sort key
+               is its byte-reversed image. */
+            for (uint b = 0; b < sort_len; b++) to[b] = src[sort_len - 1 - b];
+            return true;
+
+        case MYSQL_TYPE_DATETIME2:
+        case MYSQL_TYPE_TIMESTAMP2:
+            /* Packed temporal format: stored bytes are already big-endian
+               and memcmp-ordered, so the sort key is the stored image. */
+            if (sort_len > DATETIME_MAX_PACK_LEN) return false;
+            memcpy(to, src, sort_len);
+            return true;
+
+        case MYSQL_TYPE_STRING:
+            /* Only the binary charset has a sort key identical to the stored
+               bytes, and only a full-length key part carries the whole
+               value. */
+            if (f->charset() != &my_charset_bin) return false;
+            if (sort_len != f->pack_length()) return false;
+            memcpy(to, src, sort_len);
+            return true;
+
+        default:
+            return false;
+    }
+}
+
+/*
+  Evaluate pushed index condition on a secondary-index entry before
+  the expensive PK point-lookup (InnoDB pattern).
+ + Decodes the index key column values and PK column values from the + comparable-format index key into the record buffer, then calls + handler_index_cond_check() which evaluates the pushed condition, + checks end_range, and handles THD kill signals. + + Supports integer types, DATE, DATETIME, TIMESTAMP, YEAR, and + fixed-length CHAR/BINARY (binary/latin1 charset) via + decode_sort_key_part(). For unsupported types, ICP is skipped and + CHECK_POS is returned so the caller falls through to the PK lookup. +*/ +check_result_t ha_tidesdb::icp_check_secondary(const uint8_t *ik, size_t iks, uint idx, uchar *buf) +{ + if (!pushed_idx_cond || pushed_idx_cond_keyno != idx) return CHECK_POS; + + KEY *idx_key = &table->key_info[idx]; + uint idx_col_len = share->idx_comp_key_len[idx]; + bool decode_ok = true; + + /* Decode index column parts from the comparable-format key. + If any part can't be decoded (DECIMAL, VARCHAR, etc.), we fall + back to a full PK row fetch so the condition evaluates correctly. */ + const uint8_t *pos = ik; + for (uint p = 0; p < idx_key->user_defined_key_parts && decode_ok; p++) + { + KEY_PART_INFO *kp = &idx_key->key_part[p]; + Field *f = kp->field; + + if (f->real_maybe_null()) + { + if (pos >= ik + iks) + { + decode_ok = false; + break; + } + if (*pos == 0) + { + f->set_null(); + pos++; + continue; + } + f->set_notnull(); + pos++; + } + if (pos + kp->length > ik + iks) + { + decode_ok = false; + break; + } + if (!decode_sort_key_part(pos, kp->length, f, buf)) decode_ok = false; + pos += kp->length; + } + + /* Decode PK parts from the tail (pushed condition may reference PK columns). 
*/ + if (decode_ok && share->has_user_pk) + { + KEY *pk_key = &table->key_info[share->pk_index]; + pos = ik + idx_col_len; + for (uint p = 0; p < pk_key->user_defined_key_parts && decode_ok; p++) + { + KEY_PART_INFO *kp = &pk_key->key_part[p]; + Field *f = kp->field; + + if (f->real_maybe_null()) + { + if (pos >= ik + iks) + { + decode_ok = false; + break; + } + if (*pos == 0) + { + f->set_null(); + pos++; + continue; + } + f->set_notnull(); + pos++; + } + if (pos + kp->length > ik + iks) + { + decode_ok = false; + break; + } + if (!decode_sort_key_part(pos, kp->length, f, buf)) decode_ok = false; + pos += kp->length; + } + } + + if (!decode_ok) + { + /* Could not decode all key parts from the sort key (unsupported type + like DECIMAL, VARCHAR, multi-byte CHAR). Fall back to a full PK + row fetch so ALL columns are available for condition evaluation. + This is more expensive than pure ICP (still does the PK lookup) + but is correct, the server won't re-evaluate pushed conditions. */ + if (iks > idx_col_len) + { + const uchar *pk = ik + idx_col_len; + uint pk_len = (uint)(iks - idx_col_len); + if (fetch_row_by_pk(scan_txn, pk, pk_len, buf) != 0) + return CHECK_POS; /* PK lookup failed -- we accept row, let caller handle */ + } + else + { + return CHECK_POS; /* malformed key -- we accept */ + } + } + + /* Delegate to MariaDB's ICP evaluator which checks kill state, + end_range, and pushed_idx_cond->val_bool(). */ + return handler_index_cond_check(this); +} + +/* ******************** Counter recovery ******************** */ + +/* + Recover hidden-PK next_row_id from the last data key. + Also seed auto_inc_val for tables with AUTO_INCREMENT user-defined PKs + so that get_auto_increment() can return O(1) instead of doing index_last() + on every INSERT. 
+*/
+void ha_tidesdb::recover_counters()
+{
+    tidesdb_txn_t *rtxn = NULL;
+    if (tidesdb_txn_begin(tdb_global, &rtxn) != TDB_SUCCESS) return;
+
+    tidesdb_iter_t *it = NULL;
+    if (tdb_iter_new_blocking(ha_thd(), rtxn, share->cf, &it) == TDB_SUCCESS)
+    {
+        tidesdb_iter_seek_to_last(it);
+
+        /* Only act when the iterator sits on a readable data key. */
+        uint8_t *last_key = NULL;
+        size_t last_key_len = 0;
+        const bool at_data_key = tidesdb_iter_valid(it) &&
+                                 tidesdb_iter_key(it, &last_key, &last_key_len) == TDB_SUCCESS &&
+                                 is_data_key(last_key, last_key_len);
+        if (at_data_key)
+        {
+            if (!share->has_user_pk && last_key_len == KEY_NAMESPACE_LEN + HIDDEN_PK_SIZE)
+            {
+                /* Hidden PK: the key tail is the row id; continue after it. */
+                const uint64_t last_row_id = decode_be64(last_key + KEY_NAMESPACE_LEN);
+                share->next_row_id.store(last_row_id + 1, std::memory_order_relaxed);
+            }
+
+            /* The last row in PK order holds the largest AUTO_INCREMENT value
+               only when that column is the leftmost PK part.  In any other
+               arrangement the seed could be too small, so auto_inc_val is
+               left untouched and the server seeds it on demand. */
+            Field *auto_inc = table->found_next_number_field;
+            bool seed_auto_inc = false;
+            if (share->has_user_pk && auto_inc)
+            {
+                const KEY *pk = &table->key_info[share->pk_index];
+                seed_auto_inc =
+                    pk->user_defined_key_parts > 0 && pk->key_part[0].field == auto_inc;
+            }
+
+            uint8_t *row = NULL;
+            size_t row_len = 0;
+            if (seed_auto_inc && tidesdb_iter_value(it, &row, &row_len) == TDB_SUCCESS)
+            {
+                /* Unpack through deserialize_row so the column is read at its
+                   proper record offset whatever fields precede it. */
+                uchar *scratch = table->record[1];
+                if (share->has_blobs || share->encrypted)
+                {
+                    std::string row_copy((const char *)row, row_len);
+                    deserialize_row(scratch, row_copy);
+                }
+                else
+                {
+                    deserialize_row(scratch, (const uchar *)row, row_len);
+                }
+
+                /* The row lives in record[1]; read the field there via the
+                   explicit record[1] - record[0] offset. */
+                const my_ptrdiff_t rec1_off = (my_ptrdiff_t)(scratch - table->record[0]);
+                const ulonglong last_auto_inc = auto_inc->val_int_offset(rec1_off);
+                share->auto_inc_val.store(last_auto_inc, std::memory_order_relaxed);
+            }
+        }
+        tidesdb_iter_free(it);
+    }
+
+    /* A hidden-PK table whose counter is still zero starts at the first id. */
+    if (!share->has_user_pk && share->next_row_id.load(std::memory_order_relaxed) == 0)
+        share->next_row_id.store(HIDDEN_PK_FIRST_ROW_ID, std::memory_order_relaxed);
+
+    tidesdb_txn_rollback(rtxn);
+    tidesdb_txn_free(rtxn);
+}
+
+/* ******************** open / close / create ******************** */
+
+int ha_tidesdb::open(const char *name, int mode, uint test_if_locked)
+{
+    DBUG_ENTER("ha_tidesdb::open");
+
+    if (!(share = get_share())) DBUG_RETURN(1);
+
+    /*
+      We resolve CF pointers only once (first open). Subsequent opens by
+      other connections reuse the already-resolved share. We hold
+      lock_shared_ha_data() to prevent concurrent open() calls from
+      racing on the shared vectors.
+ */ + lock_shared_ha_data(); + if (!share->cf) + { + share->cf_name = path_to_cf_name(name); + share->cf = tidesdb_get_column_family(tdb_global, share->cf_name.c_str()); + if (!share->cf) + { + unlock_shared_ha_data(); + sql_print_error("[TIDESDB] CF '%s' not found for table '%s'", share->cf_name.c_str(), + name); + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + } + + if (table->s->primary_key != MAX_KEY) + { + share->has_user_pk = true; + share->pk_index = table->s->primary_key; + share->pk_key_len = comparable_key_length(&table->key_info[share->pk_index]); + } + else + { + share->has_user_pk = false; + share->pk_index = MAX_KEY; + share->pk_key_len = HIDDEN_PK_SIZE; + } + + if (TDB_TABLE_OPTIONS(table)) + { + uint iso_idx = TDB_TABLE_OPTIONS(table)->isolation_level; + if (iso_idx < array_elements(tdb_isolation_map)) + share->isolation_level = (tidesdb_isolation_level_t)tdb_isolation_map[iso_idx]; + } + + if (TDB_TABLE_OPTIONS(table)) share->default_ttl = TDB_TABLE_OPTIONS(table)->ttl; + + share->encrypted = false; + share->encryption_key_id = TIDESDB_DEFAULT_ENCRYPTION_KEY_ID; + share->encryption_key_version = 0; + if (TDB_TABLE_OPTIONS(table) && TDB_TABLE_OPTIONS(table)->encrypted) + { + share->encrypted = true; + share->encryption_key_id = (uint)TDB_TABLE_OPTIONS(table)->encryption_key_id; + uint ver = encryption_key_get_latest_version(share->encryption_key_id); + if (ver == ENCRYPTION_KEY_VERSION_INVALID) + { + sql_print_error("[TIDESDB] encryption key %u not available", + share->encryption_key_id); + DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); + } + share->encryption_key_version = ver; + } + + share->ttl_field_idx = TIDESDB_TTL_FIELD_NONE; + for (uint i = 0; i < table->s->fields; i++) + { + if (table->s->field[i]->option_struct && table->s->field[i]->option_struct->ttl) + { + share->ttl_field_idx = (int)i; + break; + } + } + + /* We cache table shape flags for hot-path short-circuiting. 
We also + capture the BLOB field indices so serialize_row's size estimate can + iterate that short list instead of every field on every INSERT. */ + share->has_blobs = false; + share->blob_field_indices.clear(); + for (uint i = 0; i < table->s->fields; i++) + { + if (table->s->field[i]->flags & BLOB_FLAG) + { + share->has_blobs = true; + share->blob_field_indices.push_back((uint16)i); + } + } + share->has_ttl = (share->default_ttl > 0 || share->ttl_field_idx >= 0); + + /* Per-field serialize/deserialize plan. For each field cache its + offset within record[0] and whether its pack format is a pure + memcpy of pack_length() bytes -- if so the hot loops skip the + Field::pack/unpack vtable dispatch entirely. + + The whitelist below covers the field types whose pack() output + is byte-identical to memcpy(pack_length()) on a little-endian + host. CHAR / VARCHAR / BLOB / GEOMETRY / JSON / BIT / DECIMAL + keep the slow path, their pack() trims trailing pad bytes, + emits a length prefix, or layers a null-bit on top. + + Field_long / Field_longlong / Field_short etc. emit data in + on-disk little-endian via the mi_int*store macros which equal + memcpy on x86_64. TIDESDB_FAST_SERDES_LE_ONLY guards the fast + path so a big-endian build cleanly falls back to Field::pack. */ + share->field_plan.clear(); + share->field_plan.reserve(table->s->fields); + share->null_bytes_cached = (uint8)table->s->null_bytes; + share->fields_cached = (uint16)table->s->fields; + share->has_no_nullable = (table->s->null_bytes == 0); + for (uint i = 0; i < table->s->fields; i++) + { + /* We use the per-instance Field (table->field[i]), not the share + prototype (table->s->field[i]). maybe_null() / real_type() + read through Field::table -- the share prototype is created + with a null table pointer and crashes on those calls. The + per-instance Field has table = this handler's TABLE and is + safe to query. 
*/ + Field *f = table->field[i]; + TidesDB_share::field_plan_t fp; + fp.src_off = (uint32)(f->ptr - table->record[0]); + fp.pack_len = (uint16)f->pack_length(); + fp.maybe_null = f->maybe_null(); + fp.memcpy_ok = false; +#ifndef WORDS_BIGENDIAN + switch (f->real_type()) + { + case MYSQL_TYPE_TINY: + case MYSQL_TYPE_SHORT: + case MYSQL_TYPE_INT24: + case MYSQL_TYPE_LONG: + case MYSQL_TYPE_LONGLONG: + case MYSQL_TYPE_FLOAT: + case MYSQL_TYPE_DOUBLE: + case MYSQL_TYPE_DATE: + case MYSQL_TYPE_NEWDATE: + case MYSQL_TYPE_TIME: + case MYSQL_TYPE_TIME2: + case MYSQL_TYPE_DATETIME: + case MYSQL_TYPE_DATETIME2: + case MYSQL_TYPE_TIMESTAMP: + case MYSQL_TYPE_TIMESTAMP2: + case MYSQL_TYPE_YEAR: + case MYSQL_TYPE_NEWDECIMAL: + fp.memcpy_ok = true; + break; + default: + fp.memcpy_ok = false; + break; + } + /* BLOB columns share MYSQL_TYPE_LONGLONG underneath in older + codepaths; never fast-path anything carrying BLOB_FLAG. */ + if (f->flags & BLOB_FLAG) fp.memcpy_ok = false; +#endif + share->field_plan.push_back(fp); + } + + /* We precompute comparable key lengths and index-type flags per index. + Caching the type flags avoids a ki->algorithm dereference per row + in write_row's dup-check loop and in update_row/delete_row. */ + for (uint i = 0; i < table->s->keys; i++) + { + share->idx_comp_key_len[i] = comparable_key_length(&table->key_info[i]); + share->idx_is_fts[i] = is_fts_index(&table->key_info[i]); + share->idx_is_spatial[i] = is_spatial_index(&table->key_info[i]); + } + + /* Precompute per-index coverage bitmaps so try_keyread_from_index is + O(set bits in read_set) instead of nested scans over key parts. 
*/ + share->idx_cover.assign(table->s->keys, std::vector(table->s->fields, false)); + for (uint i = 0; i < table->s->keys; i++) + { + const KEY *ki = &table->key_info[i]; + for (uint p = 0; p < ki->user_defined_key_parts; p++) + { + uint fnr = ki->key_part[p].fieldnr; + if (fnr > 0 && fnr - 1 < table->s->fields) share->idx_cover[i][fnr - 1] = true; + } + /* Secondary indexes also cover the PK columns appended to the key. */ + if (table->s->primary_key != MAX_KEY && i != table->s->primary_key) + { + const KEY *pk_key = &table->key_info[table->s->primary_key]; + for (uint p = 0; p < pk_key->user_defined_key_parts; p++) + { + uint fnr = pk_key->key_part[p].fieldnr; + if (fnr > 0 && fnr - 1 < table->s->fields) share->idx_cover[i][fnr - 1] = true; + } + } + } + + for (uint i = 0; i < table->s->keys; i++) + { + if (share->has_user_pk && i == share->pk_index) + { + share->idx_cfs.push_back(NULL); + share->idx_cf_names.push_back(""); + continue; + } + std::string idx_name; + tidesdb_column_family_t *icf = + resolve_idx_cf(tdb_global, share->cf_name, table->key_info[i].name.str, idx_name); + share->idx_cfs.push_back(icf); + share->idx_cf_names.push_back(idx_name); + } + + share->num_secondary_indexes = 0; + for (uint i = 0; i < share->idx_cfs.size(); i++) + if (share->idx_cfs[i]) share->num_secondary_indexes++; + + /* Allocate the per-index full-cost cache; sized once so the array + can be addressed by index number without a lock. 
*/ + if (!share->idx_cfs.empty()) + { + share->cached_idx_full_cost_n = (uint)share->idx_cfs.size(); + share->cached_idx_full_cost.reset( + new std::atomic[share->cached_idx_full_cost_n]); + share->cached_idx_full_cost_time.reset( + new std::atomic[share->cached_idx_full_cost_n]); + for (uint i = 0; i < share->cached_idx_full_cost_n; i++) + { + share->cached_idx_full_cost[i].store(0.0, std::memory_order_relaxed); + share->cached_idx_full_cost_time[i].store(0, std::memory_order_relaxed); + } + } + + /* We recover hidden-PK counter (auto-inc is derived at runtime via index_last) */ + recover_counters(); + + { + char frm_path[FN_REFLEN]; + fn_format(frm_path, name, "", reg_ext, MY_UNPACK_FILENAME | MY_APPEND_EXT); + MY_STAT st_buf; + if (mysql_file_stat(0, frm_path, &st_buf, MYF(0))) share->create_time = st_buf.st_mtime; + } + } + unlock_shared_ha_data(); + + ref_length = share->pk_key_len; + + /* We mirror shape flags onto the handler so the row-fetch hot paths + read from a local member instead of chasing `share` into shared + memory on every row. These mirror constants that never change for + the open handler. */ + has_blobs_ = share->has_blobs; + encrypted_ = share->encrypted; + + /* We precompute the record[1] pointer range so the BLOB path of + fetch_row_by_pk/iter_read_current doesn't rebuild it per row. */ + if (table->record[1]) + { + record1_lo_ = table->record[1]; + record1_hi_ = table->record[1] + table->s->reclength; + } + else + { + record1_lo_ = NULL; + record1_hi_ = NULL; + } + + DBUG_RETURN(0); +} + +int ha_tidesdb::close(void) +{ + DBUG_ENTER("ha_tidesdb::close"); + if (scan_iter) + { + tidesdb_iter_free(scan_iter); + scan_iter = NULL; + scan_iter_cf_ = NULL; + scan_iter_txn_ = NULL; + } + free_dup_iter_cache(); + /* stmt_txn is a borrowed pointer into the per-connection trx->txn. + We do not free it here -- the txn is owned by the per-connection trx + and will be freed in tidesdb_close_connection(). 
*/ + stmt_txn = NULL; + stmt_txn_dirty = false; + DBUG_RETURN(0); +} + +int ha_tidesdb::create(const char *name, TABLE *table_arg, HA_CREATE_INFO *create_info) +{ + DBUG_ENTER("ha_tidesdb::create"); + + std::string cf_name = path_to_cf_name(name); + + ha_table_option_struct *opts = TDB_TABLE_OPTIONS(table_arg); + DBUG_ASSERT(opts); + + /* Under unified-memtable mode the shared WAL's fsync behaviour is owned + by tidesdb_unified_memtable_sync_mode; the per-table SYNC_MODE option + only governs SSTable file sync (klog and vlog). Warn the user when + the two differ so they do not assume the table option controls WAL + durability for this table. */ + if (srv_unified_memtable && opts->sync_mode != srv_unified_memtable_sync_mode) + { + push_warning_printf(ha_thd(), Sql_condition::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR, + "[TIDESDB] Table SYNC_MODE=%s governs SSTable file sync only. Under " + "tidesdb_unified_memtable=ON the shared WAL is fsynced according to " + "tidesdb_unified_memtable_sync_mode=%s, so the table option does not " + "change WAL durability for this table", + sync_mode_names[opts->sync_mode], + sync_mode_names[srv_unified_memtable_sync_mode]); + } + + tidesdb_column_family_config_t cfg = build_cf_config(opts); + + /* We create main data CF (we simply skip if it already exists, e.g. crash recovery) */ + if (!tidesdb_get_column_family(tdb_global, cf_name.c_str())) + { + int rc = tidesdb_create_column_family(tdb_global, cf_name.c_str(), &cfg); + if (rc != TDB_SUCCESS) + { + sql_print_error("[TIDESDB] Failed to create CF '%s' (err=%d)", cf_name.c_str(), rc); + DBUG_RETURN(tdb_rc_to_ha(rc, "create main_cf")); + } + } + + /* Per-index USE_BTREE overrides the table-level setting. 
*/ + for (uint i = 0; i < table_arg->s->keys; i++) + { + if (table_arg->s->primary_key != MAX_KEY && i == table_arg->s->primary_key) continue; + + std::string idx_cf = cf_name + CF_INDEX_INFIX + table_arg->key_info[i].name.str; + if (!tidesdb_get_column_family(tdb_global, idx_cf.c_str())) + { + tidesdb_column_family_config_t idx_cfg = cfg; + ha_index_option_struct *iopts = table_arg->key_info[i].option_struct; + if (iopts) idx_cfg.use_btree = iopts->use_btree ? 1 : 0; + + int rc = tidesdb_create_column_family(tdb_global, idx_cf.c_str(), &idx_cfg); + if (rc != TDB_SUCCESS) + { + sql_print_error("[TIDESDB] Failed to create index CF '%s' (err=%d)", idx_cf.c_str(), + rc); + DBUG_RETURN(tdb_rc_to_ha(rc, "create idx_cf")); + } + } + } + + /* We store .frm in schema CF for object store discovery. + When discover_table is registered, MariaDB skips writing .frm to disk + and provides it via TABLE_SHARE::frm_image instead. */ + if (table_arg->s->frm_image) + schema_cf_store_frm(name, table_arg->s->frm_image->str, table_arg->s->frm_image->length); + else + schema_cf_store_frm(name); + + DBUG_RETURN(0); +} + +/* ******************** Data-at-rest encryption helpers ******************** */ + +/* + Encrypt plaintext into out. The on-disk blob is the 4-byte little-endian + key version, then the 16-byte IV, then the ciphertext. Storing the key + version lets tidesdb_decrypt_row recover the exact key a row was written + under, so encrypted rows remain readable across a key rotation. +*/ +static bool tidesdb_encrypt_row_into(const std::string &plain, uint key_id, uint key_version, + std::string &out) +{ + unsigned char key[TIDESDB_ENC_KEY_LEN]; + unsigned int klen = sizeof(key); + /* Fail closed if the keyring cannot satisfy the request (missing version, + buffer too small, plugin not loaded). Without this check the local key + buffer holds uninitialized stack bytes and encryption_crypt would + proceed as if the request had succeeded, producing rows nobody can + decrypt. 
*/ + if (encryption_key_get(key_id, key_version, key, &klen) != 0) + { + sql_print_error("[TIDESDB] encryption_key_get failed for key_id=%u version=%u", key_id, + key_version); + out.clear(); + return false; + } + + unsigned char iv[TIDESDB_ENC_IV_LEN]; + my_random_bytes(iv, TIDESDB_ENC_IV_LEN); + + unsigned int slen = (unsigned int)plain.size(); + unsigned int enc_len = encryption_encrypted_length(slen, key_id, key_version); + out.resize(TIDESDB_ENC_VERSION_LEN + TIDESDB_ENC_IV_LEN + enc_len); + + int4store(&out[0], (uint32)key_version); + memcpy(&out[TIDESDB_ENC_VERSION_LEN], iv, TIDESDB_ENC_IV_LEN); + + unsigned int dlen = enc_len; + int rc = encryption_crypt((const unsigned char *)plain.data(), slen, + (unsigned char *)&out[TIDESDB_ENC_VERSION_LEN + TIDESDB_ENC_IV_LEN], + &dlen, key, klen, iv, TIDESDB_ENC_IV_LEN, ENCRYPTION_FLAG_ENCRYPT, + key_id, key_version); + if (rc != 0) + { + sql_print_error("[TIDESDB] encryption_crypt(encrypt) failed rc=%d", rc); + out.clear(); + return false; + } + out.resize(TIDESDB_ENC_VERSION_LEN + TIDESDB_ENC_IV_LEN + dlen); + return true; +} + +/* + Decrypt a row stored as [key version (4)] [IV (16)] [ciphertext]. The key + version is read back from the blob so a row encrypted before a key rotation + is decrypted with the key it was actually written under, not the latest. +*/ +static std::string tidesdb_decrypt_row(const char *data, size_t len, uint key_id) +{ + if (len <= TIDESDB_ENC_VERSION_LEN + TIDESDB_ENC_IV_LEN) + { + sql_print_error("[TIDESDB] encrypted row too short (%zu bytes)", len); + return std::string(); /* signal failure */ + } + + uint key_version = (uint)uint4korr(data); + + unsigned char key[TIDESDB_ENC_KEY_LEN]; + unsigned int klen = sizeof(key); + /* Fail closed if the keyring cannot return the version this row was + written under (rotated-out key, plugin not loaded, version never + existed). 
Falling through with an uninitialized key buffer would + feed garbage into encryption_crypt and silently corrupt the + deserialize path. */ + if (encryption_key_get(key_id, key_version, key, &klen) != 0) + { + sql_print_error("[TIDESDB] encryption_key_get failed for key_id=%u version=%u", key_id, + key_version); + return std::string(); /* signal failure to caller */ + } + + const unsigned char *iv = (const unsigned char *)data + TIDESDB_ENC_VERSION_LEN; + const unsigned char *src = + (const unsigned char *)data + TIDESDB_ENC_VERSION_LEN + TIDESDB_ENC_IV_LEN; + unsigned int slen = (unsigned int)(len - TIDESDB_ENC_VERSION_LEN - TIDESDB_ENC_IV_LEN); + + std::string out; + unsigned int dlen = slen + TIDESDB_ENC_KEY_LEN; /* padding slack */ + out.resize(dlen); + + int rc = encryption_crypt(src, slen, (unsigned char *)&out[0], &dlen, key, klen, iv, + TIDESDB_ENC_IV_LEN, ENCRYPTION_FLAG_DECRYPT, key_id, key_version); + if (rc != 0) + { + sql_print_error("[TIDESDB] encryption_crypt(decrypt) failed rc=%d", rc); + return std::string(); /* signal failure */ + } + out.resize(dlen); + return out; +} + +/* ******************** serialize / deserialize (BLOB deep-copy) ******************** */ + +/* Row format header constants live in ha_tidesdb.h so the stop-word + loader and other callers can reference them without forward decls. + Layout is [ROW_HEADER_MAGIC] [null_bytes_stored (2 LE)] [field_count (2 LE)] + for ROW_HEADER_SIZE bytes total. Enables instant ADD/DROP COLUMN. */ + +const std::string &ha_tidesdb::serialize_row(const uchar *buf) +{ + my_ptrdiff_t ptrdiff = (my_ptrdiff_t)(buf - table->record[0]); + + /* Upper-bound packed size. For non-BLOB tables the estimate is constant + (header + null_bytes + reclength + 2 bytes per field for length-prefix + overhead from Field_string::pack). Cache it to avoid recomputing on + every row. For BLOB tables we must add the actual blob data sizes. 
*/ + size_t est = share->cached_row_est; + if (unlikely(est == 0)) + { + est = ROW_HEADER_SIZE + table->s->null_bytes + table->s->reclength + + FIELD_VARCHAR_LEN_PREFIX * table->s->fields; + if (!share->has_blobs) + share->cached_row_est = est; /* safe to cache -- constant for non-BLOB tables */ + } + if (share->has_blobs) + { + /* Walk only the precomputed BLOB field list instead of every field. */ + for (uint16 idx : share->blob_field_indices) + { + Field *f = table->field[idx]; + if (f->is_real_null(ptrdiff)) continue; + Field_blob *blob = (Field_blob *)f; + est += blob->get_length(buf + (uintptr_t)(f->ptr - table->record[0])); + } + } + + row_buf_.resize(est); + uchar *start = (uchar *)&row_buf_[0]; + uchar *pos = start; + + /* Row header -- enables instant ADD/DROP COLUMN by recording the + null bitmap size and field count at write time. */ + *pos++ = ROW_HEADER_MAGIC; + const uint nb = share->null_bytes_cached; + const uint nf = share->fields_cached; + int2store(pos, (uint16)nb); + pos += sizeof(uint16); + int2store(pos, (uint16)nf); + pos += sizeof(uint16); + + /* Null bitmap */ + if (nb) memcpy(pos, buf, nb); + pos += nb; + + /* We pack each non-null field. We use a precomputed per-field plan + (built once at open()) so the hot path skips the Field::pack vtable + dispatch for fields whose pack format is a pure memcpy of + pack_length() bytes -- integers, fixed-precision datetimes, + NEWDECIMAL, FLOAT, DOUBLE. CHAR / VARCHAR / BLOB still go through + Field::pack because their format trims pad bytes or emits a length + prefix. The plan also caches `f->ptr - record[0]` so that + subtraction does not run per row. + + When the table has no nullable fields (share->has_no_nullable), + skip the per-field real_maybe_null branch entirely. 
*/ + const TidesDB_share::field_plan_t *plan = share->field_plan.data(); + const bool all_not_null = share->has_no_nullable; + for (uint i = 0; i < nf; i++) + { + const TidesDB_share::field_plan_t &fp = plan[i]; + if (!all_not_null && fp.maybe_null) + { + if (table->field[i]->is_real_null(ptrdiff)) continue; + } + const uchar *src = buf + fp.src_off; + if (fp.memcpy_ok) + { + memcpy(pos, src, fp.pack_len); + pos += fp.pack_len; + } + else + { + pos = table->field[i]->pack(pos, src); + } + } + + row_buf_.resize((size_t)(pos - start)); + + if (share->encrypted) + { + /* We cache the encryption key version per-statement to avoid the + expensive encryption_key_get_latest_version() syscall on every + single row. The cache is invalidated at statement start + (enc_key_ver_valid_ = false in external_lock). */ + if (!enc_key_ver_valid_) + { + uint cur_ver = encryption_key_get_latest_version(share->encryption_key_id); + if (cur_ver != ENCRYPTION_KEY_VERSION_INVALID) + { + share->encryption_key_version = cur_ver; + cached_enc_key_ver_ = cur_ver; + } + else + { + cached_enc_key_ver_ = share->encryption_key_version; + } + enc_key_ver_valid_ = true; + } + /* We encrypt into enc_buf_ instead of replacing row_buf_, so that + row_buf_'s heap capacity is preserved across calls. + Writing directly into enc_buf_ reuses its heap capacity across rows, + avoiding a per-row allocation when the encrypted size is stable. 
*/ + if (!tidesdb_encrypt_row_into(row_buf_, share->encryption_key_id, cached_enc_key_ver_, + enc_buf_)) + { + enc_buf_.clear(); /* signal failure */ + } + return enc_buf_; + } + + return row_buf_; +} + +void ha_tidesdb::deserialize_row(uchar *buf, const uchar *data, size_t len) +{ + const uchar *from = data; + const uchar *from_end = data + len; + + /* All rows have the header([0xFE] [null_bytes(2)] [field_count(2)]) */ + if (unlikely(len < ROW_HEADER_SIZE || data[0] != ROW_HEADER_MAGIC)) + { + /* Corrupted or truncated row, we zero the record to avoid garbage */ + memset(buf, 0, table->s->reclength); + return; + } + + from++; + uint stored_null_bytes = uint2korr(from); + from += sizeof(uint16); + uint stored_fields = uint2korr(from); + from += sizeof(uint16); + + /* Null bitmap -- we copy the smaller of stored vs current. + When columns were added (stored_null_bytes < table->s->null_bytes), + fill the extra null bitmap bytes from the table's default record + so that new columns inherit their correct DEFAULT / NOT NULL state + rather than blindly marking them NULL. */ + if ((size_t)(from_end - from) < stored_null_bytes) return; + const uint cur_nb = share->null_bytes_cached; + uint copy_nb = MY_MIN(stored_null_bytes, cur_nb); + if (copy_nb) memcpy(buf, from, copy_nb); + if (copy_nb < cur_nb) + memcpy(buf + copy_nb, table->s->default_values + copy_nb, cur_nb - copy_nb); + from += stored_null_bytes; + + /* We unpack. Only unpack up to MIN(stored_fields, current_fields). + If the row has more fields than the current schema (DROP COLUMN), + the extra packed data is simply skipped. + If the row has fewer fields (ADD COLUMN), fill the missing fields + from the table's default record so they get their DEFAULT value. */ + const uint cur_nf = share->fields_cached; + uint unpack_count = MY_MIN(stored_fields, cur_nf); + + /* Pre-fill default values for columns added after this row was written. 
+ Copy each new field's bytes from default_values into buf so that + they have the correct DEFAULT even when the field is NOT NULL. */ + if (stored_fields < cur_nf) + { + const TidesDB_share::field_plan_t *plan_d = share->field_plan.data(); + for (uint i = stored_fields; i < cur_nf; i++) + { + const TidesDB_share::field_plan_t &fp = plan_d[i]; + memcpy(buf + fp.src_off, table->s->default_values + fp.src_off, fp.pack_len); + } + } + + /* memcpy_ok fields write directly to `to` via memcpy, so they never + need move_field_offset. The slow-path branch covers CHAR / VARCHAR + / BLOB; only Field_blob::unpack writes through field->ptr (via + set_ptr), so we only pay the virtual move_field_offset pair when + the destination buffer is not record[0] AND the field needs the + slow path. buf == record[0] (ptrdiff == 0) is the common case + for index scans and PK reads, so the loop avoids the vcall pair + entirely there. */ + const my_ptrdiff_t ptrdiff = (my_ptrdiff_t)(buf - table->record[0]); + const TidesDB_share::field_plan_t *plan = share->field_plan.data(); + const bool all_not_null = share->has_no_nullable; + for (uint i = 0; i < unpack_count; i++) + { + const TidesDB_share::field_plan_t &fp = plan[i]; + if (!all_not_null && fp.maybe_null) + { + if (table->field[i]->is_real_null(ptrdiff)) continue; + } + if (from >= from_end) break; + uchar *to = buf + fp.src_off; + if (fp.memcpy_ok) + { + if (from + fp.pack_len > from_end) break; + memcpy(to, from, fp.pack_len); + from += fp.pack_len; + } + else + { + Field *f = table->field[i]; + const uchar *next; + if (ptrdiff != 0) + { + f->move_field_offset(ptrdiff); + next = f->unpack(to, from, from_end); + f->move_field_offset(-ptrdiff); + } + else + { + next = f->unpack(to, from, from_end); + } + if (!next) break; + from = next; + } + } +} + +void ha_tidesdb::deserialize_row(uchar *buf, const std::string &row) +{ + const std::string *plain = &row; + std::string decrypted; + + if (share->encrypted) + { + decrypted = 
tidesdb_decrypt_row(row.data(), row.size(), share->encryption_key_id); + if (decrypted.empty()) + { + /* Decryption failed! we zero record to avoid returning garbage */ + memset(buf, 0, table->s->reclength); + return; + } + last_row = std::move(decrypted); + plain = &last_row; + } + + deserialize_row(buf, (const uchar *)plain->data(), plain->size()); +} + +/* ******************** fetch_row_by_pk ******************** */ + +/* + Point-lookup a row by its PK bytes (without namespace prefix). + Sets current_pk + last_row. Returns 0, HA_ERR_KEY_NOT_FOUND, + or HA_ERR_LOCK_DEADLOCK (on TDB_ERR_CONFLICT). +*/ +int ha_tidesdb::fetch_row_by_pk(tidesdb_txn_t *txn, const uchar *pk, uint pk_len, uchar *buf) +{ + /* Pessimistic row lock for point reads. Covers both the direct PK + lookup path (HA_READ_KEY_EXACT) and the secondary-index resolved-PK + path (sec idx returns [prefix][pk]; caller passes the suffix here). + Mode is X for write-intent, S under RR/SR for plain reads; RC/SI + reads take no lock (snapshot suffices). Re-entrant -- a no-op when + the caller already holds the lock in a compatible-or-stronger mode. 
*/
+    if (unlikely(srv_pessimistic_locking) && cached_trx_)
+    {
+        tdb_lock_mode_t mode;
+        if (tdb_lock_mode_for_read(cached_thd_, stmt_has_write_lock_, &mode))
+        {
+            int lrc = row_lock_acquire(cached_trx_, pk, pk_len, cached_thd_, mode);
+            if (lrc) return lrc;
+        }
+    }
+
+    /* Build the data-CF key for this PK and do the point read. */
+    uchar dk[DATA_KEY_BUF_LEN];
+    uint dk_len = build_data_key(pk, pk_len, dk);
+
+    uint8_t *value = NULL;
+    size_t value_size = 0;
+    int rc = tidesdb_txn_get(txn, share->cf, dk, dk_len, &value, &value_size);
+    if (rc == TDB_ERR_NOT_FOUND) return HA_ERR_KEY_NOT_FOUND;
+    if (rc != TDB_SUCCESS) return tdb_rc_to_ha(rc, "fetch_row_by_pk");
+
+    if (likely(!has_blobs_ && !encrypted_))
+    {
+        /* Zero-copy path, we deserialize directly from API buffer */
+        deserialize_row(buf, (const uchar *)value, value_size);
+        tidesdb_free(value);
+    }
+    else
+    {
+        /* For BLOB tables, Field_blob::unpack() stores pointers into the
+           source buffer. These pointers must remain valid until the next
+           fetch into the SAME record buffer. The MariaDB handler API
+           (e.g., mhnsw vector index maintenance) may interleave reads into
+           record[0] and record[1], so we maintain two backing buffers:
+           last_row for record[0] fetches, last_row2 for record[1] fetches.
+           This prevents a fetch into record[1] from invalidating BLOB
+           pointers that record[0] still references.
+
+           We identify record[1] using the precomputed bounds set in open().
+
+           This branch is also taken for encrypted tables; the value is
+           copied into the backing string and handed to the std::string
+           overload of deserialize_row. */
+        bool is_rec1 = record1_lo_ && buf >= record1_lo_ && buf < record1_hi_;
+        std::string &backing = is_rec1 ? last_row2 : last_row;
+        backing.assign((const char *)value, value_size);
+        tidesdb_free(value);
+        deserialize_row(buf, backing);
+    }
+    /* Remember the PK of the row just read. */
+    memcpy(current_pk_buf_, pk, pk_len);
+    current_pk_len_ = pk_len;
+
+    return 0;
+}
+
+/* ******************** compute_row_ttl ******************** */
+
+/*
+  Compute the absolute expiry timestamp for the row image in `buf`.
+
+  The TTL in seconds is taken from the first source that yields a value
+  greater than zero:
+    1. the per-row TTL column (share->ttl_field_idx >= 0 and not NULL),
+    2. the session TTL cached in cached_sess_ttl_,
+    3. the table-level default share->default_ttl.
+  Returns TIDESDB_TTL_NONE when no source applies, otherwise
+  cached_time_ + ttl_seconds, where cached_time_ is a cached time(NULL).
+*/
+time_t ha_tidesdb::compute_row_ttl(const uchar *buf)
+{
+    long long ttl_seconds = 0;
+
+    if (share->ttl_field_idx >= 0)
+    {
+        Field *f = table->field[share->ttl_field_idx];
+        my_ptrdiff_t ptrdiff = (my_ptrdiff_t)(buf - table->record[0]);
+        if (!f->is_real_null(ptrdiff))
+        {
+            /* Shift the field onto buf for the read, then shift it back. */
+            f->move_field_offset(ptrdiff);
+            ttl_seconds = f->val_int();
+            f->move_field_offset(-ptrdiff);
+        }
+    }
+
+    /* Session TTL override, we use cached value to avoid THDVAR + ha_thd()
+       on every row. The cache is populated once per statement in write_row
+       / update_row and invalidated in external_lock(F_UNLCK).
+       It only applies when the per-row column gave no positive value. */
+    if (ttl_seconds <= 0)
+    {
+        if (cached_sess_ttl_ > 0) ttl_seconds = (long long)cached_sess_ttl_;
+    }
+
+    if (ttl_seconds <= 0 && share->default_ttl > 0) ttl_seconds = (long long)share->default_ttl;
+
+    if (ttl_seconds <= 0) return TIDESDB_TTL_NONE;
+
+    /* We use cached time(NULL) to avoid the vDSO/syscall per row.
+       One-second granularity is more than sufficient for TTL. The cached
+       value is reused for as long as cached_time_valid_ stays set; it is
+       not cleared in this function.
+       NOTE(review): cached_time_ + ttl_seconds is not range-checked; a very
+       large per-row TTL could overflow -- confirm the column is bounded. */
+    if (!cached_time_valid_)
+    {
+        cached_time_ = time(NULL);
+        cached_time_valid_ = true;
+    }
+
+    return (time_t)(cached_time_ + ttl_seconds);
+}
+
+/* ******************** iter_read_current ******************** */
+
+/*
+  Read the current iterator position in the main data CF.
+  Skips non-data keys (meta keys). Sets current_pk + last_row.
+  Does not advance the iterator.
+*/
+int ha_tidesdb::iter_read_current(uchar *buf)
+{
+    while (scan_iter && tidesdb_iter_valid(scan_iter))
+    {
+        uint8_t *key = NULL;
+        size_t key_size = 0;
+        uint8_t *value = NULL;
+        size_t value_size = 0;
+        if (tidesdb_iter_key_value(scan_iter, &key, &key_size, &value, &value_size) != TDB_SUCCESS)
+            return HA_ERR_END_OF_FILE;
+
+        if (!is_data_key(key, key_size))
+        {
+            tidesdb_iter_next(scan_iter);
+            continue;
+        }
+
+        current_pk_len_ = (uint)(key_size - KEY_NAMESPACE_LEN);
+        memcpy(current_pk_buf_, key + KEY_NAMESPACE_LEN, current_pk_len_);
+
+        /* Pessimistic row lock for range/prefix scans.
Mode chosen by + write-intent + session isolation; covers SELECT ... FOR UPDATE + plus plain SELECT under RR/SR. + + Under UPDATE/DELETE we deliberately skip the lock here so a + secondary-index scan with ICP does not X-lock every PK it walks + past during filtering; update_row/delete_row reacquire on the + row they actually mutate. SELECT ... FOR UPDATE keeps the + per-row lock because the SQL semantics require locking every + row the cursor exposes, even rows the client never reads. */ + if (unlikely(srv_pessimistic_locking) && cached_trx_ && !stmt_is_update_or_delete_) + { + tdb_lock_mode_t mode; + if (tdb_lock_mode_for_read(cached_thd_, stmt_has_write_lock_, &mode)) + { + int lrc = row_lock_acquire(cached_trx_, current_pk_buf_, current_pk_len_, + cached_thd_, mode); + if (lrc) return lrc; + } + } + + if (likely(!has_blobs_ && !encrypted_)) + { + deserialize_row(buf, (const uchar *)value, value_size); + } + else + { + bool is_rec1 = record1_lo_ && buf >= record1_lo_ && buf < record1_hi_; + std::string &backing = is_rec1 ? last_row2 : last_row; + backing.assign((const char *)value, value_size); + deserialize_row(buf, backing); + } + return 0; + } + return HA_ERR_END_OF_FILE; +} + +/* ******************** write_row (INSERT) ******************** */ + +int ha_tidesdb::write_row(const uchar *buf) +{ + DBUG_ENTER("ha_tidesdb::write_row"); + + /* We need all columns readable for PK extraction, secondary index + key building, serialization, and TTL computation. */ + MY_BITMAP *old_map = tmp_use_all_columns(table, &table->read_set); + + bool pk_auto_generated = false; /* true when PK was auto-generated (guaranteed unique) */ + if (table->next_number_field && buf == table->record[0]) + { + /* If the PK field is 0/NULL, MariaDB's update_auto_increment() will + generate a unique value from our atomic counter. We can skip the + expensive PK uniqueness point-get in that case. + Only safe when the auto-inc field is the ENTIRE PK (single-column). 
+ For composite PKs, auto-inc only guarantees uniqueness within + the auto-inc column, not the full composite key. */ + if (table->next_number_field->val_int() == 0 && share->has_user_pk && + table->key_info[share->pk_index].user_defined_key_parts == 1) + pk_auto_generated = true; + int ai_err = update_auto_increment(); + if (ai_err) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(ai_err); + } + /* We keep the shared counter ahead of any explicitly-supplied value + so that future auto-generated values don't collide. */ + ulonglong val = table->next_number_field->val_int(); + ulonglong cur = share->auto_inc_val.load(std::memory_order_relaxed); + while (val > cur) + { + if (share->auto_inc_val.compare_exchange_weak(cur, val, std::memory_order_relaxed)) + break; + } + } + + uchar pk[MAX_KEY_LENGTH]; + uint pk_len; + if (share->has_user_pk) + { + pk_len = pk_from_record(buf, pk); + } + else + { + /* Hidden PK -- we generate next row-id */ + uint64_t row_id = share->next_row_id.fetch_add(1, std::memory_order_relaxed); + encode_be64(row_id, pk); + pk_len = HIDDEN_PK_SIZE; + } + + uchar dk[DATA_KEY_BUF_LEN]; + uint dk_len = build_data_key(pk, pk_len, dk); + + const std::string &row_data = serialize_row(buf); + if (share->encrypted && row_data.empty()) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(HA_ERR_GENERIC); + } + const uint8_t *row_ptr = (const uint8_t *)row_data.data(); + size_t row_len = row_data.size(); + + /* Lazy txn -- we ensure stmt_txn exists on first data access */ + { + int erc = ensure_stmt_txn(); + if (erc) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(erc); + } + } + tidesdb_txn_t *txn = stmt_txn; + stmt_txn_dirty = true; + + /* We use cached pointers from external_lock to avoid per-row overhead. */ + tidesdb_trx_t *trx = cached_trx_; + if (trx) + { + trx->dirty = true; + } + + /* We acquire pessimistic row lock for INSERT when pessimistic_locking=ON. 
+ Without this, INSERT bypasses locks held by SELECT ... FOR UPDATE, + UPDATE, and DELETE on the same PK -- breaking the serialization + guarantee that pessimistic locking is supposed to provide. + We use the comparable PK bytes (pk, pk_len) which are the same key + format used by index_read_map() for lock acquisition. */ + if (unlikely(srv_pessimistic_locking) && share->has_user_pk && trx) + { + int lrc = row_lock_acquire(trx, pk, pk_len, cached_thd_, TDB_LOCK_MODE_X); + if (lrc) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(lrc); + } + } + + /* We cache THDVAR lookups once per statement. */ + if (!cached_thdvars_valid_) + { + cached_skip_unique_ = THDVAR(cached_thd_, skip_unique_check); + cached_sess_ttl_ = THDVAR(cached_thd_, ttl); + cached_single_delete_primary_ = THDVAR(cached_thd_, single_delete_primary); + cached_thdvars_valid_ = true; + } + + /* We check PK uniqueness before inserting (TidesDB put overwrites silently). + IODKU needs HA_ERR_FOUND_DUPP_KEY so the server can run the UPDATE clause. + REPLACE INTO also needs it when secondary indexes exist (old index entries + must be cleaned up via delete+reinsert). When write_can_replace_ is set + and the table has no secondary indexes, we skip the dup check entirely -- + tidesdb_txn_put will overwrite the old value, which is exactly what REPLACE + wants, saving a full point-lookup per row. + SET SESSION tidesdb_skip_unique_check=1 (bulk load) also bypasses this. + When the PK was auto-generated by our O(1) atomic counter, the value is + guaranteed unique (seeded from max existing value) -- skip the point-get. + The auto-generated guarantee covers only the primary key, so it must + never skip the UNIQUE secondary-index check further down. 
*/ + bool skip_pk_unique = cached_skip_unique_ || pk_auto_generated; + if (share->has_user_pk && !skip_pk_unique && + !(write_can_replace_ && share->num_secondary_indexes == 0)) + { + uint8_t *dup_val = NULL; + size_t dup_len = 0; + int grc = tidesdb_txn_get(txn, share->cf, dk, dk_len, &dup_val, &dup_len); + if (grc == TDB_SUCCESS) + { + tidesdb_free(dup_val); + errkey = lookup_errkey = share->pk_index; + memcpy(dup_ref, pk, pk_len); + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(HA_ERR_FOUND_DUPP_KEY); + } + if (grc != TDB_ERR_NOT_FOUND) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(tdb_rc_to_ha(grc, "write_row pk_dup_check")); + } + } + + /* We check UNIQUE secondary index uniqueness. This honours the + explicit tidesdb_skip_unique_check session contract but never the + pk_auto_generated optimization, which only proves the primary key is + unique and tells us nothing about secondary unique values. + Cached dup-check iterators avoid the catastrophically expensive + tidesdb_iter_new() (O(num_sstables) merge-heap construction) on + every single INSERT. The iterator per unique index is created + once and reused via seek() across rows within the same txn. */ + if (share->num_secondary_indexes > 0 && !cached_skip_unique_) + { + /* trx already cached at top of write_row */ + uint64_t cur_gen = trx ? trx->txn_generation : 0; + + for (uint i = 0; i < table->s->keys; i++) + { + if (share->has_user_pk && i == share->pk_index) continue; + if (i >= share->idx_cfs.size() || !share->idx_cfs[i]) continue; + if (share->idx_is_fts[i] || share->idx_is_spatial[i]) continue; + if (!(table->key_info[i].flags & HA_NOSAME)) continue; + + uchar idx_prefix[MAX_KEY_LENGTH]; + uint idx_prefix_len = make_comparable_key( + &table->key_info[i], buf, table->key_info[i].user_defined_key_parts, idx_prefix); + + /* Pessimistic row lock on the UNIQUE-secondary prefix. 
Without + this, the dup-check below uses the txn's MVCC view and two + concurrent INSERTs of the same unique value can both pass + the check and both commit, producing a logical UNIQUE + violation. Locking the prefix serialises the check+put on + the same value across writers. */ + if (unlikely(srv_pessimistic_locking) && trx) + { + int lrc = + row_lock_acquire(trx, idx_prefix, idx_prefix_len, cached_thd_, TDB_LOCK_MODE_X); + if (lrc) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(lrc); + } + } + + /* We get or create cached dup-check iterator for this index. + Invalidate if the txn changed (commit/reset frees txn ops + that the iterator's MERGE_SOURCE_TXN_OPS depends on). */ + tidesdb_iter_t *dup_iter = dup_iter_cache_[i]; + if (dup_iter && (dup_iter_txn_[i] != txn || dup_iter_txn_gen_[i] != cur_gen)) + { + tidesdb_iter_free(dup_iter); + dup_iter = NULL; + dup_iter_cache_[i] = NULL; + } + if (!dup_iter) + { + { + int irc = tdb_iter_new_blocking(ha_thd(), txn, share->idx_cfs[i], &dup_iter); + if (irc != TDB_SUCCESS || !dup_iter) + { + /* Iterator creation failed, thus cannot safely skip the + uniqueness check or we risk silent UNIQUE violations. + Propagate the error to the caller. 
*/ + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(tdb_rc_to_ha(irc, "write_row dup_iter_new")); + } + } + dup_iter_cache_[i] = dup_iter; + dup_iter_txn_[i] = txn; + dup_iter_txn_gen_[i] = cur_gen; + dup_iter_count_++; + } + + tidesdb_iter_seek(dup_iter, idx_prefix, idx_prefix_len); + if (tidesdb_iter_valid(dup_iter)) + { + uint8_t *fk = NULL; + size_t fks = 0; + if (tidesdb_iter_key(dup_iter, &fk, &fks) == TDB_SUCCESS && fks >= idx_prefix_len && + memcmp(fk, idx_prefix, idx_prefix_len) == 0) + { + /* We extract PK suffix from the index key for dup_ref */ + size_t dup_pk_len = fks - idx_prefix_len; + if (dup_pk_len > 0 && dup_pk_len <= ref_length) + memcpy(dup_ref, fk + idx_prefix_len, dup_pk_len); + errkey = lookup_errkey = i; + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(HA_ERR_FOUND_DUPP_KEY); + } + } + } + } + + /* We compute TTL when the table has TTL configured or the session overrides it. + Uses cached_sess_ttl_ to avoid THDVAR + ha_thd() per row. */ + time_t row_ttl = + (share->has_ttl || cached_sess_ttl_ > 0) ? compute_row_ttl(buf) : TIDESDB_TTL_NONE; + + int rc = + tdb_txn_put_blocking(cached_thd_, txn, share->cf, dk, dk_len, row_ptr, row_len, row_ttl); + if (rc != TDB_SUCCESS) goto err; + + memcpy(current_pk_buf_, pk, pk_len); + current_pk_len_ = pk_len; + /* We maintain all secondary indexes in a single consolidated loop. + Loop invariants are hoisted to avoid redundant pointer dereferences + per iteration. Regular, FTS, and spatial indexes are dispatched + inline to eliminate 2/3 of loop overhead vs 3 separate loops. 
*/ + if (share->num_secondary_indexes > 0) + { + const uint num_keys = table->s->keys; + const bool has_user_pk = share->has_user_pk; + const uint pk_index = share->pk_index; + const size_t idx_cfs_sz = share->idx_cfs.size(); + + for (uint i = 0; i < num_keys; i++) + { + if (has_user_pk && i == pk_index) continue; + if (i >= idx_cfs_sz || !share->idx_cfs[i]) continue; + + const KEY *ki = &table->key_info[i]; + + if (ki->algorithm == HA_KEY_ALG_FULLTEXT) + { + /* FTS index maintenance */ + CHARSET_INFO *fts_cs = ki->key_part[0].field->charset(); + std::vector fts_tokens; + fts_extract_and_tokenize(table, ki, buf, fts_cs, fts_tokens); + + std::unordered_map tf_map; + for (auto &tok : fts_tokens) tf_map[tok.word]++; + uint32 word_count = (uint32)fts_tokens.size(); + + for (auto &kv : tf_map) + { + const auto &term = kv.first; + auto &tf = kv.second; + uchar fk[FTS_KEY_BUF_LEN]; + uint fk_len = fts_build_key(term.data(), (uint)term.size(), pk, pk_len, fk); + uchar fv[FTS_VALUE_LEN]; + fts_build_value(tf, word_count, fv); + rc = tdb_txn_put_blocking(cached_thd_, txn, share->idx_cfs[i], fk, fk_len, fv, + FTS_VALUE_LEN, row_ttl); + if (rc != TDB_SUCCESS) goto err; + } + + trx_fts_meta_accumulate(trx, share->cf, i, FTS_DOC_DELTA_ADD, (int64_t)word_count); + } + else if (is_spatial_index(ki)) + { + /* Spatial index maintenance */ + Field *geom_field = ki->key_part[0].field; + my_ptrdiff_t ptd = (my_ptrdiff_t)(buf - table->record[0]); + if (ptd) geom_field->move_field_offset(ptd); + String geom_str; + geom_field->val_str(&geom_str, &geom_str); + if (ptd) geom_field->move_field_offset(-ptd); + + double xmin, ymin, xmax, ymax; + if (geom_str.length() > 0 && + spatial_compute_mbr((const uchar *)geom_str.ptr(), geom_str.length(), &xmin, + &ymin, &xmax, &ymax)) + { + double cx = (xmin + xmax) / MBR_CENTROID_DIV; + double cy = (ymin + ymax) / MBR_CENTROID_DIV; + uchar sk[SPATIAL_HILBERT_KEY_LEN + MAX_KEY_LENGTH]; + uint sk_len = spatial_build_key(cx, cy, pk, pk_len, sk); + uchar 
sv[SPATIAL_MBR_VALUE_LEN]; + spatial_build_value(xmin, ymin, xmax, ymax, sv); + rc = tdb_txn_put_blocking(cached_thd_, txn, share->idx_cfs[i], sk, sk_len, sv, + SPATIAL_MBR_VALUE_LEN, row_ttl); + if (rc != TDB_SUCCESS) goto err; + } + } + else + { + /* Regular secondary index maintenance */ + uchar ik[SEC_IDX_KEY_BUF_LEN]; + uint ik_len = sec_idx_key(i, buf, ik); + rc = tdb_txn_put_blocking(cached_thd_, txn, share->idx_cfs[i], ik, ik_len, + &tdb_empty_val, sizeof(tdb_empty_val), row_ttl); + if (rc != TDB_SUCCESS) goto err; + } + } + } + + /* We track ops for bulk insert batching (1 data + N secondary index puts) */ + if (in_bulk_insert_) + { + bulk_insert_ops_ += 1 + share->num_secondary_indexes; + if (bulk_insert_ops_ >= TIDESDB_BULK_INSERT_BATCH_OPS) + { + int mrc = maybe_bulk_commit(trx); + if (mrc) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(mrc); + } + bulk_insert_ops_ = 0; + } + } + + /* Commit happens in external_lock(F_UNLCK). */ + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(0); + +err: + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(tdb_rc_to_ha(rc, "write_row")); +} + +/* ******************** AUTO_INCREMENT (O(1) atomic counter) ******************** */ + +/* + Override the default get_auto_increment() which calls index_last() on every + single auto-commit INSERT. That creates and destroys a TidesDB merge-heap + iterator each time -- O(N sources). Instead, we maintain an in-memory atomic + counter on TidesDB_share that is seeded once from the table data at open time + and atomically incremented thereafter -- O(1). +*/ +void ha_tidesdb::get_auto_increment(ulonglong offset, ulonglong increment, + ulonglong nb_desired_values, ulonglong *first_value, + ulonglong *nb_reserved_values) +{ + DBUG_ENTER("ha_tidesdb::get_auto_increment"); + + /* Atomic fetch-and-add -- each caller gets a unique range. + The counter stores the last value that was handed out. 
*/ + ulonglong cur = share->auto_inc_val.load(std::memory_order_relaxed); + ulonglong next; + do + { + next = cur + nb_desired_values; + } while (!share->auto_inc_val.compare_exchange_weak(cur, next, std::memory_order_relaxed)); + + *first_value = cur + 1; + /* + We reserve exactly what was asked for. MariaDB's update_auto_increment() + will call us again when the interval is exhausted. + */ + *nb_reserved_values = nb_desired_values; + + DBUG_VOID_RETURN; +} + +/* + Reset the auto-increment counter(s) to the given value. MariaDB's default + truncate() path calls this after delete_all_rows, and ALTER TABLE ... + AUTO_INCREMENT=N routes here as well. The next auto-generated ID equals + `value` itself, so we store `value - 1` (get_auto_increment does + fetch-add and returns cur+1). `value == 0` is the TRUNCATE case reset + to 1. Hidden-PK row-id gets the same treatment for consistency. +*/ +int ha_tidesdb::reset_auto_increment(ulonglong value) +{ + DBUG_ENTER("ha_tidesdb::reset_auto_increment"); + if (!share) DBUG_RETURN(0); + + ulonglong new_val = value > 0 ? value - 1 : 0; + share->auto_inc_val.store(new_val, std::memory_order_relaxed); + + /* Hidden PK row-ids are one-based (delete_all_rows stores + HIDDEN_PK_FIRST_ROW_ID for empty tables). Treat value==0 as restart. */ + uint64_t new_rowid = value > 0 ? (uint64_t)value : HIDDEN_PK_FIRST_ROW_ID; + share->next_row_id.store(new_rowid, std::memory_order_relaxed); + + DBUG_RETURN(0); +} + +/* ******************** Table scan (SELECT) ******************** */ + +int ha_tidesdb::rnd_init(bool scan) +{ + DBUG_ENTER("ha_tidesdb::rnd_init"); + + current_pk_len_ = 0; + scan_dir_ = DIR_NONE; + + /* Lazy txn, we ensure stmt_txn exists */ + { + int erc = ensure_stmt_txn(); + if (erc) DBUG_RETURN(erc); + } + scan_txn = stmt_txn; + + /* We use cached trx pointer (set in external_lock) to avoid + ha_thd() virtual dispatch + thd_get_ha_data() hash lookup + on every scan init -- this is a hot path in nested-loop joins. 
*/ + uint64_t cur_gen = cached_trx_ ? cached_trx_->txn_generation : 0; + + if (scan_iter && + (scan_iter_cf_ != share->cf || scan_iter_txn_ != scan_txn || scan_iter_txn_gen_ != cur_gen)) + { + tidesdb_iter_free(scan_iter); + scan_iter = NULL; + scan_iter_cf_ = NULL; + scan_iter_txn_ = NULL; + } + + if (!scan_iter) + { + int rc = tdb_iter_new_blocking(ha_thd(), scan_txn, share->cf, &scan_iter); + if (rc != TDB_SUCCESS) + { + scan_txn = NULL; + DBUG_RETURN(tdb_rc_to_ha(rc, "rnd_init txn_begin")); + } + scan_iter_cf_ = share->cf; + scan_iter_txn_ = scan_txn; + scan_iter_txn_gen_ = cur_gen; + } + + uint8_t data_prefix = KEY_NS_DATA; + tidesdb_iter_seek(scan_iter, &data_prefix, 1); + + DBUG_RETURN(0); +} + +int ha_tidesdb::rnd_end() +{ + DBUG_ENTER("ha_tidesdb::rnd_end"); + + /* We do not free scan_iter, we keep cached for reuse within this statement. + Iterator is freed in external_lock(F_UNLCK) or close(). */ + scan_txn = NULL; + + DBUG_RETURN(0); +} + +int ha_tidesdb::rnd_next(uchar *buf) +{ + DBUG_ENTER("ha_tidesdb::rnd_next"); + + if (cached_thd_ && thd_killed(cached_thd_)) DBUG_RETURN(HA_ERR_ABORTED_BY_USER); + + /* We advance past the last-read entry. on the first call after rnd_init + * the iterator is already positioned at the first data key by the seek + * in rnd_init, so we skip the advance (scan_dir_ == DIR_NONE). 
*/
+    if (scan_dir_ != DIR_NONE) tidesdb_iter_next(scan_iter);
+
+    int ret = iter_read_current(buf);
+    if (ret == 0) scan_dir_ = DIR_FORWARD;
+
+    DBUG_RETURN(ret);
+}
+
+/* ******************** position / rnd_pos ******************** */
+
+/*
+  Save the position of the most recently read row into `ref` by copying
+  current_pk_len_ bytes from current_pk_buf_. The `record` argument is not
+  used.
+  NOTE(review): rnd_pos() reads ref_length bytes back -- confirm that
+  current_pk_len_ always equals ref_length when this is called.
+*/
+void ha_tidesdb::position(const uchar *record)
+{
+    DBUG_ENTER("ha_tidesdb::position");
+    memcpy(ref, current_pk_buf_, current_pk_len_);
+    DBUG_VOID_RETURN;
+}
+
+/*
+  Fetch the row identified by `pos` into `buf`. `pos` is handed to
+  fetch_row_by_pk as PK bytes of length ref_length.
+  Returns the error from ensure_stmt_txn() or the result of
+  fetch_row_by_pk().
+*/
+int ha_tidesdb::rnd_pos(uchar *buf, uchar *pos)
+{
+    DBUG_ENTER("ha_tidesdb::rnd_pos");
+
+    /* Lazy txn, we ensure stmt_txn exists */
+    {
+        int erc = ensure_stmt_txn();
+        if (erc) DBUG_RETURN(erc);
+    }
+
+    int ret = fetch_row_by_pk(stmt_txn, pos, ref_length, buf);
+    DBUG_RETURN(ret);
+}
+
+/* ******************** Index scan ******************** */
+
+/*
+  Prepare a scan on index `idx`.
+
+  Records the active index, resets the per-scan flags, makes sure the
+  statement txn exists and selects the column family to scan: share->cf
+  for the user PK, share->idx_cfs[idx] otherwise. No iterator is created
+  here; a cached scan_iter is only dropped when it no longer matches the
+  target CF, txn pointer or txn generation. The `sorted` argument is not
+  consulted.
+
+  Returns 0, the error from ensure_stmt_txn(), or HA_ERR_GENERIC when no CF
+  exists for the index.
+*/
+int ha_tidesdb::index_init(uint idx, bool sorted)
+{
+    DBUG_ENTER("ha_tidesdb::index_init");
+    active_index = idx;
+    idx_pk_exact_done_ = false;
+    scan_dir_ = DIR_NONE;
+    spatial_scan_active_ = false;
+    /* Cache is_pk for the duration of the scan so navigation methods can
+       read a member instead of re-deriving the answer per row. */
+    is_pk_ = share->has_user_pk && idx == share->pk_index;
+
+    {
+        int erc = ensure_stmt_txn();
+        if (erc) DBUG_RETURN(erc);
+    }
+    scan_txn = stmt_txn;
+
+    /* Same predicate as is_pk_ above, evaluated again to pick the CF. */
+    tidesdb_column_family_t *target_cf;
+    if (share->has_user_pk && idx == share->pk_index)
+        target_cf = share->cf;
+    else if (idx < share->idx_cfs.size() && share->idx_cfs[idx])
+        target_cf = share->idx_cfs[idx];
+    else
+    {
+        scan_txn = NULL;
+        scan_cf_ = NULL;
+        sql_print_error("[TIDESDB] index_init: no CF for index %u", idx);
+        DBUG_RETURN(HA_ERR_GENERIC);
+    }
+
+    scan_cf_ = target_cf;
+
+    /* We reuse cached iterator if it belongs to the same CF and same txn.
+       In nested-loop joins, index_init/index_end cycle N times on the
+       same index; reusing the iterator avoids N expensive iter_new() calls
+       (each builds a merge heap from all SSTables).
+
+       If the txn changed (e.g. after COMMIT created a new one), the
+       iterator holds a stale txn pointer and must be recreated.
+       We compare both the pointer and a monotonic generation counter
+       because the allocator can reuse the same address for a new txn.
+
+       We use cached_trx_ (set in external_lock) to avoid ha_thd() virtual
+       dispatch + thd_get_ha_data() hash lookup on every iteration of
+       the outer loop in nested-loop joins. */
+    uint64_t cur_gen = cached_trx_ ? cached_trx_->txn_generation : 0;
+
+    if (scan_iter &&
+        (scan_iter_cf_ != target_cf || scan_iter_txn_ != scan_txn || scan_iter_txn_gen_ != cur_gen))
+    {
+        tidesdb_iter_free(scan_iter);
+        scan_iter = NULL;
+        scan_iter_cf_ = NULL;
+        scan_iter_txn_ = NULL;
+    }
+    /* If scan_iter is non-NULL here, ensure_scan_iter() will reuse it. */
+
+    DBUG_RETURN(0);
+}
+
+/*
+  Lazily create the scan iterator from scan_cf_ when first needed.
+  Returns 0 on success or a handler error code.
+
+  A failure is remembered together with the (scan_cf_, scan_txn) pair it
+  happened for and returned again without retrying while that pair is
+  unchanged; a later success resets the remembered error to 0.
+*/
+int ha_tidesdb::ensure_scan_iter()
+{
+    if (scan_iter) return 0;
+
+    /* If a prior attempt with this exact (scan_cf_, scan_txn) combination
+       already failed, short-circuit instead of re-logging and re-failing.
+       The cache is invalidated whenever the caller changes scan_cf_ or
+       scan_txn (natural since those moves imply a new attempt).
+       NOTE(review): this key is pointer-only. index_init compares the txn
+       generation as well because a txn address can be reused; confirm a
+       stale error cannot be replayed for a new txn at the same address. */
+    if (scan_iter_last_err_ && scan_iter_last_err_cf_ == scan_cf_ &&
+        scan_iter_last_err_txn_ == scan_txn)
+        return scan_iter_last_err_;
+
+    if (!scan_txn || !scan_cf_)
+    {
+        sql_print_error("[TIDESDB] ensure_scan_iter: no txn or CF");
+        scan_iter_last_err_ = HA_ERR_GENERIC;
+        scan_iter_last_err_cf_ = scan_cf_;
+        scan_iter_last_err_txn_ = scan_txn;
+        return HA_ERR_GENERIC;
+    }
+    int rc = tdb_iter_new_blocking(ha_thd(), scan_txn, scan_cf_, &scan_iter);
+    if (rc == TDB_SUCCESS)
+    {
+        /* Record what the iterator was built for, so index_init can tell
+           whether it is still reusable. */
+        scan_iter_cf_ = scan_cf_;
+        scan_iter_txn_ = scan_txn;
+        scan_iter_txn_gen_ = cached_trx_ ? cached_trx_->txn_generation : 0;
+        scan_iter_last_err_ = 0;
+        return 0;
+    }
+    int herr = tdb_rc_to_ha(rc, "ensure_scan_iter");
+    scan_iter_last_err_ = herr;
+    scan_iter_last_err_cf_ = scan_cf_;
+    scan_iter_last_err_txn_ = scan_txn;
+    return herr;
+}
+
+/*
+  End an index scan: drops scan_txn, resets active_index to MAX_KEY and
+  clears the spatial and partial-PK scan flags. scan_iter is not touched
+  here. Always returns 0.
+*/
+int ha_tidesdb::index_end()
+{
+    DBUG_ENTER("ha_tidesdb::index_end");
+
+    scan_txn = NULL;
+    active_index = MAX_KEY;
+    spatial_scan_active_ = false;
+    pk_partial_exact_active_ = false;
+
+    DBUG_RETURN(0);
+}
+
+int ha_tidesdb::index_read_map(uchar *buf, const uchar *key, key_part_map keypart_map,
+                               enum ha_rkey_function find_flag)
+{
+    DBUG_ENTER("ha_tidesdb::index_read_map");
+
+    /* key_copy_to_comparable uses key_restore + make_comparable_key,
+       which reads fields via make_sort_key_part. */
+    MY_BITMAP *old_map = tmp_use_all_columns(table, &table->read_set);
+
+    uint key_len = calculate_key_len(table, active_index, key, keypart_map);
+
+    /* We convert the key_copy-format search key to our comparable format */
+    KEY *ki = &table->key_info[active_index];
+    uchar comp_key[MAX_KEY_LENGTH];
+    uint comp_len = key_copy_to_comparable(ki, key, key_len, comp_key);
+
+    tmp_restore_column_map(&table->read_set, old_map);
+
+    memcpy(idx_search_comp_, comp_key, comp_len);
+    idx_search_comp_len_ = comp_len;
+    /* Reset by default; only the partial-PK exact branch below re-sets it. */
+    pk_partial_exact_active_ = false;
+
+    if (is_pk_)
+    {
+        uchar seek_key[DATA_KEY_BUF_LEN];
+        uint seek_len = build_data_key(comp_key, comp_len, seek_key);
+
+        if (find_flag == HA_READ_KEY_EXACT)
+        {
+            uint full_pk_comp_len = share->idx_comp_key_len[share->pk_index];
+            if (comp_len >= full_pk_comp_len)
+            {
+                /* Full PK match, point lookup only, no iterator needed.
+                   Pessimistic row locking happens inside fetch_row_by_pk
+                   (covers the autocommit UPDATE bypass case as well, since
+                   stmt_has_write_lock_ gates write-intent reads regardless
+                   of multi-statement context).
*/ + int ret = fetch_row_by_pk(scan_txn, comp_key, comp_len, buf); + if (ret == 0) idx_pk_exact_done_ = true; + DBUG_RETURN(ret); + } + + /* Partial PK prefix (e.g. first column of composite PK). + We need an iterator-based prefix scan -- seek to the first + matching data key and let index_next_same iterate through + all entries sharing this prefix. */ + { + int irc = ensure_scan_iter(); + if (irc) DBUG_RETURN(irc); + } + tidesdb_iter_seek(scan_iter, seek_key, seek_len); + pk_partial_exact_active_ = true; + int ret = iter_read_current(buf); + if (ret == 0) scan_dir_ = DIR_FORWARD; + DBUG_RETURN(ret); + } + + /* All other PK scan modes need the iterator */ + { + int irc = ensure_scan_iter(); + if (irc) DBUG_RETURN(irc); + } + + if (find_flag == HA_READ_KEY_OR_NEXT || find_flag == HA_READ_AFTER_KEY) + { + tidesdb_iter_seek(scan_iter, seek_key, seek_len); + + if (find_flag == HA_READ_AFTER_KEY && tidesdb_iter_valid(scan_iter)) + { + uint8_t *ik = NULL; + size_t iks = 0; + if (tidesdb_iter_key(scan_iter, &ik, &iks) == TDB_SUCCESS && iks == seek_len && + memcmp(ik, seek_key, iks) == 0) + tidesdb_iter_next(scan_iter); + } + + int ret = iter_read_current(buf); + if (ret == 0) scan_dir_ = DIR_FORWARD; + DBUG_RETURN(ret); + } + else if (find_flag == HA_READ_KEY_OR_PREV || find_flag == HA_READ_BEFORE_KEY || + find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_PREFIX_LAST_OR_PREV) + { + tidesdb_iter_seek_for_prev(scan_iter, seek_key, seek_len); + if (find_flag == HA_READ_BEFORE_KEY && tidesdb_iter_valid(scan_iter)) + { + uint8_t *ik = NULL; + size_t iks = 0; + if (tidesdb_iter_key(scan_iter, &ik, &iks) == TDB_SUCCESS && iks == seek_len && + memcmp(ik, seek_key, iks) == 0) + tidesdb_iter_prev(scan_iter); + } + + int ret = iter_read_current(buf); + if (ret == 0) scan_dir_ = DIR_BACKWARD; + DBUG_RETURN(ret); + } + + /* Fallback is to seek forward */ + tidesdb_iter_seek(scan_iter, seek_key, seek_len); + int ret = iter_read_current(buf); + if (ret == 0) scan_dir_ = 
DIR_FORWARD; + DBUG_RETURN(ret); + } + else + { + /* -- Spatial index MBR query, hilbert range scan with MBR post-filter */ + if (is_spatial_index(&table->key_info[active_index]) && find_flag >= HA_READ_MBR_CONTAIN && + find_flag <= HA_READ_MBR_EQUAL) + { + tdb_mbr_t qmbr; + spatial_parse_query_mbr(key, &qmbr); + spatial_qmbr_[MBR_XMIN_IDX] = qmbr.xmin; + spatial_qmbr_[MBR_YMIN_IDX] = qmbr.ymin; + spatial_qmbr_[MBR_XMAX_IDX] = qmbr.xmax; + spatial_qmbr_[MBR_YMAX_IDX] = qmbr.ymax; + spatial_mode_ = find_flag; + + spatial_scan_active_ = true; + + int irc = ensure_scan_iter(); + if (irc) DBUG_RETURN(irc); + + /* We decompose the query box into hilbert curve ranges. + For DISJOINT, we must scan everything (disjoint entries + can be anywhere on the curve). For other predicates, + we compute a tight set of ranges covering only the cells + that overlap the query box. */ + if (find_flag == HA_READ_MBR_DISJOINT) + { + spatial_ranges_.clear(); + spatial_ranges_.push_back({HILBERT_RANGE_FULL_LO, HILBERT_RANGE_FULL_HI}); + } + else + { + uint32_t qx0 = double_to_lex_uint32(qmbr.xmin); + uint32_t qy0 = double_to_lex_uint32(qmbr.ymin); + uint32_t qx1 = double_to_lex_uint32(qmbr.xmax); + uint32_t qy1 = double_to_lex_uint32(qmbr.ymax); + spatial_decompose_ranges(qx0, qy0, qx1, qy1, spatial_ranges_); + } + spatial_range_idx_ = 0; + + if (!spatial_ranges_.empty()) + { + uchar seek_key[SPATIAL_HILBERT_KEY_LEN]; + encode_hilbert_be(spatial_ranges_[0].first, seek_key); + tidesdb_iter_seek(scan_iter, seek_key, SPATIAL_HILBERT_KEY_LEN); + } + + DBUG_RETURN(spatial_scan_next(buf)); + } + + /* Secondary index read, needs an iterator */ + int irc = ensure_scan_iter(); + if (irc) DBUG_RETURN(irc); + + if (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_KEY_OR_NEXT) + { + tidesdb_iter_seek(scan_iter, comp_key, comp_len); + } + else if (find_flag == HA_READ_AFTER_KEY) + { + /* We seek, then skip past any exact prefix matches */ + tidesdb_iter_seek(scan_iter, comp_key, comp_len); + while 
(tidesdb_iter_valid(scan_iter)) + { + uint8_t *ik = NULL; + size_t iks = 0; + if (tidesdb_iter_key(scan_iter, &ik, &iks) != TDB_SUCCESS) break; + if (iks < comp_len || memcmp(ik, comp_key, comp_len) != 0) break; + tidesdb_iter_next(scan_iter); + } + } + else if (find_flag == HA_READ_KEY_OR_PREV || find_flag == HA_READ_BEFORE_KEY || + find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_PREFIX_LAST_OR_PREV) + { + /* We build upper bound, comp_key with all 0xFF appended for pk portion */ + uchar upper[SEC_IDX_KEY_BUF_LEN]; + memcpy(upper, comp_key, comp_len); + memset(upper + comp_len, KEY_INF_HI_BYTE, share->pk_key_len); + uint upper_len = comp_len + share->pk_key_len; + tidesdb_iter_seek_for_prev(scan_iter, upper, upper_len); + } + else + { + tidesdb_iter_seek(scan_iter, comp_key, comp_len); + } + + /* We read the current entry from the secondary index. + ICP loop, we evaluate pushed index condition before the expensive + PK point-lookup. Entries that fail the condition are skipped + without touching the data CF (same pattern as InnoDB). 
*/ + bool is_backward = + (find_flag == HA_READ_KEY_OR_PREV || find_flag == HA_READ_BEFORE_KEY || + find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_PREFIX_LAST_OR_PREV); + + uint idx_col_len = share->idx_comp_key_len[active_index]; + + for (;;) + { + if (!tidesdb_iter_valid(scan_iter)) DBUG_RETURN(HA_ERR_KEY_NOT_FOUND); + + uint8_t *ik = NULL; + size_t iks = 0; + if (tidesdb_iter_key(scan_iter, &ik, &iks) != TDB_SUCCESS) + DBUG_RETURN(HA_ERR_KEY_NOT_FOUND); + + /* For EXACT match, we verify the index prefix matches */ + if (find_flag == HA_READ_KEY_EXACT) + { + if (iks < comp_len || memcmp(ik, comp_key, comp_len) != 0) + DBUG_RETURN(HA_ERR_KEY_NOT_FOUND); + } + + if (iks <= idx_col_len) DBUG_RETURN(HA_ERR_KEY_NOT_FOUND); + + /* ICP -- we evaluate pushed condition on index columns before PK lookup */ + check_result_t icp = icp_check_secondary(ik, iks, active_index, buf); + if (icp == CHECK_NEG) + { + if (is_backward) + tidesdb_iter_prev(scan_iter); + else + tidesdb_iter_next(scan_iter); + continue; /* skip this entry */ + } + if (icp == CHECK_OUT_OF_RANGE) DBUG_RETURN(HA_ERR_END_OF_FILE); + if (icp == CHECK_ABORTED_BY_USER) DBUG_RETURN(HA_ERR_ABORTED_BY_USER); + + /* CHECK_POS -- condition satisfied (or ICP not applicable) */ + int ret; + if (keyread_only_ && try_keyread_from_index(ik, iks, active_index, buf)) + ret = 0; + else + ret = fetch_row_by_pk(scan_txn, ik + idx_col_len, (uint)(iks - idx_col_len), buf); + if (ret == 0) + { + scan_dir_ = is_backward ? 
DIR_BACKWARD : DIR_FORWARD; + } + DBUG_RETURN(ret); + } + } +} + +int ha_tidesdb::index_next(uchar *buf) +{ + DBUG_ENTER("ha_tidesdb::index_next"); + + if (cached_thd_ && thd_killed(cached_thd_)) DBUG_RETURN(HA_ERR_ABORTED_BY_USER); + + /* Spatial idx continuation */ + if (spatial_scan_active_) + { + int irc = ensure_scan_iter(); + if (irc) DBUG_RETURN(irc); + if (scan_dir_ != DIR_NONE) tidesdb_iter_next(scan_iter); + DBUG_RETURN(spatial_scan_next(buf)); + } + + if (idx_pk_exact_done_) + { + idx_pk_exact_done_ = false; + int irc = ensure_scan_iter(); + if (irc) DBUG_RETURN(irc); + uchar seek_key[DATA_KEY_BUF_LEN]; + uint seek_len = build_data_key(current_pk_buf_, current_pk_len_, seek_key); + tidesdb_iter_seek(scan_iter, seek_key, seek_len); + if (tidesdb_iter_valid(scan_iter)) tidesdb_iter_next(scan_iter); + /* iterator is now past the PK exact match -- advance+read below */ + } + else + { + int irc = ensure_scan_iter(); + if (irc) DBUG_RETURN(irc); + /* We advance past the last-read entry (iterator stays at current + * with no pre-advance). On the first call after index_first + * sets DIR_NONE, the iterator is already at the correct position + * so we must not advance. */ + if (scan_dir_ != DIR_NONE) tidesdb_iter_next(scan_iter); + } + + if (is_pk_) + { + int ret = iter_read_current(buf); + if (ret == 0 && pk_partial_exact_active_ && idx_search_comp_len_ > 0) + { + /* Continuation of a partial-PK HA_READ_KEY_EXACT scan: the + iterator might have stepped past the prefix. Validate the + PK still starts with the original search bytes; if not the + scan is finished. index_next_same already does this; the + PK branch never did, so a plan that called index_next + (not index_next_same) after a partial-PK exact seek would + return rows from beyond the requested prefix. 
*/ + if (current_pk_len_ < idx_search_comp_len_ || + memcmp(current_pk_buf_, idx_search_comp_, idx_search_comp_len_) != 0) + { + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + } + scan_dir_ = DIR_FORWARD; + DBUG_RETURN(ret); + } + else + { + /* Secondary index -- ICP loop -- we skip entries that fail the pushed + condition without the expensive PK point-lookup. */ + uint idx_key_len = share->idx_comp_key_len[active_index]; + for (;;) + { + if (!tidesdb_iter_valid(scan_iter)) DBUG_RETURN(HA_ERR_END_OF_FILE); + + uint8_t *ik = NULL; + size_t iks = 0; + if (tidesdb_iter_key(scan_iter, &ik, &iks) != TDB_SUCCESS) + DBUG_RETURN(HA_ERR_END_OF_FILE); + + if (iks <= idx_key_len) DBUG_RETURN(HA_ERR_END_OF_FILE); + + /* ICP -- we evaluate pushed condition before PK lookup */ + check_result_t icp = icp_check_secondary(ik, iks, active_index, buf); + if (icp == CHECK_NEG) + { + tidesdb_iter_next(scan_iter); + continue; + } + if (icp == CHECK_OUT_OF_RANGE) DBUG_RETURN(HA_ERR_END_OF_FILE); + if (icp == CHECK_ABORTED_BY_USER) DBUG_RETURN(HA_ERR_ABORTED_BY_USER); + + int ret; + if (keyread_only_ && try_keyread_from_index(ik, iks, active_index, buf)) + ret = 0; + else + ret = fetch_row_by_pk(scan_txn, ik + idx_key_len, (uint)(iks - idx_key_len), buf); + scan_dir_ = DIR_FORWARD; + DBUG_RETURN(ret); + } + } +} + +int ha_tidesdb::index_prev(uchar *buf) +{ + DBUG_ENTER("ha_tidesdb::index_prev"); + + if (cached_thd_ && thd_killed(cached_thd_)) DBUG_RETURN(HA_ERR_ABORTED_BY_USER); + + /* If PK exact match was done without iterator, we create it now and + seek to the matched key so that prev() steps before it. 
*/ + if (idx_pk_exact_done_) + { + idx_pk_exact_done_ = false; + int irc = ensure_scan_iter(); + if (irc) DBUG_RETURN(irc); + uchar seek_key[DATA_KEY_BUF_LEN]; + uint seek_len = build_data_key(current_pk_buf_, current_pk_len_, seek_key); + tidesdb_iter_seek(scan_iter, seek_key, seek_len); + /* iterator is at the matched key -- fall through to prev() */ + } + else + { + int irc = ensure_scan_iter(); + if (irc) DBUG_RETURN(irc); + } + + tidesdb_iter_prev(scan_iter); + + if (is_pk_) + { + while (tidesdb_iter_valid(scan_iter)) + { + uint8_t *key = NULL; + size_t ks = 0; + if (tidesdb_iter_key(scan_iter, &key, &ks) != TDB_SUCCESS) + DBUG_RETURN(HA_ERR_END_OF_FILE); + if (is_data_key(key, ks)) break; + tidesdb_iter_prev(scan_iter); + } + scan_dir_ = DIR_BACKWARD; + DBUG_RETURN(iter_read_current(buf)); + } + else + { + /* Secondary index -- ICP loop (backward direction) */ + uint idx_key_len = share->idx_comp_key_len[active_index]; + for (;;) + { + if (!tidesdb_iter_valid(scan_iter)) DBUG_RETURN(HA_ERR_END_OF_FILE); + + uint8_t *ik = NULL; + size_t iks = 0; + if (tidesdb_iter_key(scan_iter, &ik, &iks) != TDB_SUCCESS) + DBUG_RETURN(HA_ERR_END_OF_FILE); + + if (iks <= idx_key_len) DBUG_RETURN(HA_ERR_END_OF_FILE); + + /* ICP -- we evaluate pushed condition before PK lookup */ + check_result_t icp = icp_check_secondary(ik, iks, active_index, buf); + if (icp == CHECK_NEG) + { + tidesdb_iter_prev(scan_iter); + continue; + } + if (icp == CHECK_OUT_OF_RANGE) DBUG_RETURN(HA_ERR_END_OF_FILE); + if (icp == CHECK_ABORTED_BY_USER) DBUG_RETURN(HA_ERR_ABORTED_BY_USER); + + scan_dir_ = DIR_BACKWARD; + int ret; + if (keyread_only_ && try_keyread_from_index(ik, iks, active_index, buf)) + ret = 0; + else + ret = fetch_row_by_pk(scan_txn, ik + idx_key_len, (uint)(iks - idx_key_len), buf); + DBUG_RETURN(ret); + } + } +} + +int ha_tidesdb::index_first(uchar *buf) +{ + DBUG_ENTER("ha_tidesdb::index_first"); + + idx_pk_exact_done_ = false; + int irc = ensure_scan_iter(); + if (irc) 
DBUG_RETURN(irc); + + if (is_pk_) + { + uint8_t data_prefix = KEY_NS_DATA; + tidesdb_iter_seek(scan_iter, &data_prefix, 1); + int ret = iter_read_current(buf); + if (ret == 0) scan_dir_ = DIR_FORWARD; + DBUG_RETURN(ret); + } + else + { + tidesdb_iter_seek_to_first(scan_iter); + scan_dir_ = DIR_NONE; /* index_next will set DIR_FORWARD */ + DBUG_RETURN(index_next(buf)); + } +} + +int ha_tidesdb::index_last(uchar *buf) +{ + DBUG_ENTER("ha_tidesdb::index_last"); + + idx_pk_exact_done_ = false; + int irc = ensure_scan_iter(); + if (irc) DBUG_RETURN(irc); + + if (is_pk_) + { + /* Seek-for-prev(sentinel) lands on the last existing data key in one + operation, where seek_to_last walks every source's max key first + and then we'd still need a backward scan past KEY_NS_META. + KEY_NS_DATA (0x01) sorts after KEY_NS_META (0x00) so any data + key is greater than every meta key, and the sentinel below is + larger than any real data key in the CF. */ + uchar sentinel[DATA_KEY_BUF_LEN]; + sentinel[0] = KEY_NS_DATA; + uint sentinel_len = KEY_NAMESPACE_LEN + share->pk_key_len; + if (sentinel_len > sizeof(sentinel)) sentinel_len = sizeof(sentinel); + memset(sentinel + KEY_NAMESPACE_LEN, KEY_INF_HI_BYTE, sentinel_len - KEY_NAMESPACE_LEN); + tidesdb_iter_seek_for_prev(scan_iter, sentinel, sentinel_len); + /* Defensive backward scan in case the seek lands on a meta key + in a CF with no data rows yet. */ + while (tidesdb_iter_valid(scan_iter)) + { + uint8_t *key = NULL; + size_t ks = 0; + if (tidesdb_iter_key(scan_iter, &key, &ks) != TDB_SUCCESS) + DBUG_RETURN(HA_ERR_END_OF_FILE); + if (is_data_key(key, ks)) break; + tidesdb_iter_prev(scan_iter); + } + scan_dir_ = DIR_BACKWARD; + DBUG_RETURN(iter_read_current(buf)); + } + else + { + /* Secondary CFs hold only index entries; seek_for_prev(0xFF...) + lands on the last one without the per-source max-key walk that + seek_to_last performs. 
*/ + uchar sentinel[SEC_IDX_KEY_BUF_LEN]; + uint sentinel_len = share->idx_comp_key_len[active_index] + share->pk_key_len; + if (sentinel_len > sizeof(sentinel)) sentinel_len = sizeof(sentinel); + memset(sentinel, KEY_INF_HI_BYTE, sentinel_len); + tidesdb_iter_seek_for_prev(scan_iter, sentinel, sentinel_len); + if (!tidesdb_iter_valid(scan_iter)) DBUG_RETURN(HA_ERR_END_OF_FILE); + + uint8_t *ik = NULL; + size_t iks = 0; + if (tidesdb_iter_key(scan_iter, &ik, &iks) != TDB_SUCCESS) DBUG_RETURN(HA_ERR_END_OF_FILE); + + uint idx_key_len = share->idx_comp_key_len[active_index]; + if (iks <= idx_key_len) DBUG_RETURN(HA_ERR_END_OF_FILE); + + scan_dir_ = DIR_BACKWARD; + DBUG_RETURN(fetch_row_by_pk(scan_txn, ik + idx_key_len, (uint)(iks - idx_key_len), buf)); + } +} + +int ha_tidesdb::index_next_same(uchar *buf, const uchar *key, uint keylen) +{ + DBUG_ENTER("ha_tidesdb::index_next_same"); + + if (cached_thd_ && thd_killed(cached_thd_)) DBUG_RETURN(HA_ERR_ABORTED_BY_USER); + + /* Spatial index continuation */ + if (spatial_scan_active_) + { + if (!scan_iter) DBUG_RETURN(HA_ERR_END_OF_FILE); + tidesdb_iter_next(scan_iter); + DBUG_RETURN(spatial_scan_next(buf)); + } + + if (is_pk_) + { + uint full_pk_comp_len = share->idx_comp_key_len[share->pk_index]; + if (idx_search_comp_len_ >= full_pk_comp_len) + { + /* Full PK is unique -- after the first match there are no more */ + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + + /* Partial PK prefix on a composite PK -- we iterate through data keys + that share this prefix-- KEY_NS_DATA + comparable_pk_prefix... */ + if (!scan_iter) DBUG_RETURN(HA_ERR_END_OF_FILE); + + tidesdb_iter_next(scan_iter); + if (!tidesdb_iter_valid(scan_iter)) DBUG_RETURN(HA_ERR_END_OF_FILE); + + uint8_t *ik = NULL; + size_t iks = 0; + if (tidesdb_iter_key(scan_iter, &ik, &iks) != TDB_SUCCESS) DBUG_RETURN(HA_ERR_END_OF_FILE); + + /* Data key format-- KEY_NS_DATA + comparable_pk. + We check if the PK prefix still matches (skip the namespace byte). 
*/ + if (iks < KEY_NAMESPACE_LEN + idx_search_comp_len_ || + memcmp(ik + KEY_NAMESPACE_LEN, idx_search_comp_, idx_search_comp_len_) != 0) + DBUG_RETURN(HA_ERR_END_OF_FILE); + + int ret = iter_read_current(buf); + if (ret == 0) scan_dir_ = DIR_FORWARD; + DBUG_RETURN(ret); + } + + /* Secondary index -- we advance past the last-read entry, then ICP loop */ + if (!scan_iter) DBUG_RETURN(HA_ERR_END_OF_FILE); + tidesdb_iter_next(scan_iter); + + uint idx_col_len = share->idx_comp_key_len[active_index]; + for (;;) + { + if (!tidesdb_iter_valid(scan_iter)) DBUG_RETURN(HA_ERR_END_OF_FILE); + + uint8_t *ik = NULL; + size_t iks = 0; + if (tidesdb_iter_key(scan_iter, &ik, &iks) != TDB_SUCCESS) DBUG_RETURN(HA_ERR_END_OF_FILE); + + if (iks < idx_search_comp_len_ || memcmp(ik, idx_search_comp_, idx_search_comp_len_) != 0) + { + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + + if (iks <= idx_col_len) DBUG_RETURN(HA_ERR_END_OF_FILE); + + /* ICP -- we evaluate pushed condition before PK lookup */ + check_result_t icp = icp_check_secondary(ik, iks, active_index, buf); + if (icp == CHECK_NEG) + { + tidesdb_iter_next(scan_iter); + continue; + } + if (icp == CHECK_OUT_OF_RANGE) DBUG_RETURN(HA_ERR_END_OF_FILE); + if (icp == CHECK_ABORTED_BY_USER) DBUG_RETURN(HA_ERR_ABORTED_BY_USER); + + int ret; + if (keyread_only_ && try_keyread_from_index(ik, iks, active_index, buf)) + ret = 0; + else + ret = fetch_row_by_pk(scan_txn, ik + idx_col_len, (uint)(iks - idx_col_len), buf); + DBUG_RETURN(ret); + } +} + +/* ******************** update_row (UPDATE) ******************** */ + +int ha_tidesdb::update_row(const uchar *old_data, const uchar *new_data) +{ + DBUG_ENTER("ha_tidesdb::update_row"); + + MY_BITMAP *old_map = tmp_use_all_columns(table, &table->read_set); + + /* We cache THD and trx once to avoid repeated ha_thd() virtual calls + and thd_get_ha_data() indirect lookups throughout this function. 
+ We use cached_thd_/cached_trx_ set in external_lock to avoid + per-row ha_thd() virtual dispatch and thd_get_ha_data() hash lookup. */ + tidesdb_trx_t *trx = cached_trx_; + + /* We use handler-owned pk buffer for old/new PK to avoid large stack arrays. + old_pk is saved from current_pk_buf_ before we overwrite it. */ + uchar old_pk[MAX_KEY_LENGTH]; + uint old_pk_len = current_pk_len_; + memcpy(old_pk, current_pk_buf_, old_pk_len); + + /* Acquire the X lock on the row we are about to mutate. Under + SELECT ... FOR UPDATE the scan already took it; under UPDATE + reached via a secondary-index/PK range scan we skipped it during + ICP filtering and reacquire here on the actual target. The lock + manager treats repeat acquisitions of an already-held mode as a + cheap no-op so the FOR UPDATE path doesn't pay twice. */ + if (unlikely(srv_pessimistic_locking) && trx) + { + int lrc = row_lock_acquire(trx, old_pk, old_pk_len, cached_thd_, TDB_LOCK_MODE_X); + if (lrc) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(lrc); + } + } + + /* new_pk uses its own stack buffer so it survives the current_pk_buf_ + manipulations in the secondary index loop (avoids overlapping memcpy UB) */ + uchar new_pk[MAX_KEY_LENGTH]; + uint new_pk_len = pk_from_record(new_data, new_pk); + + const std::string &new_row = serialize_row(new_data); + if (share->encrypted && new_row.empty()) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(HA_ERR_GENERIC); + } + const uint8_t *row_ptr = (const uint8_t *)new_row.data(); + size_t row_len = new_row.size(); + + { + int erc = ensure_stmt_txn(); + if (erc) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(erc); + } + } + tidesdb_txn_t *txn = stmt_txn; + stmt_txn_dirty = true; + if (trx) + { + trx->dirty = true; + } + + /* We populate THDVAR cache if not yet done this statement */ + if (!cached_thdvars_valid_) + { + cached_skip_unique_ = THDVAR(cached_thd_, skip_unique_check); + cached_sess_ttl_ 
= THDVAR(cached_thd_, ttl); + cached_single_delete_primary_ = THDVAR(cached_thd_, single_delete_primary); + cached_thdvars_valid_ = true; + } + + int rc; + bool pk_changed = (old_pk_len != new_pk_len || memcmp(old_pk, new_pk, old_pk_len) != 0); + + /* We compute TTL when the table has TTL configured or the session overrides it. + Uses cached_sess_ttl_ to avoid THDVAR + ha_thd() per row. */ + time_t row_ttl = + (share->has_ttl || cached_sess_ttl_ > 0) ? compute_row_ttl(new_data) : TIDESDB_TTL_NONE; + + /* Uniqueness enforcement. A TidesDB put silently overwrites, so an + UPDATE that moves a row onto an existing primary key would destroy + the colliding row, and one that moves it onto an existing UNIQUE + secondary value would create a duplicate. The server relies on the + engine to surface HA_ERR_FOUND_DUPP_KEY, so these checks run before + any txn mutation and leave the txn untouched on a violation. A + session that set tidesdb_skip_unique_check bypasses them by caller + contract, matching write_row. 
*/ + if (!cached_skip_unique_) + { + if (pk_changed && share->has_user_pk) + { + uchar chk_dk[DATA_KEY_BUF_LEN]; + uint chk_dk_len = build_data_key(new_pk, new_pk_len, chk_dk); + uint8_t *dup_val = NULL; + size_t dup_len = 0; + int grc = tidesdb_txn_get(txn, share->cf, chk_dk, chk_dk_len, &dup_val, &dup_len); + if (grc == TDB_SUCCESS) + { + tidesdb_free(dup_val); + errkey = lookup_errkey = share->pk_index; + memcpy(dup_ref, new_pk, new_pk_len); + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(HA_ERR_FOUND_DUPP_KEY); + } + if (grc != TDB_ERR_NOT_FOUND) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(tdb_rc_to_ha(grc, "update_row pk_dup_check")); + } + } + + if (share->num_secondary_indexes > 0) + { + const my_ptrdiff_t nd_ptrdiff = (my_ptrdiff_t)(new_data - table->record[0]); + for (uint i = 0; i < table->s->keys; i++) + { + if (share->has_user_pk && i == share->pk_index) continue; + if (i >= share->idx_cfs.size() || !share->idx_cfs[i]) continue; + if (share->idx_is_fts[i] || share->idx_is_spatial[i]) continue; + if (!(table->key_info[i].flags & HA_NOSAME)) continue; + + KEY *ki = &table->key_info[i]; + + /* SQL gives NULL no identity, so a UNIQUE index never + constrains a row whose indexed value is NULL in any part. + Skip the check entirely in that case, matching InnoDB. + This also keeps the engine off the server's internal + MHNSW graph table, whose UNIQUE(tref) column is NULL for + the graph metadata rows. */ + bool any_null = false; + for (uint p = 0; p < ki->user_defined_key_parts; p++) + { + Field *f = ki->key_part[p].field; + if (f->real_maybe_null() && f->is_real_null(nd_ptrdiff)) + { + any_null = true; + break; + } + } + if (any_null) continue; + + /* Compare the old and new comparable index keys. Equal + keys mean the indexed value did not change, so no new + collision is possible no matter whether the primary key + moved. 
When they differ, this row's own existing entry + sits under the old key, so any entry found under the new + key necessarily belongs to a different row. */ + uchar *old_prefix = upd_old_ik_; + uchar *new_prefix = upd_new_ik_; + uint old_prefix_len = + make_comparable_key(ki, old_data, ki->user_defined_key_parts, old_prefix); + uint new_prefix_len = + make_comparable_key(ki, new_data, ki->user_defined_key_parts, new_prefix); + if (old_prefix_len == new_prefix_len && + memcmp(old_prefix, new_prefix, new_prefix_len) == 0) + continue; + + tidesdb_iter_t *dup_iter = NULL; + int irc = tdb_iter_new_blocking(ha_thd(), txn, share->idx_cfs[i], &dup_iter); + if (irc != TDB_SUCCESS || !dup_iter) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(tdb_rc_to_ha(irc, "update_row dup_iter_new")); + } + + tidesdb_iter_seek(dup_iter, new_prefix, new_prefix_len); + bool dup = false; + if (tidesdb_iter_valid(dup_iter)) + { + uint8_t *fk = NULL; + size_t fks = 0; + if (tidesdb_iter_key(dup_iter, &fk, &fks) == TDB_SUCCESS && + fks >= new_prefix_len && memcmp(fk, new_prefix, new_prefix_len) == 0) + { + dup = true; + size_t suffix_len = fks - new_prefix_len; + if (suffix_len > 0 && suffix_len <= ref_length) + memcpy(dup_ref, fk + new_prefix_len, suffix_len); + } + } + tidesdb_iter_free(dup_iter); + + if (dup) + { + errkey = lookup_errkey = i; + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(HA_ERR_FOUND_DUPP_KEY); + } + } + } + } + + /* If PK changed, we delete old entry and insert new */ + if (pk_changed) + { + uchar old_dk[DATA_KEY_BUF_LEN]; + uint old_dk_len = build_data_key(old_pk, old_pk_len, old_dk); + rc = tdb_txn_delete_cf_blocking(cached_thd_, txn, share->cf, old_dk, old_dk_len, + cached_single_delete_primary_); + if (rc != TDB_SUCCESS) goto err; + } + + { + uchar new_dk[DATA_KEY_BUF_LEN]; + uint new_dk_len = build_data_key(new_pk, new_pk_len, new_dk); + rc = tdb_txn_put_blocking(cached_thd_, txn, share->cf, new_dk, new_dk_len, row_ptr, 
row_len, + row_ttl); + if (rc != TDB_SUCCESS) goto err; + } + + /* Single consolidated dispatch over secondary indexes. Regular, FTS, + and spatial branches share one walk of table->s->keys. Each branch + short-circuits via a write_set pre-check so unchanged indexes skip + both key construction and LSM writes. */ + if (share->num_secondary_indexes > 0) + { + /* We use handler-owned buffers to avoid per-row heap allocation + and keep the stack frame within -Wframe-larger-than limits. */ + uchar *old_ik = upd_old_ik_; + uchar *new_ik = upd_new_ik_; + const uint num_keys = table->s->keys; + const bool has_user_pk = share->has_user_pk; + const uint pk_index = share->pk_index; + const size_t idx_cfs_sz = share->idx_cfs.size(); + + for (uint i = 0; i < num_keys; i++) + { + if (has_user_pk && i == pk_index) continue; + if (i >= idx_cfs_sz || !share->idx_cfs[i]) continue; + + KEY *ki = &table->key_info[i]; + + if (share->idx_is_fts[i]) + { + /* We skip if no indexed column actually changed */ + bool fts_changed = false; + for (uint p = 0; p < ki->user_defined_key_parts; p++) + { + uint fieldnr = ki->key_part[p].fieldnr - 1; + if (bitmap_is_set(table->write_set, fieldnr)) + { + fts_changed = true; + break; + } + } + if (!fts_changed) continue; + + CHARSET_INFO *fts_cs = ki->key_part[0].field->charset(); + + /* Tokenize both old and new docs, build term-frequency maps, + then emit only the minimum set of deletes/puts needed. + For a small edit to a large document this avoids + rewriting every term entry. 
*/ + std::vector old_tokens, new_tokens; + fts_extract_and_tokenize(table, ki, old_data, fts_cs, old_tokens); + fts_extract_and_tokenize(table, ki, new_data, fts_cs, new_tokens); + + std::unordered_map old_tf, new_tf; + for (auto &tok : old_tokens) old_tf[tok.word]++; + for (auto &tok : new_tokens) new_tf[tok.word]++; + uint32 old_wc = (uint32)old_tokens.size(); + uint32 new_wc = (uint32)new_tokens.size(); + + if (pk_changed) + { + /* PK changed -- the row identity changed so every old + (term, old_pk) must be deleted and every new (term, new_pk) + inserted. No diffing possible across different PKs. */ + for (auto &kv : old_tf) + { + const auto &term = kv.first; + uchar fk[FTS_KEY_BUF_LEN]; + uint fk_len = + fts_build_key(term.data(), (uint)term.size(), old_pk, old_pk_len, fk); + tdb_txn_delete_cf_blocking(cached_thd_, txn, share->idx_cfs[i], fk, fk_len, + true); + } + for (auto &kv : new_tf) + { + const auto &term = kv.first; + auto &tf = kv.second; + uchar fk[FTS_KEY_BUF_LEN]; + uint fk_len = + fts_build_key(term.data(), (uint)term.size(), new_pk, new_pk_len, fk); + uchar fv[FTS_VALUE_LEN]; + fts_build_value(tf, new_wc, fv); + rc = tdb_txn_put_blocking(cached_thd_, txn, share->idx_cfs[i], fk, fk_len, + fv, FTS_VALUE_LEN, row_ttl); + if (rc != TDB_SUCCESS) goto err; + } + } + else + { + /* PK stable -- apply term-level diff. Only delete a term + when it disappears and only write a term when it is new, + its tf changes, or doc_len changes (doc_len is part of + the stored value used by BM25). 
*/ + bool doc_len_changed = (old_wc != new_wc); + + for (auto &kv : old_tf) + { + const auto &term = kv.first; + if (new_tf.find(term) != new_tf.end()) continue; + uchar fk[FTS_KEY_BUF_LEN]; + uint fk_len = + fts_build_key(term.data(), (uint)term.size(), old_pk, old_pk_len, fk); + tdb_txn_delete_cf_blocking(cached_thd_, txn, share->idx_cfs[i], fk, fk_len, + true); + } + + for (auto &kv : new_tf) + { + const auto &term = kv.first; + auto &new_cnt = kv.second; + auto it = old_tf.find(term); + bool need_put; + if (it == old_tf.end()) + need_put = true; + else if (doc_len_changed) + need_put = true; + else + need_put = (it->second != new_cnt); + + if (!need_put) continue; + + uchar fk[FTS_KEY_BUF_LEN]; + uint fk_len = + fts_build_key(term.data(), (uint)term.size(), new_pk, new_pk_len, fk); + uchar fv[FTS_VALUE_LEN]; + fts_build_value(new_cnt, new_wc, fv); + rc = tdb_txn_put_blocking(cached_thd_, txn, share->idx_cfs[i], fk, fk_len, + fv, FTS_VALUE_LEN, row_ttl); + if (rc != TDB_SUCCESS) goto err; + } + } + + /* The doc count stays the same, only the word count moves. + Fold into the txn-level accumulator which flushes before + commit so the meta update lands in the same txn as the + row updates that produced it. */ + int64_t wc_delta = (int64_t)new_wc - (int64_t)old_wc; + if (wc_delta != 0) trx_fts_meta_accumulate(trx, share->cf, i, 0, wc_delta); + } + else if (share->idx_is_spatial[i]) + { + /* Skip when the geometry column is unchanged. 
*/ + uint fieldnr = ki->key_part[0].fieldnr - 1; + if (!bitmap_is_set(table->write_set, fieldnr)) continue; + + Field *geom_field = ki->key_part[0].field; + + /* Delete old spatial entry */ + { + my_ptrdiff_t ptd = (my_ptrdiff_t)(old_data - table->record[0]); + if (ptd) geom_field->move_field_offset(ptd); + String gs; + geom_field->val_str(&gs, &gs); + if (ptd) geom_field->move_field_offset(-ptd); + double xmn, ymn, xmx, ymx; + if (gs.length() > 0 && spatial_compute_mbr((const uchar *)gs.ptr(), gs.length(), + &xmn, &ymn, &xmx, &ymx)) + { + uchar sk[SPATIAL_HILBERT_KEY_LEN + MAX_KEY_LENGTH]; + uint sk_len = spatial_build_key((xmn + xmx) / MBR_CENTROID_DIV, + (ymn + ymx) / MBR_CENTROID_DIV, old_pk, + old_pk_len, sk); + tdb_txn_delete_cf_blocking(cached_thd_, txn, share->idx_cfs[i], sk, sk_len, + true); + } + } + + /* Insert new spatial entry */ + { + my_ptrdiff_t ptd = (my_ptrdiff_t)(new_data - table->record[0]); + if (ptd) geom_field->move_field_offset(ptd); + String gs; + geom_field->val_str(&gs, &gs); + if (ptd) geom_field->move_field_offset(-ptd); + double xmn, ymn, xmx, ymx; + if (gs.length() > 0 && spatial_compute_mbr((const uchar *)gs.ptr(), gs.length(), + &xmn, &ymn, &xmx, &ymx)) + { + uchar sk[SPATIAL_HILBERT_KEY_LEN + MAX_KEY_LENGTH]; + uint sk_len = spatial_build_key((xmn + xmx) / MBR_CENTROID_DIV, + (ymn + ymx) / MBR_CENTROID_DIV, new_pk, + new_pk_len, sk); + uchar sv[SPATIAL_MBR_VALUE_LEN]; + spatial_build_value(xmn, ymn, xmx, ymx, sv); + rc = tdb_txn_put_blocking(cached_thd_, txn, share->idx_cfs[i], sk, sk_len, + sv, SPATIAL_MBR_VALUE_LEN, row_ttl); + if (rc != TDB_SUCCESS) goto err; + } + } + } + else + { + /* Regular secondary index -- skip before building keys when + no indexed column changed and the PK is stable. Saves + the per-row make_comparable_key / sec_idx_key cost on + wide updates that touch only unrelated columns. 
*/ + if (!pk_changed) + { + bool idx_changed = false; + for (uint p = 0; p < ki->user_defined_key_parts; p++) + { + uint fieldnr = ki->key_part[p].fieldnr - 1; + if (bitmap_is_set(table->write_set, fieldnr)) + { + idx_changed = true; + break; + } + } + if (!idx_changed) continue; + } + + /* We build old index entry key. current_pk_buf_ is transiently + set to old/new PK so sec_idx_key's pk_from_record path works + for hidden-PK tables. */ + memcpy(current_pk_buf_, old_pk, old_pk_len); + current_pk_len_ = old_pk_len; + uint old_ik_len = + make_comparable_key(ki, old_data, ki->user_defined_key_parts, old_ik); + memcpy(old_ik + old_ik_len, old_pk, old_pk_len); + old_ik_len += old_pk_len; + + memcpy(current_pk_buf_, new_pk, new_pk_len); + current_pk_len_ = new_pk_len; + uint new_ik_len = sec_idx_key(i, new_data, new_ik); + + if (old_ik_len == new_ik_len && memcmp(old_ik, new_ik, old_ik_len) == 0) continue; + + rc = tdb_txn_delete_cf_blocking(cached_thd_, txn, share->idx_cfs[i], old_ik, + old_ik_len, true); + if (rc != TDB_SUCCESS) goto err; + rc = tdb_txn_put_blocking(cached_thd_, txn, share->idx_cfs[i], new_ik, new_ik_len, + &tdb_empty_val, sizeof(tdb_empty_val), row_ttl); + if (rc != TDB_SUCCESS) goto err; + } + } + } + + memcpy(current_pk_buf_, new_pk, new_pk_len); + current_pk_len_ = new_pk_len; + + /* Bulk UPDATE mid-txn commit. Symmetric to write_row's bulk path. + One UPDATE op counts as 1 data put + 1 data delete (when PK changed) + + up to num_secondary_indexes entries rewritten. We overestimate as + `1 + 2 * num_secondary_indexes` */ + if (in_bulk_update_) + { + bulk_insert_ops_ += 1 + 2 * (ha_rows)share->num_secondary_indexes; + if (bulk_insert_ops_ >= TIDESDB_BULK_INSERT_BATCH_OPS) + { + int mrc = maybe_bulk_commit(trx); + if (mrc) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(mrc); + } + bulk_insert_ops_ = 0; + } + } + + /* Commit happens in external_lock(F_UNLCK). 
*/ + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(0); + +err: + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(tdb_rc_to_ha(rc, "update_row")); +} + +/* ******************** delete_row (DELETE) ******************** */ + +int ha_tidesdb::delete_row(const uchar *buf) +{ + DBUG_ENTER("ha_tidesdb::delete_row"); + + MY_BITMAP *old_map = tmp_use_all_columns(table, &table->read_set); + + /* We use cached_trx_ from external_lock to avoid per-row hash lookups. */ + tidesdb_trx_t *trx = cached_trx_; + + /* Acquire the X lock on the target row. See update_row for the + rationale-- iter_read_current skips the lock during UPDATE/DELETE + ICP filtering so unrelated rows on a range scan are not blocked. */ + if (unlikely(srv_pessimistic_locking) && trx) + { + int lrc = + row_lock_acquire(trx, current_pk_buf_, current_pk_len_, cached_thd_, TDB_LOCK_MODE_X); + if (lrc) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(lrc); + } + } + + /* We populate THDVAR cache if not yet done this statement. A pure DELETE + reaches delete_row without first going through write_row/update_row, so + the cache may still be stale from the prior statement. 
*/ + if (!cached_thdvars_valid_) + { + cached_skip_unique_ = THDVAR(cached_thd_, skip_unique_check); + cached_sess_ttl_ = THDVAR(cached_thd_, ttl); + cached_single_delete_primary_ = THDVAR(cached_thd_, single_delete_primary); + cached_compact_after_range_delete_min_rows_ = + THDVAR(cached_thd_, compact_after_range_delete_min_rows); + cached_thdvars_valid_ = true; + } + + { + int erc = ensure_stmt_txn(); + if (erc) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(erc); + } + } + tidesdb_txn_t *txn = stmt_txn; + stmt_txn_dirty = true; + if (trx) + { + trx->dirty = true; + } + + uchar dk[DATA_KEY_BUF_LEN]; + uint dk_len = build_data_key(current_pk_buf_, current_pk_len_, dk); + + /* Track the touched data-key range when the auto-compact session var + is on and we are inside a multi-row DELETE. We compare the full + data keys (KEY_NS_DATA + comparable_pk) so the recorded bounds can + be passed to tidesdb_compact_range without further conversion. */ + if (in_bulk_delete_ && cached_compact_after_range_delete_min_rows_ > 0) + { + const std::string this_key((const char *)dk, dk_len); + if (bulk_delete_rows_ == 0) + { + bulk_delete_min_pk_ = this_key; + bulk_delete_max_pk_ = this_key; + } + else + { + if (this_key < bulk_delete_min_pk_) bulk_delete_min_pk_ = this_key; + if (this_key > bulk_delete_max_pk_) bulk_delete_max_pk_ = this_key; + } + bulk_delete_rows_++; + } + + int rc = tdb_txn_delete_cf_blocking(cached_thd_, txn, share->cf, dk, dk_len, + cached_single_delete_primary_); + if (rc != TDB_SUCCESS) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(tdb_rc_to_ha(rc, "delete_row")); + } + + /* We delete secondary index entries in a single consolidated dispatch loop. + Regular, FTS, and spatial indexes are handled inline. 
*/ + if (share->num_secondary_indexes > 0) + { + const uint num_keys = table->s->keys; + const bool has_user_pk = share->has_user_pk; + const uint pk_index = share->pk_index; + const size_t idx_cfs_sz = share->idx_cfs.size(); + + for (uint i = 0; i < num_keys; i++) + { + if (has_user_pk && i == pk_index) continue; + if (i >= idx_cfs_sz || !share->idx_cfs[i]) continue; + + KEY *ki = &table->key_info[i]; + + if (share->idx_is_fts[i]) + { + CHARSET_INFO *fts_cs = ki->key_part[0].field->charset(); + std::vector fts_tokens; + fts_extract_and_tokenize(table, ki, buf, fts_cs, fts_tokens); + + std::unordered_map tf_map; + for (auto &tok : fts_tokens) tf_map[tok.word]++; + uint32 word_count = (uint32)fts_tokens.size(); + + for (auto &kv : tf_map) + { + const auto &term = kv.first; + uchar fk[FTS_KEY_BUF_LEN]; + uint fk_len = fts_build_key(term.data(), (uint)term.size(), current_pk_buf_, + current_pk_len_, fk); + tdb_txn_delete_cf_blocking(cached_thd_, txn, share->idx_cfs[i], fk, fk_len, + true); + } + + trx_fts_meta_accumulate(trx, share->cf, i, FTS_DOC_DELTA_DEL, -(int64_t)word_count); + } + else if (share->idx_is_spatial[i]) + { + Field *geom_field = ki->key_part[0].field; + my_ptrdiff_t ptd = (my_ptrdiff_t)(buf - table->record[0]); + if (ptd) geom_field->move_field_offset(ptd); + String geom_str; + geom_field->val_str(&geom_str, &geom_str); + if (ptd) geom_field->move_field_offset(-ptd); + + double xmin, ymin, xmax, ymax; + if (geom_str.length() > 0 && + spatial_compute_mbr((const uchar *)geom_str.ptr(), geom_str.length(), &xmin, + &ymin, &xmax, &ymax)) + { + double cx = (xmin + xmax) / MBR_CENTROID_DIV; + double cy = (ymin + ymax) / MBR_CENTROID_DIV; + uchar sk[SPATIAL_HILBERT_KEY_LEN + MAX_KEY_LENGTH]; + uint sk_len = spatial_build_key(cx, cy, current_pk_buf_, current_pk_len_, sk); + tdb_txn_delete_cf_blocking(cached_thd_, txn, share->idx_cfs[i], sk, sk_len, + true); + } + } + else + { + uchar ik[SEC_IDX_KEY_BUF_LEN]; + uint ik_len = sec_idx_key(i, buf, ik); + rc = 
tdb_txn_delete_cf_blocking(cached_thd_, txn, share->idx_cfs[i], ik, ik_len, + true); + if (rc != TDB_SUCCESS) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(tdb_rc_to_ha(rc, "delete_row idx")); + } + } + } + } + + /* Bulk DELETE mid-txn commit-- 1 data delete + num_secondary_indexes + secondary-index deletes per row. */ + if (in_bulk_delete_) + { + bulk_insert_ops_ += 1 + (ha_rows)share->num_secondary_indexes; + if (bulk_insert_ops_ >= TIDESDB_BULK_INSERT_BATCH_OPS) + { + int mrc = maybe_bulk_commit(trx); + if (mrc) + { + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(mrc); + } + bulk_insert_ops_ = 0; + } + } + + tmp_restore_column_map(&table->read_set, old_map); + DBUG_RETURN(0); +} + +/* ******************** delete_all_rows (TRUNCATE) ******************** */ + +int ha_tidesdb::delete_all_rows(void) +{ + DBUG_ENTER("ha_tidesdb::delete_all_rows"); + + /* We free cached iterators before dropping/recreating CFs. + The iterators hold refs to SSTables in the CFs being dropped. */ + if (scan_iter) + { + tidesdb_iter_free(scan_iter); + scan_iter = NULL; + scan_iter_cf_ = NULL; + scan_iter_txn_ = NULL; + } + free_dup_iter_cache(); + + /* We discard the connection txn before drop/recreate. The txn may have + buffered INSERT/UPDATE ops from earlier statements; committing them + after the CF is recreated would re-insert stale data. 
*/ + { + THD *thd = ha_thd(); + tidesdb_trx_t *trx = (tidesdb_trx_t *)thd_get_ha_data(thd, ht); + if (trx && trx->txn) + { + tidesdb_txn_rollback(trx->txn); + tidesdb_txn_free(trx->txn); + trx->txn = NULL; + trx->dirty = false; + trx->fts_meta_pending.clear(); + trx->fts_meta_dirty = false; + } + stmt_txn = NULL; + stmt_txn_dirty = false; + } + + tidesdb_column_family_config_t cfg = build_cf_config(TDB_TABLE_OPTIONS(table)); + + { + std::string cf_name = share->cf_name; + int rc = tidesdb_drop_column_family(tdb_global, cf_name.c_str()); + if (rc != TDB_SUCCESS && rc != TDB_ERR_NOT_FOUND) + { + sql_print_error("[TIDESDB] truncate: failed to drop CF '%s' (err=%d)", cf_name.c_str(), + rc); + DBUG_RETURN(tdb_rc_to_ha(rc, "truncate drop_cf")); + } + + rc = tidesdb_create_column_family(tdb_global, cf_name.c_str(), &cfg); + if (rc != TDB_SUCCESS) + { + sql_print_error("[TIDESDB] truncate: failed to recreate CF '%s' (err=%d)", + cf_name.c_str(), rc); + DBUG_RETURN(tdb_rc_to_ha(rc, "truncate create_cf")); + } + + share->cf = tidesdb_get_column_family(tdb_global, cf_name.c_str()); + if (!share->cf) + { + sql_print_error("[TIDESDB] truncate: CF '%s' not found after recreate", + cf_name.c_str()); + DBUG_RETURN(HA_ERR_GENERIC); + } + } + + for (uint i = 0; i < share->idx_cfs.size(); i++) + { + if (!share->idx_cfs[i]) continue; + + const std::string &idx_name = share->idx_cf_names[i]; + tidesdb_drop_column_family(tdb_global, idx_name.c_str()); + + tidesdb_column_family_config_t idx_cfg = cfg; + if (i < table->s->keys && table->key_info[i].option_struct) + { + ha_index_option_struct *iopts = table->key_info[i].option_struct; + idx_cfg.use_btree = iopts->use_btree ? 
1 : 0; + } + + int rc = tidesdb_create_column_family(tdb_global, idx_name.c_str(), &idx_cfg); + if (rc != TDB_SUCCESS) + { + sql_print_warning("[TIDESDB] truncate: failed to recreate idx CF '%s' (err=%d)", + idx_name.c_str(), rc); + share->idx_cfs[i] = NULL; + continue; + } + + share->idx_cfs[i] = tidesdb_get_column_family(tdb_global, idx_name.c_str()); + } + + share->next_row_id.store(HIDDEN_PK_FIRST_ROW_ID, std::memory_order_relaxed); + + DBUG_RETURN(0); +} + +/* ******************** Bulk DML ******************** */
+
+/*
+  Commit the current txn mid-statement and reset it with READ_COMMITTED so
+  the next batch starts fresh.  Shared by bulk INSERT/UPDATE/DELETE once
+  buffered ops cross TIDESDB_BULK_INSERT_BATCH_OPS -- keeps us under
+  TDB_MAX_TXN_OPS and bounds txn memory.  Higher isolation levels would
+  cause unbounded read-set growth across batches.
+
+  Any cached iterators and dup-check iterators are invalidated, they hold
+  references to MERGE_SOURCE_TXN_OPS that txn_reset clears.
+
+  If the inner commit fails (e.g. transient TDB_ERR_UNKNOWN from a unified
+  memtable rotation race) we MUST surface that to the SQL layer.  Returning
+  0 here while the buffered ops are gone causes silent data loss -- the
+  caller (write_row / update_row / delete_row) reports success even though
+  up to TIDESDB_BULK_INSERT_BATCH_OPS rows were dropped on the floor.
+  Instead, rollback to release the txn's state, swap in a fresh txn so the
+  connection is left in a valid state for any retry, and propagate the
+  error code so MariaDB rolls the statement back and surfaces it to the
+  client (typically as ER_ERROR_DURING_COMMIT).
+
+  Whenever the old txn object has to be freed, the cached iterators are
+  freed first and stmt_txn / scan_txn are cleared, so that a failing
+  tidesdb_txn_begin_with_isolation() cannot leave this handler holding
+  pointers into freed memory.  delete_all_rows uses the same order
+  (iterators first, txn second).
+
+  Returns 0 on success (also when trx or trx->txn is NULL), otherwise the
+  handler error produced by tdb_rc_to_ha().
+*/
+int ha_tidesdb::maybe_bulk_commit(tidesdb_trx_t *trx)
+{
+    if (!trx || !trx->txn) return 0;
+
+    /* Folded FTS meta deltas have to land in the same txn as the row puts
+       they account for, so flush them before the mid-statement commit. */
+    int frc = flush_trx_fts_meta_pending(cached_thd_, trx);
+    if (frc != TDB_SUCCESS) return tdb_rc_to_ha(frc, "bulk_commit fts_meta_flush");
+
+    /* Frees the cached scan iterator and the dup-check iterator cache. */
+    auto drop_cached_iters = [this]()
+    {
+        if (scan_iter)
+        {
+            tidesdb_iter_free(scan_iter);
+            scan_iter = NULL;
+            scan_iter_cf_ = NULL;
+            scan_iter_txn_ = NULL;
+        }
+        free_dup_iter_cache();
+    };
+
+    int crc = tdb_txn_commit_blocking(cached_thd_, trx->txn);
+    if (crc != TDB_SUCCESS)
+    {
+        sql_print_error(
+            "[TIDESDB] bulk mid-commit failed rc=%d -- aborting statement to "
+            "avoid silent row loss",
+            crc);
+        /* Release plugin-level row locks.  The lock-request structs were
+           allocated against the txn that just failed; leaving them on
+           held_locks_head would expose dangling memory once we tidesdb_txn_free
+           the underlying txn below.  After release we are safe to swap in a
+           fresh txn. */
+        row_locks_release_all(trx);
+        /* Detach this handler from the txn while the txn is still alive:
+           the iterators were created on it, and stmt_txn / scan_txn must not
+           outlive it if the begin below fails. */
+        drop_cached_iters();
+        stmt_txn = NULL;
+        scan_txn = NULL;
+        /* Release the txn's buffered state.  Even if rollback itself fails
+           we still free+begin below so the connection is usable. */
+        (void)tidesdb_txn_rollback(trx->txn);
+        tidesdb_txn_free(trx->txn);
+        trx->txn = NULL;
+        /* The old txn is gone whether or not a new one can be started.
+           NOTE(review): presumably other users of trx compare txn_generation
+           to detect a replaced txn -- confirm. */
+        trx->txn_generation++;
+        int brc =
+            tidesdb_txn_begin_with_isolation(tdb_global, TDB_ISOLATION_READ_COMMITTED, &trx->txn);
+        if (brc != TDB_SUCCESS) return tdb_rc_to_ha(brc, "bulk_commit txn_begin(after_fail)");
+        stmt_txn = trx->txn;
+        scan_txn = trx->txn;
+        return tdb_rc_to_ha(crc, "bulk_commit");
+    }
+
+    /* Successful mid-statement commit.  The library has released its
+       internal locks for the just-committed txn, so the plugin-level
+       locks no longer correspond to anything serializable.  Drop them
+       before the reset so a stalled cursor on the same connection cannot
+       see locks attributed to a txn that no longer exists. */
+    row_locks_release_all(trx);
+
+    int rrc = tidesdb_txn_reset(trx->txn, TDB_ISOLATION_READ_COMMITTED);
+    if (rrc != TDB_SUCCESS)
+    {
+        sql_print_warning(
+            "[TIDESDB] bulk tidesdb_txn_reset failed (rc=%d), falling back to "
+            "free+begin",
+            rrc);
+        /* Same detach-before-free order as the commit-failure path. */
+        drop_cached_iters();
+        stmt_txn = NULL;
+        scan_txn = NULL;
+        tidesdb_txn_free(trx->txn);
+        trx->txn = NULL;
+        int rc =
+            tidesdb_txn_begin_with_isolation(tdb_global, TDB_ISOLATION_READ_COMMITTED, &trx->txn);
+        if (rc != TDB_SUCCESS)
+        {
+            trx->txn_generation++; /* old txn was freed; see note above */
+            return tdb_rc_to_ha(rc, "bulk_commit txn_begin");
+        }
+    }
+
+    stmt_txn = trx->txn;
+    trx->txn_generation++;
+
+    /* No-op when the fallback above already dropped them. */
+    drop_cached_iters();
+    scan_txn = trx->txn;
+    return 0;
+}
+
+void ha_tidesdb::start_bulk_insert(ha_rows rows, uint flags) +{ + in_bulk_insert_ = true; + bulk_insert_ops_ = 0; +} + +int ha_tidesdb::end_bulk_insert() +{ + in_bulk_insert_ = false; + return 0; +} + +/* + start_bulk_update returns 0 when the engine will handle bulk batching. + We then flip the flag that update_row checks at its tail so every row + contributes to the shared ops counter. +*/ +bool ha_tidesdb::start_bulk_update() +{ + in_bulk_update_ = true; + bulk_insert_ops_ = 0; + return 0; +} + +int ha_tidesdb::end_bulk_update() +{ + in_bulk_update_ = false; + return 0; +} + +/* + MariaDB calls bulk_update_row instead of update_row when start_bulk_update + returned 0. We don't actually buffer rows (TidesDB's txn is the buffer); + we just delegate so the standard update_row path runs and its tail-side + mid-commit block batches. dup_key_found tracks duplicate-key collisions + found in buffered-but-not-yet-applied rows -- since we apply immediately, + it's always zero. 
+*/ +int ha_tidesdb::bulk_update_row(const uchar *old_data, const uchar *new_data, + ha_rows *dup_key_found) +{ + DBUG_ENTER("ha_tidesdb::bulk_update_row"); + if (dup_key_found) *dup_key_found = 0; /* nothing is buffered here, so there are no deferred duplicates to report */ + DBUG_RETURN(update_row(old_data, new_data)); +} + +bool ha_tidesdb::start_bulk_delete() +{ /* Enter bulk DELETE mode: zero the shared op counter and the touched-range tracking that end_bulk_delete reads. Returns 0. NOTE(review): presumably 0 means "engine handles bulk mode", as the start_bulk_update comment states for updates -- confirm. */ + in_bulk_delete_ = true; + bulk_insert_ops_ = 0; + bulk_delete_rows_ = 0; + bulk_delete_min_pk_.clear(); + bulk_delete_max_pk_.clear(); + return 0; +} + +int ha_tidesdb::end_bulk_delete() +{ /* Leave bulk DELETE mode, optionally compact the deleted data-key range, then clear the range tracking. Always returns 0: a failed compaction is only logged. */ + in_bulk_delete_ = false; + + /* Auto compact-after-range-delete. Threshold zero (default) keeps the + previous behavior, i.e. no synchronous compaction at end-of-statement. + When the threshold is met we call tidesdb_compact_range over the + observed [min_pk, max_pk] data-key range on the primary CF. Secondary + index tombstones are reclaimed by the per-CF tombstone_density_trigger + on those CFs. */ + if (cached_compact_after_range_delete_min_rows_ > 0 && + bulk_delete_rows_ >= cached_compact_after_range_delete_min_rows_ && share && share->cf && + !bulk_delete_min_pk_.empty() && !bulk_delete_max_pk_.empty()) + { + int crc = tidesdb_compact_range( + share->cf, (const uint8_t *)bulk_delete_min_pk_.data(), bulk_delete_min_pk_.size(), + (const uint8_t *)bulk_delete_max_pk_.data(), bulk_delete_max_pk_.size()); + /* TDB_ERR_LOCKED is benign here -- another compaction is already + running over a superset of our range, so our reclamation + request will be absorbed by it. Only log real failures. 
*/ + if (crc != TDB_SUCCESS && crc != TDB_ERR_LOCKED) + { + sql_print_warning( + "[TIDESDB] post-DELETE compact_range on '%s' failed (rows=%llu, err=%d)", + share->cf_name.c_str(), (unsigned long long)bulk_delete_rows_, crc); + } + } + + bulk_delete_rows_ = 0; + bulk_delete_min_pk_.clear(); + bulk_delete_max_pk_.clear(); + return 0; +} + +/* ******************** Index Condition Pushdown (ICP) ******************** */ + +Item *ha_tidesdb::idx_cond_push(uint keyno, Item *idx_cond) +{ + DBUG_ENTER("ha_tidesdb::idx_cond_push"); + + /* We accept the pushed condition, the server will evaluate it for us + during index scans via handler::pushed_idx_cond. For secondary + index scans the condition is checked before the PK lookup, saving + the most expensive operation when the condition filters rows. */ + pushed_idx_cond = idx_cond; + pushed_idx_cond_keyno = keyno; + in_range_check_pushed_down = true; + + DBUG_RETURN(NULL); /* NOTE(review): NULL presumably tells the server that no remainder of the condition is handed back -- confirm against the handler API */ +} + +/* ******************** Multi-Range Read (MRR) ******************** */ + +/* + Decide whether to accept a custom MRR strategy. We only handle the case + where every range the optimizer hands us is a full-key point lookup + (UNIQUE_RANGE|EQ_RANGE) -- typically `WHERE col IN (v1, v2, ...)` on a + PK or full-key unique index. Ranges flagged NULL_RANGE are rejected as + well. For mixed or true-range sequences we leave + HA_MRR_USE_DEFAULT_IMPL set so the handler::multi_range_read_* default + path runs unchanged. + + Iterating the sequence here consumes it; MariaDB re-initialises it before + calling multi_range_read_init, so probing is safe. + + The return value is always whatever handler::multi_range_read_info_const + returned; accepting the custom path only clears HA_MRR_USE_DEFAULT_IMPL in + *mrr_mode and sets *bufsz to 0. +*/ +ha_rows ha_tidesdb::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, void *seq_init_param, + uint n_ranges_arg, uint *bufsz, uint *mrr_mode, + ha_rows limit, Cost_estimate *cost) +{ + /* We compute the default cost + flags first so non-accepted sequences fall + through to the server's MRR->read_range_first path with correct costing. 
*/ + ha_rows rows = handler::multi_range_read_info_const(keyno, seq, seq_init_param, n_ranges_arg, + bufsz, mrr_mode, limit, cost); + if (rows == HA_POS_ERROR) return rows; + + /* Partitioned tables are served by ha_partition, which dispatches + multi_range_read_* across child handlers using its own DS-MRR-backed + logic. If we clear HA_MRR_USE_DEFAULT_IMPL here, ha_partition's + ordered-index-scan path ends up invoking our custom _next without + the state its own ordering logic expects and crashes. Refuse to + accept MRR for partitioned tables -- the default path runs correctly. */ +#ifdef WITH_PARTITION_STORAGE_ENGINE + if (table && table->part_info) return rows; +#endif + + /* Probe the sequence, we accept only if every range is a full single-point + equality. A single non-point range forces us back to the default path. */ + KEY_MULTI_RANGE range; + range_seq_t it = seq->init(seq_init_param, n_ranges_arg, *mrr_mode); + bool all_point = true; + uint count = 0; + while (!seq->next(it, &range)) + { + count++; + if (!(range.range_flag & UNIQUE_RANGE) || (range.range_flag & NULL_RANGE) || + !(range.range_flag & EQ_RANGE)) + { + all_point = false; + break; /* count stops short here, which is harmless: all_point is already false */ + } + } + + /* We only accept when there are multiple ranges. For a single point lookup + the optimizer's eq_ref plan (plain index_read_map) is a better fit and + -- critically -- also the only path where pessimistic row locking + engages. Accepting MRR for 1-range scans silently converts UPDATE + WHERE pk=v into a range scan that bypasses that lock. */ + if (all_point && count >= MRR_ACCEPT_MIN_RANGES) + { + *mrr_mode &= ~HA_MRR_USE_DEFAULT_IMPL; + *bufsz = 0; /* we use our own std::vector, not HANDLER_BUFFER */ + } + return rows; +} + +/* + Build the sorted list of point lookups, or fall through to the default + impl if HA_MRR_USE_DEFAULT_IMPL is still set. Sorting by comparable + bytes converts N scattered LSM seeks into a monotone stream -- much + friendlier to the block cache and the merge-heap. 
+*/
+int ha_tidesdb::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param, uint n_ranges,
+                                      uint mrr_mode, HANDLER_BUFFER *buf)
+{
+    DBUG_ENTER("ha_tidesdb::multi_range_read_init");
+
+    /* The custom path is active only when multi_range_read_info_const
+       cleared HA_MRR_USE_DEFAULT_IMPL; otherwise hand everything to the
+       base class untouched. */
+    const bool use_default_impl = (mrr_mode & HA_MRR_USE_DEFAULT_IMPL) != 0;
+    mrr_custom_active_ = !use_default_impl;
+    if (use_default_impl)
+    {
+        DBUG_RETURN(handler::multi_range_read_init(seq, seq_init_param, n_ranges, mrr_mode, buf));
+    }
+
+    /* Start from a clean slate for this scan. */
+    mrr_entries_.clear();
+    mrr_next_idx_ = 0;
+    mrr_keyno_ = active_index;
+    mrr_no_assoc_ = MY_TEST(mrr_mode & HA_MRR_NO_ASSOCIATION);
+    if (n_ranges > 0)
+    {
+        mrr_entries_.reserve(n_ranges);
+    }
+
+    KEY *key_def = &table->key_info[mrr_keyno_];
+
+    /* key_copy_to_comparable restores the caller's key_copy bytes into
+       record[1] and reads fields, so every column has to be readable while
+       the ranges are translated. */
+    MY_BITMAP *saved_read_set = tmp_use_all_columns(table, &table->read_set);
+
+    /* One entry per range: the start key in comparable form plus the
+       caller's range pointer. */
+    KEY_MULTI_RANGE cur_range;
+    for (range_seq_t seq_it = seq->init(seq_init_param, n_ranges, mrr_mode);
+         !seq->next(seq_it, &cur_range);)
+    {
+        uchar cmp_buf[MAX_KEY_LENGTH];
+        const uint cmp_len = key_copy_to_comparable(ki_unused_guard(key_def), cur_range.start_key.key,
+                                                    cur_range.start_key.length, cmp_buf);
+
+        tdb_mrr_entry entry;
+        entry.comp_key.assign((const char *)cmp_buf, cmp_len);
+        entry.ptr = cur_range.ptr;
+        mrr_entries_.push_back(std::move(entry));
+    }
+
+    tmp_restore_column_map(&table->read_set, saved_read_set);
+
+    /* Order the lookups by comparable key bytes. */
+    auto by_comparable_key = [](const tdb_mrr_entry &lhs, const tdb_mrr_entry &rhs)
+    { return lhs.comp_key < rhs.comp_key; };
+    std::sort(mrr_entries_.begin(), mrr_entries_.end(), by_comparable_key);
+
+    DBUG_RETURN(0);
+}
+
+/* + Deliver the next row from the sorted list of point lookups. PK lookups + bypass the iterator entirely via fetch_row_by_pk; secondary index lookups + reuse the cached scan iterator and a single seek per entry. Rows that + the index knew about but the data CF no longer has (stale entries after + concurrent delete) are silently skipped. 
+*/ +int ha_tidesdb::multi_range_read_next(range_id_t *range_info) +{ + DBUG_ENTER("ha_tidesdb::multi_range_read_next"); + + if (!mrr_custom_active_) DBUG_RETURN(handler::multi_range_read_next(range_info)); + + if (cached_thd_ && thd_killed(cached_thd_)) DBUG_RETURN(HA_ERR_ABORTED_BY_USER); + + /* Lazy txn -- the optimizer may invoke MRR without a prior rnd_init / index_init. */ + int erc = ensure_stmt_txn(); + if (erc) DBUG_RETURN(erc); + if (!scan_txn) scan_txn = stmt_txn; + + bool is_pk_scan = share->has_user_pk && mrr_keyno_ == share->pk_index; + uint idx_col_len = share->idx_comp_key_len[mrr_keyno_]; + + while (mrr_next_idx_ < mrr_entries_.size()) + { + const tdb_mrr_entry &e = mrr_entries_[mrr_next_idx_++]; + if (!mrr_no_assoc_) *range_info = e.ptr; + + if (is_pk_scan) + { + int rc = fetch_row_by_pk(scan_txn, (const uchar *)e.comp_key.data(), + (uint)e.comp_key.size(), table->record[0]); + if (rc == HA_ERR_KEY_NOT_FOUND) continue; /* stale range, try next */ + DBUG_RETURN(rc); + } + + /* Secondary index point lookup -- seek, verify prefix match, then + either cover-read from the index or PK-fetch. 
*/ + if (mrr_keyno_ >= share->idx_cfs.size() || !share->idx_cfs[mrr_keyno_]) + continue; /* missing CF for this index -- skip defensively */ + scan_cf_ = share->idx_cfs[mrr_keyno_]; + int irc = ensure_scan_iter(); + if (irc) DBUG_RETURN(irc); + + tidesdb_iter_seek(scan_iter, (const uint8_t *)e.comp_key.data(), (uint)e.comp_key.size()); + if (!tidesdb_iter_valid(scan_iter)) continue; + + uint8_t *ik = NULL; + size_t iks = 0; + if (tidesdb_iter_key(scan_iter, &ik, &iks) != TDB_SUCCESS) continue; + if (iks < e.comp_key.size() || memcmp(ik, e.comp_key.data(), e.comp_key.size()) != 0) + continue; /* no entry for this point */ + if (iks <= idx_col_len) continue; + + int rc; + if (keyread_only_ && try_keyread_from_index(ik, iks, mrr_keyno_, table->record[0])) + rc = 0; + else + rc = fetch_row_by_pk(scan_txn, ik + idx_col_len, (uint)(iks - idx_col_len), + table->record[0]); + if (rc == HA_ERR_KEY_NOT_FOUND) continue; + DBUG_RETURN(rc); + } + + DBUG_RETURN(HA_ERR_END_OF_FILE); +} + +/* ******************** info ******************** */ + +int ha_tidesdb::info(uint flag) +{ + DBUG_ENTER("ha_tidesdb::info"); + + if (share) ref_length = share->pk_key_len; + + if ((flag & (HA_STATUS_VARIABLE | HA_STATUS_CONST)) && share && share->cf) + { + long long now = (long long)microsecond_interval_timer(); + long long last = share->stats_refresh_us.load(std::memory_order_relaxed); + if (now - last > TIDESDB_STATS_REFRESH_US && + share->stats_refresh_us.compare_exchange_weak(last, now, std::memory_order_relaxed)) + { + tidesdb_stats_t *st = NULL; + if (tidesdb_get_stats(share->cf, &st) == TDB_SUCCESS && st) + { + share->cached_records.store(st->total_keys, std::memory_order_relaxed); + + /* total_data_size only counts SSTable klog+vlog; memtable_size + holds the active memtable footprint. Sum both so that + DATA_LENGTH in information_schema.TABLES is non-zero even + before the first flush. When both are 0 (library gap), + fall back to total_keys * avg entry size. 
*/ + uint64_t data_sz = st->total_data_size + (uint64_t)st->memtable_size; + if (data_sz == 0 && st->total_keys > 0) + data_sz = (uint64_t)(st->total_keys * (st->avg_key_size + st->avg_value_size)); + share->cached_data_size.store(data_sz, std::memory_order_relaxed); + uint32_t mrl = (uint32_t)(st->avg_key_size + st->avg_value_size); + if (mrl == 0) mrl = table->s->reclength; + share->cached_mean_rec_len.store(mrl, std::memory_order_relaxed); + share->cached_read_amp.store(st->read_amp > 0 ? st->read_amp : READ_AMP_NONE, + std::memory_order_relaxed); + + /* We sum secondary index CF sizes for index_file_length */ + uint64_t idx_total = 0; + for (uint i = 0; i < share->idx_cfs.size(); i++) + { + if (!share->idx_cfs[i]) continue; + tidesdb_stats_t *ist = NULL; + if (tidesdb_get_stats(share->idx_cfs[i], &ist) == TDB_SUCCESS && ist) + { + uint64_t isz = ist->total_data_size + (uint64_t)ist->memtable_size; + if (isz == 0 && ist->total_keys > 0) + isz = (uint64_t)(ist->total_keys * + (ist->avg_key_size + ist->avg_value_size)); + idx_total += isz; + tidesdb_free_stats(ist); + } + } + share->cached_idx_data_size.store(idx_total, std::memory_order_relaxed); + + tidesdb_free_stats(st); + } + + /* Also refresh SHOW GLOBAL STATUS variables while we're updating stats */ + tidesdb_refresh_status_vars(); + } + + stats.records = share->cached_records.load(std::memory_order_relaxed); + if (stats.records == 0) stats.records = TIDESDB_MIN_STATS_RECORDS; + stats.data_file_length = share->cached_data_size.load(std::memory_order_relaxed); + stats.index_file_length = share->cached_idx_data_size.load(std::memory_order_relaxed); + stats.mean_rec_length = share->cached_mean_rec_len.load(std::memory_order_relaxed); + stats.delete_length = 0; + stats.mrr_length_per_rec = ref_length + sizeof(uint64_t); + } + + /* HA_STATUS_TIME -- we create_time from .frm stat and update_time from last DML */ + if ((flag & HA_STATUS_TIME) && share) + { + stats.create_time = share->create_time; + 
stats.update_time = share->update_time.load(std::memory_order_relaxed); + } + + /* HA_STATUS_CONST -- set rec_per_key for index selectivity estimates. + PK and UNIQUE indexes -- rec_per_key = 1. + Non-unique secondary indexes -- use cached_rec_per_key if populated + by ANALYZE TABLE, else use a heuristic + (total_keys / STATS_REC_PER_KEY_FALLBACK_DIVISOR). */ + if ((flag & HA_STATUS_CONST) && share) + { + for (uint i = 0; i < table->s->keys; i++) + { + KEY *key = &table->key_info[i]; + bool is_pk = share->has_user_pk && i == share->pk_index; + bool is_unique = (key->flags & HA_NOSAME); + ulong cached_rpk = + (i < MAX_KEY) ? share->cached_rec_per_key[i].load(std::memory_order_relaxed) : 0; + for (uint j = 0; j < key->ext_key_parts; j++) + { + if (is_pk || is_unique) + { + if (j + 1 >= key->user_defined_key_parts) + { + /* Full unique key, exactly 1 row per distinct value */ + key->rec_per_key[j] = REC_PER_KEY_UNIQUE; + } + else + { + /* Intermediate prefix of a composite unique key. + Estimate assuming uniform distribution: + cardinality(prefix_k) ≈ total^(k/N) + rec_per_key[j] = total^((N - j - 1) / N) + E.g. for PK(a,b,c) with 300K rows: + rec_per_key[0] ≈ 4481 (per distinct a) + rec_per_key[1] ≈ 67 (per distinct a,b) + rec_per_key[2] = 1 (unique) */ + uint N = key->user_defined_key_parts; + double rpk = pow((double)stats.records, (double)(N - j - 1) / (double)N); + key->rec_per_key[j] = (ulong)MY_MAX((ulong)rpk, REC_PER_KEY_FLOOR); + } + } + else if (j + 1 == key->user_defined_key_parts) + { + /* Last user key part of a non-unique index. + We use ANALYZE-sampled value if available, else heuristic. */ + if (cached_rpk > 0) + key->rec_per_key[j] = cached_rpk; + else + key->rec_per_key[j] = + (ulong)MY_MAX(stats.records / STATS_REC_PER_KEY_FALLBACK_DIVISOR + 1, + REC_PER_KEY_FLOOR); + } + else + { + /* Intermediate prefix of a non-unique index. + Geometrically interpolate between stats.records + (single leading column) and the last-part rec_per_key. 
+ Formula is total / (total/last_rpk)^((j+1)/N) */ + ulong last_rpk = + (cached_rpk > 0) + ? cached_rpk + : (ulong)(stats.records / STATS_REC_PER_KEY_FALLBACK_DIVISOR + 1); + uint N = key->user_defined_key_parts; + double base = (last_rpk > 0) ? (double)stats.records / (double)last_rpk + : (double)stats.records; + double rpk = (double)stats.records / pow(base, (double)(j + 1) / (double)N); + key->rec_per_key[j] = + (ulong)MY_MAX(MY_MIN((ulong)rpk, stats.records), REC_PER_KEY_FLOOR); + } + } + } + } + + DBUG_RETURN(0); +} + +/* ******************** analyze ******************** */ + +/* + ANALYZE TABLE -- refresh cached stats and output CF statistics as notes. + The notes appear as additional Msg_type='note' rows in the ANALYZE TABLE + result set, giving the user visibility into TidesDB internals. +*/ +int ha_tidesdb::analyze(THD *thd, HA_CHECK_OPT *check_opt) +{ + DBUG_ENTER("ha_tidesdb::analyze"); + + if (!share || !share->cf) DBUG_RETURN(HA_ADMIN_FAILED); + + share->stats_refresh_us.store(0, std::memory_order_relaxed); + info(HA_STATUS_VARIABLE | HA_STATUS_CONST); + + tidesdb_stats_t *st = NULL; + if (tidesdb_get_stats(share->cf, &st) != TDB_SUCCESS || !st) + { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, ER_UNKNOWN_ERROR, + "[TIDESDB] unable to retrieve column family stats"); + DBUG_RETURN(HA_ADMIN_OK); + } + + /* Summary line */ + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, ER_UNKNOWN_ERROR, + "[TIDESDB] CF '%s' total_keys=%llu data_size=%llu bytes" + " memtable=%zu bytes levels=%d read_amp=%.2f" + " cache_hit=%.1f%%", + share->cf_name.c_str(), (unsigned long long)st->total_keys, + (unsigned long long)st->total_data_size, st->memtable_size, st->num_levels, + st->read_amp, st->hit_rate * PERCENT_SCALE); + + /* Average sizes */ + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, ER_UNKNOWN_ERROR, + "[TIDESDB] avg_key=%.1f bytes avg_value=%.1f bytes", st->avg_key_size, + st->avg_value_size); + + /* Per-level detail */ + for (int i 
= 0; i < st->num_levels; i++) + { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, ER_UNKNOWN_ERROR, + "[TIDESDB] level %d sstables=%d size=%zu bytes" + " keys=%llu", + i + 1, st->level_num_sstables[i], st->level_sizes[i], + (unsigned long long)st->level_key_counts[i]); + } + + /* B+tree stats (only when use_btree=1) */ + if (st->use_btree) + { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, ER_UNKNOWN_ERROR, + "[TIDESDB] btree nodes=%llu max_height=%u" + " avg_height=%.2f", + (unsigned long long)st->btree_total_nodes, st->btree_max_height, + st->btree_avg_height); + } + + tidesdb_free_stats(st); + + /* Secondary index CF stats + cardinality sampling. + We iterate each secondary index CF, counting distinct index-column + prefixes (everything before the PK suffix) to compute rec_per_key. */ + { + int erc = ensure_stmt_txn(); + if (erc) + { + DBUG_RETURN(HA_ADMIN_OK); /* non-fatal -- stats just won't be updated */ + } + } + for (uint i = 0; i < table->s->keys; i++) + { + if (share->has_user_pk && i == share->pk_index) continue; + if (i >= share->idx_cfs.size() || !share->idx_cfs[i]) continue; + KEY *ki = &table->key_info[i]; + + tidesdb_stats_t *ist = NULL; + uint64_t idx_total_keys = 0; + if (tidesdb_get_stats(share->idx_cfs[i], &ist) == TDB_SUCCESS && ist) + { + idx_total_keys = ist->total_keys; + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, ER_UNKNOWN_ERROR, + "[TIDESDB] idx CF '%s' keys=%llu data_size=%llu bytes" + " levels=%d", + share->idx_cf_names[i].c_str(), (unsigned long long)ist->total_keys, + (unsigned long long)ist->total_data_size, ist->num_levels); + tidesdb_free_stats(ist); + } + + /* We sample the index to estimate distinct prefix count. + For unique indexes rec_per_key is always 1. + For non-unique indexes, scan up to ANALYZE_SAMPLE_LIMIT entries + and count distinct index-column prefixes. 
*/ + if (ki->flags & HA_NOSAME) + { + share->cached_rec_per_key[i].store(REC_PER_KEY_UNIQUE, std::memory_order_relaxed); + continue; + } + + uint idx_prefix_len = share->idx_comp_key_len[i]; + if (idx_prefix_len == 0) continue; + + tidesdb_iter_t *ait = NULL; + if (tdb_iter_new_blocking(ha_thd(), stmt_txn, share->idx_cfs[i], &ait) != TDB_SUCCESS || + !ait) + continue; + + tidesdb_iter_seek_to_first(ait); + + static constexpr uint64_t ANALYZE_SAMPLE_LIMIT = 100000; + uint64_t sampled = 0, distinct = 0; + uchar prev_prefix[MAX_KEY_LENGTH]; + uint prev_len = 0; + + while (tidesdb_iter_valid(ait) && sampled < ANALYZE_SAMPLE_LIMIT) + { + uint8_t *ik = NULL; + size_t iks = 0; + if (tidesdb_iter_key(ait, &ik, &iks) != TDB_SUCCESS) break; + + uint cmp_len = (iks >= idx_prefix_len) ? idx_prefix_len : (uint)iks; + if (sampled == 0 || cmp_len != prev_len || memcmp(ik, prev_prefix, cmp_len) != 0) + { + distinct++; + prev_len = cmp_len; + memcpy(prev_prefix, ik, cmp_len); + } + sampled++; + tidesdb_iter_next(ait); + } + tidesdb_iter_free(ait); + + if (distinct > 0) + { + uint64_t total = (idx_total_keys > 0) ? 
idx_total_keys : sampled; + if (sampled < total) + { + /* Extrapolate -- distinct_full ≈ distinct * (total / sampled) */ + double ratio = (double)total / (double)sampled; + uint64_t est_distinct = (uint64_t)(distinct * ratio); + if (est_distinct == 0) est_distinct = 1; /* divide-by-zero guard */ + ulong rpk = (ulong)(total / est_distinct); + if (rpk == 0) rpk = REC_PER_KEY_FLOOR; + share->cached_rec_per_key[i].store(rpk, std::memory_order_relaxed); + } + else + { + ulong rpk = (ulong)(sampled / distinct); + if (rpk == 0) rpk = REC_PER_KEY_FLOOR; + share->cached_rec_per_key[i].store(rpk, std::memory_order_relaxed); + } + + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, ER_UNKNOWN_ERROR, + "[TIDESDB] idx '%s' sampled=%llu distinct=%llu rec_per_key=%lu", + ki->name.str, (unsigned long long)sampled, + (unsigned long long)distinct, + share->cached_rec_per_key[i].load(std::memory_order_relaxed)); + } + } + + info(HA_STATUS_CONST); + + DBUG_RETURN(HA_ADMIN_OK); +} + +/* ******************** optimize ******************** */ + +/* + OPTIMIZE TABLE -- trigger compaction on all CFs (data + secondary indexes). + Compaction merges SSTables, removes tombstones, and reduces read + amplification. TidesDB enqueues the work to background compaction + threads and returns immediately. +*/ +int ha_tidesdb::optimize(THD *thd, HA_CHECK_OPT *check_opt) +{ + DBUG_ENTER("ha_tidesdb::optimize"); + + if (!share || !share->cf) DBUG_RETURN(HA_ADMIN_FAILED); + + /* tidesdb_purge_cf() is synchronous -- flushes memtable to disk, then + runs a full compaction inline, blocking until complete. This is + the right semantic for OPTIMIZE TABLE -- the caller expects the + table to be fully compacted when the statement returns. 
*/ + bool any_locked = false; + int rc = tidesdb_purge_cf(share->cf); + if (rc == TDB_ERR_LOCKED) + any_locked = true; + else if (rc != TDB_SUCCESS) + sql_print_warning("[TIDESDB] optimize: purge data CF '%s' failed (err=%d)", + share->cf_name.c_str(), rc); + + for (uint i = 0; i < share->idx_cfs.size(); i++) + { + if (!share->idx_cfs[i]) continue; + rc = tidesdb_purge_cf(share->idx_cfs[i]); + if (rc == TDB_ERR_LOCKED) + any_locked = true; + else if (rc != TDB_SUCCESS) + sql_print_warning("[TIDESDB] optimize: purge idx CF '%s' failed (err=%d)", + share->idx_cf_names[i].c_str(), rc); + } + + share->stats_refresh_us.store(0, std::memory_order_relaxed); + + /* TDB_ERR_LOCKED means "another compaction was already running and + will subsume this work". Surface HA_ADMIN_TRY_ALTER so the user + sees something other than silent success -- they can retry, or + confirm via SHOW ENGINE TIDESDB STATUS that compaction finished. */ + if (any_locked) + { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_NOTE, HA_ADMIN_TRY_ALTER, + "OPTIMIZE TABLE: one or more column families had a " + "compaction already in flight; retry shortly if you " + "need the post-OPTIMIZE state."); + DBUG_RETURN(HA_ADMIN_TRY_ALTER); + } + DBUG_RETURN(HA_ADMIN_OK); +} + +int ha_tidesdb::check(THD *thd, HA_CHECK_OPT *check_opt) +{ + DBUG_ENTER("ha_tidesdb::check"); + + if (!share || !share->cf) DBUG_RETURN(HA_ADMIN_CORRUPT); + + /* CHECK TABLE verifies all CFs are readable by fetching stats. + tidesdb_get_stats reads metadata from all SSTables, which validates + that manifests, block indexes, bloom filters, and metadata blocks + are intact. For a deeper check, users can run REPAIR TABLE which + does a full compaction pass that reads and re-checksums every block. 
*/ + tidesdb_stats_t *st = NULL; + int rc = tidesdb_get_stats(share->cf, &st); + if (rc != TDB_SUCCESS) + { + sql_print_error("[TIDESDB] CHECK TABLE '%s': data CF check failed (err=%d)", + share->cf_name.c_str(), rc); + DBUG_RETURN(HA_ADMIN_CORRUPT); + } + tidesdb_free_stats(st); + + for (uint i = 0; i < share->idx_cfs.size(); i++) + { + if (!share->idx_cfs[i]) continue; + tidesdb_stats_t *ist = NULL; + rc = tidesdb_get_stats(share->idx_cfs[i], &ist); + if (rc != TDB_SUCCESS) + { + sql_print_error("[TIDESDB] CHECK TABLE '%s': index CF '%s' check failed (err=%d)", + share->cf_name.c_str(), share->idx_cf_names[i].c_str(), rc); + DBUG_RETURN(HA_ADMIN_CORRUPT); + } + tidesdb_free_stats(ist); + } + + DBUG_RETURN(HA_ADMIN_OK); +} + +int ha_tidesdb::repair(THD *thd, HA_CHECK_OPT *check_opt) +{ + DBUG_ENTER("ha_tidesdb::repair"); + + if (!share || !share->cf) DBUG_RETURN(HA_ADMIN_FAILED); + + /* REPAIR TABLE triggers a full purge (flush + compaction) of all CFs. + In unified memtable mode, the first purge_cf call rotates the shared + unified memtable and waits for the flush to complete. Subsequent + purge_cf calls on index CFs skip the rotation (already done) and + just run per-CF compaction. tidesdb_purge_cf is unified-mode aware + and handles this idempotently. 
*/ + int rc = tidesdb_purge_cf(share->cf); + if (rc != TDB_SUCCESS) + { + sql_print_error("[TIDESDB] REPAIR TABLE '%s': purge data CF failed (err=%d)", + share->cf_name.c_str(), rc); + DBUG_RETURN(HA_ADMIN_FAILED); + } + + for (uint i = 0; i < share->idx_cfs.size(); i++) + { + if (!share->idx_cfs[i]) continue; + rc = tidesdb_purge_cf(share->idx_cfs[i]); + if (rc != TDB_SUCCESS) + sql_print_warning("[TIDESDB] REPAIR TABLE '%s': purge idx CF '%s' failed (err=%d)", + share->cf_name.c_str(), share->idx_cf_names[i].c_str(), rc); + } + + share->stats_refresh_us.store(0, std::memory_order_relaxed); + DBUG_RETURN(HA_ADMIN_OK); +} + +IO_AND_CPU_COST ha_tidesdb::scan_time() +{ + IO_AND_CPU_COST cost; + cost.io = 0.0; + cost.cpu = 0.0; + + if (!share || !share->cf) return cost; + + /* Cache the range_cost result on the share with the same refresh + interval as stats (TIDESDB_STATS_REFRESH_US = 2 seconds). + tidesdb_range_cost examines in-memory metadata (block indexes, + SSTable min/max keys) without disk I/O, but the computation + still has measurable CPU cost when called per query plan. 
*/ + auto now = std::chrono::steady_clock::now(); + auto cached_time = share->scan_cost_time.load(std::memory_order_relaxed); + double cached_cost = share->cached_scan_cost.load(std::memory_order_relaxed); + + bool stale = + (cached_cost <= 0.0) || + (std::chrono::duration_cast(now.time_since_epoch()).count() - + cached_time > + TIDESDB_STATS_REFRESH_US); + + if (stale) + { + uchar lo[KEY_NAMESPACE_LEN] = {KEY_NS_DATA}; + uchar hi[DATA_KEY_BUF_LEN]; + memset(hi, KEY_INF_HI_BYTE, sizeof(hi)); + uint hi_len = KEY_NAMESPACE_LEN + share->pk_key_len; + if (hi_len > sizeof(hi)) hi_len = sizeof(hi); + + double full_cost = 0.0; + if (tidesdb_range_cost(share->cf, lo, KEY_NAMESPACE_LEN, hi, hi_len, &full_cost) == + TDB_SUCCESS && + full_cost > 0.0) + { + cached_cost = full_cost; + share->cached_scan_cost.store(cached_cost, std::memory_order_relaxed); + share->scan_cost_time.store( + std::chrono::duration_cast(now.time_since_epoch()) + .count(), + std::memory_order_relaxed); + } + } + + if (cached_cost > 0.0) + { + cost.io = cached_cost * TIDESDB_SCAN_IO_WEIGHT; + cost.cpu = cached_cost * TIDESDB_SCAN_CPU_WEIGHT; + } + else + { + cost = handler::scan_time(); + } + + return cost; +} + +ha_rows ha_tidesdb::records_in_range(uint inx, const key_range *min_key, const key_range *max_key, + page_range *pages) +{ + if (!share) return TIDESDB_RIR_DEFAULT_EST; + + ha_rows total = share->cached_records.load(std::memory_order_relaxed); + if (total == 0) total = TIDESDB_MIN_STATS_RECORDS; + + tidesdb_column_family_t *cf; + bool is_pk = share->has_user_pk && inx == share->pk_index; + if (is_pk) + cf = share->cf; + else if (inx < share->idx_cfs.size() && share->idx_cfs[inx]) + cf = share->idx_cfs[inx]; + else + return (total / TIDESDB_RIR_UNKNOWN_DENOM) + REC_PER_KEY_FLOOR; /* no CF for this index */ + + /* We convert min_key / max_key to our comparable format. + If a bound is missing we use the natural boundary of the key space. 
*/ + uchar lo_buf[DATA_KEY_BUF_LEN]; + uchar hi_buf[DATA_KEY_BUF_LEN]; + uint lo_len = 0, hi_len = 0; + + MY_BITMAP *old_map = tmp_use_all_columns(table, &table->read_set); + + if (min_key && min_key->key) + { + KEY *ki = &table->key_info[inx]; + uint kl = calculate_key_len(table, inx, min_key->key, min_key->keypart_map); + if (is_pk) + { + uchar comp[MAX_KEY_LENGTH]; + uint comp_len = key_copy_to_comparable(ki, min_key->key, kl, comp); + lo_len = build_data_key(comp, comp_len, lo_buf); + } + else + { + lo_len = key_copy_to_comparable(ki, min_key->key, kl, lo_buf); + } + } + else + { + /* No lower bound, we use smallest possible key */ + if (is_pk) + { + lo_buf[0] = KEY_NS_DATA; + lo_len = KEY_NAMESPACE_LEN; + } + else + { + lo_buf[0] = KEY_INF_LO_BYTE; + lo_len = KEY_NAMESPACE_LEN; + } + } + + if (max_key && max_key->key) + { + KEY *ki = &table->key_info[inx]; + uint kl = calculate_key_len(table, inx, max_key->key, max_key->keypart_map); + if (is_pk) + { + uchar comp[MAX_KEY_LENGTH]; + uint comp_len = key_copy_to_comparable(ki, max_key->key, kl, comp); + hi_len = build_data_key(comp, comp_len, hi_buf); + } + else + { + hi_len = key_copy_to_comparable(ki, max_key->key, kl, hi_buf); + } + } + else + { + /* No upper bound, we use largest possible key */ + memset(hi_buf, KEY_INF_HI_BYTE, sizeof(hi_buf)); + hi_len = is_pk ? (KEY_NAMESPACE_LEN + share->pk_key_len) + : share->idx_comp_key_len[inx] + share->pk_key_len; + if (hi_len > sizeof(hi_buf)) hi_len = sizeof(hi_buf); + } + + tmp_restore_column_map(&table->read_set, old_map); + + /* We detect point equality, both bounds provided with identical comparable + bytes. tidesdb_range_cost is an I/O cost metric, not a cardinality + metric -- for memtable-only data it cannot distinguish a point range + from a full scan. For equalities we return rec_per_key directly. 
*/ + if (min_key && max_key && lo_len > 0 && hi_len > 0 && lo_len == hi_len && + memcmp(lo_buf, hi_buf, lo_len) == 0) + { + KEY *ki = &table->key_info[inx]; + uint parts_used = my_count_bits(min_key->keypart_map); + if (parts_used > 0 && parts_used <= ki->user_defined_key_parts) + { + ulong rpk = ki->rec_per_key[parts_used - 1]; + ha_rows est = (rpk > 0) ? (ha_rows)rpk : REC_PER_KEY_FLOOR; + if (est > total) est = total; + return est; + } + return REC_PER_KEY_FLOOR; + } + + /* We ask TidesDB for the range cost (no disk I/O -- uses in-memory + block indexes, SSTable min/max keys, and entry counts). */ + double range_cost = 0.0; + int rc = tidesdb_range_cost(cf, lo_buf, lo_len, hi_buf, hi_len, &range_cost); + if (rc != TDB_SUCCESS || range_cost <= 0.0) + return (total / TIDESDB_RIR_UNKNOWN_DENOM) + REC_PER_KEY_FLOOR; /* fallback */ + + /* We get full-range cost for normalization. We use the natural boundaries + of the key space so that range_cost / full_cost ≈ fraction of data. + Cached per-CF and refreshed on the same TIDESDB_STATS_REFRESH_US + window as scan_time so a plan probing N alternatives only computes + the normalizer once. */ + double full_cost = 0.0; + { + std::atomic *cache_val = + is_pk ? &share->cached_pk_full_cost + : (inx < share->cached_idx_full_cost_n ? &share->cached_idx_full_cost[inx] + : nullptr); + std::atomic *cache_time = + is_pk ? &share->cached_pk_full_cost_time + : (inx < share->cached_idx_full_cost_n ? &share->cached_idx_full_cost_time[inx] + : nullptr); + + long long now_us = std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); + + if (cache_val && cache_time) + { + double cached = cache_val->load(std::memory_order_relaxed); + long long when = cache_time->load(std::memory_order_relaxed); + if (cached > 0.0 && now_us - when < TIDESDB_STATS_REFRESH_US) + { + full_cost = cached; + } + } + + if (full_cost <= 0.0) + { + uchar full_lo[KEY_NAMESPACE_LEN] = {(uchar)(is_pk ? 
KEY_NS_DATA : KEY_INF_LO_BYTE)}; + uchar full_hi[DATA_KEY_BUF_LEN]; + memset(full_hi, KEY_INF_HI_BYTE, sizeof(full_hi)); + uint full_hi_len = hi_len; /* same width as hi_buf */ + tidesdb_range_cost(cf, full_lo, KEY_NAMESPACE_LEN, full_hi, full_hi_len, &full_cost); + + if (cache_val && cache_time && full_cost > 0.0) + { + cache_val->store(full_cost, std::memory_order_relaxed); + cache_time->store(now_us, std::memory_order_relaxed); + } + } + } + + if (full_cost <= 0.0) + return (total / TIDESDB_RIR_UNKNOWN_DENOM) + REC_PER_KEY_FLOOR; /* fallback */ + + /* We estimate records proportionally -- narrower range -> fewer records */ + double fraction = range_cost / full_cost; + if (fraction > FRACTION_MAX) fraction = FRACTION_MAX; + if (fraction < FRACTION_MIN) fraction = FRACTION_MIN; + + ha_rows est = (ha_rows)(total * fraction); + if (est == 0) est = REC_PER_KEY_FLOOR; /* never return 0 -- optimizer treats it as "empty" */ + + /* When both bounds are provided but the estimated fraction is very + high (>TIDESDB_RIR_FRACTION_UNRELIABLE), tidesdb_range_cost is + likely unreliable -- this happens with memtable-only data where + the cost function cannot distinguish a narrow range from a full + scan. Fall back to a rec_per_key-based estimate for the prefix. */ + if (min_key && max_key && fraction > TIDESDB_RIR_FRACTION_UNRELIABLE) + { + KEY *ki = &table->key_info[inx]; + uint parts = my_count_bits(min_key->keypart_map); + if (parts > 0 && parts <= ki->user_defined_key_parts) + { + ulong rpk = ki->rec_per_key[parts - 1]; + if (rpk > 0) + { + ha_rows capped; + if (lo_len == hi_len && memcmp(lo_buf, hi_buf, lo_len) == 0) + { + /* Point equality, we use rec_per_key directly */ + capped = (ha_rows)rpk; + } + else + { + /* With range scans we multiply rec_per_key by a conservative + range-width factor. Typical OLTP ranges span tens of + key values; the multiplier keeps the estimate tight while + still being vastly better than the unreliable full ratio. 
*/ + capped = (ha_rows)rpk * TIDESDB_RIR_RANGE_RPK_MULTIPLIER; + const ha_rows cap = total / TIDESDB_RIR_RANGE_CAP_DENOM; + if (capped > cap) capped = cap; + } + if (capped < est) est = MY_MAX(capped, REC_PER_KEY_FLOOR); + } + } + } + + return est; +} + +ulong ha_tidesdb::index_flags(uint idx, uint part, bool all_parts) const +{ + /* FULLTEXT indexes do not support ordered reads or ICP */ + if (table_share && idx < table_share->keys && + table_share->key_info[idx].algorithm == HA_KEY_ALG_FULLTEXT) + return 0; + + /* SPATIAL indexes support MBR range scans and forward iteration */ + if (table_share && idx < table_share->keys && is_spatial_index(&table_share->key_info[idx])) + return HA_READ_NEXT | HA_READ_RANGE; + + ulong flags = + HA_READ_NEXT | HA_READ_PREV | HA_READ_ORDER | HA_READ_RANGE | HA_DO_INDEX_COND_PUSHDOWN; + if (table_share && table_share->primary_key != MAX_KEY && idx == table_share->primary_key) + flags |= HA_CLUSTERED_INDEX; + else + flags |= HA_KEYREAD_ONLY; + return flags; +} + +const char *ha_tidesdb::index_type(uint key_number) +{ + if (key_number < table->s->keys) + { + if (table->key_info[key_number].algorithm == HA_KEY_ALG_FULLTEXT) return "FULLTEXT"; + if (is_spatial_index(&table->key_info[key_number])) return "RTREE"; + ha_index_option_struct *iopts = table->key_info[key_number].option_struct; + if (iopts && iopts->use_btree) return "BTREE"; + } + ha_table_option_struct *opts = TDB_TABLE_OPTIONS(table); + return (opts && opts->use_btree) ? 
"BTREE" : "LSM"; +} + +/* ******************** Spatial scan continuation ******************** */ + +int ha_tidesdb::spatial_scan_next(uchar *buf) +{ + DBUG_ENTER("ha_tidesdb::spatial_scan_next"); + + tdb_mbr_t query_mbr; + query_mbr.xmin = spatial_qmbr_[MBR_XMIN_IDX]; + query_mbr.ymin = spatial_qmbr_[MBR_YMIN_IDX]; + query_mbr.xmax = spatial_qmbr_[MBR_XMAX_IDX]; + query_mbr.ymax = spatial_qmbr_[MBR_YMAX_IDX]; + + while (spatial_range_idx_ < spatial_ranges_.size()) + { + uint64_t cur_hi = spatial_ranges_[spatial_range_idx_].second; + + while (tidesdb_iter_valid(scan_iter)) + { + if (cached_thd_ && thd_killed(cached_thd_)) DBUG_RETURN(HA_ERR_ABORTED_BY_USER); + + uint8_t *ik = NULL; + size_t iks = 0; + if (tidesdb_iter_key(scan_iter, &ik, &iks) != TDB_SUCCESS) break; + + if (iks <= SPATIAL_HILBERT_KEY_LEN) + { + tidesdb_iter_next(scan_iter); + continue; + } + + uint64_t h = decode_hilbert_be(ik); + if (h > cur_hi) break; /* advance to next range */ + + uint8_t *val = NULL; + size_t vlen = 0; + if (tidesdb_iter_value(scan_iter, &val, &vlen) != TDB_SUCCESS || + vlen < SPATIAL_MBR_VALUE_LEN) + { + tidesdb_iter_next(scan_iter); + continue; + } + + /* The on-disk spatial value is exactly SPATIAL_MBR_VALUE_LEN bytes + laid out as [xmin,ymin,xmax,ymax] (4 doubles in native order), + matching tdb_mbr_t's field order. We assert the struct size + against the wire size so adding a field to tdb_mbr_t will + fire the static_assert rather than silently corrupt reads. 
*/ + static_assert(sizeof(tdb_mbr_t) == SPATIAL_MBR_VALUE_LEN, + "tdb_mbr_t must match on-disk spatial value layout"); + tdb_mbr_t entry_mbr; + memcpy(&entry_mbr, val, SPATIAL_MBR_VALUE_LEN); + + /* We apply MBR predicate */ + if (!spatial_mbr_predicate(spatial_mode_, &query_mbr, &entry_mbr)) + { + tidesdb_iter_next(scan_iter); + continue; + } + + /* A match, we extract PK from key suffix and fetch full row */ + const uchar *pk = ik + SPATIAL_HILBERT_KEY_LEN; + uint pk_len = (uint)(iks - SPATIAL_HILBERT_KEY_LEN); + + int ret = fetch_row_by_pk(scan_txn, pk, pk_len, buf); + if (ret == HA_ERR_KEY_NOT_FOUND) + { + tidesdb_iter_next(scan_iter); + continue; + } + if (ret) + { + table->status = STATUS_NOT_FOUND; + DBUG_RETURN(ret); + } + + scan_dir_ = DIR_FORWARD; + table->status = 0; + DBUG_RETURN(0); + } + + /* The current range exhausted, thus we advance to next range and seek */ + spatial_range_idx_++; + if (spatial_range_idx_ < spatial_ranges_.size()) + { + uchar seek_key[SPATIAL_HILBERT_KEY_LEN]; + encode_hilbert_be(spatial_ranges_[spatial_range_idx_].first, seek_key); + tidesdb_iter_seek(scan_iter, seek_key, SPATIAL_HILBERT_KEY_LEN); + } + } + + table->status = STATUS_NOT_FOUND; + DBUG_RETURN(HA_ERR_END_OF_FILE); +} + +/* ******************** Full-Text Search methods ******************** */ + +int ha_tidesdb::ft_init() +{ + DBUG_ENTER("ha_tidesdb::ft_init"); + if (ft_handler) + { + tdb_ft_info_t *info = reinterpret_cast(ft_handler); + info->current_idx = 0; + } + DBUG_RETURN(0); +} + +void ha_tidesdb::ft_end() +{ + DBUG_ENTER("ha_tidesdb::ft_end"); + DBUG_VOID_RETURN; +} + +FT_INFO *ha_tidesdb::ft_init_ext(uint flags, uint inx, String *key) +{ + DBUG_ENTER("ha_tidesdb::ft_init_ext"); + + if (!share || inx >= share->idx_cfs.size() || !share->idx_cfs[inx] || + !is_fts_index(&table->key_info[inx])) + DBUG_RETURN(NULL); + + { + int erc = ensure_stmt_txn(); + if (erc) DBUG_RETURN(NULL); + } + + CHARSET_INFO *cs = table->key_info[inx].key_part[0].field->charset(); + + 
std::vector query_terms; + if (flags & FT_BOOL) + { + fts_parse_boolean(key->ptr(), key->length(), cs, query_terms); + } + else + { + std::vector tokens; + fts_tokenize(key->ptr(), key->length(), cs, tokens); + for (auto &tok : tokens) + { + fts_query_term_t qt; + qt.term = std::move(tok.word); + qt.yesno = FTS_TERM_NEUTRAL; + qt.trunc = false; + qt.is_phrase = false; + query_terms.push_back(std::move(qt)); + } + } + + /* A query that tokenises down to nothing (e.g. all stop words, all + characters below the min-word-len threshold) still has to return a + usable FT_INFO, not NULL. The server's execution path assumes an + FT_INFO was handed back and leaves the diagnostics area in an + inconsistent state when ft_init_ext yields NULL for reasons other + than an outright error; debug builds trip Protocol::end_statement's + DBUG_ASSERT(0). We return an empty result set instead, which the + optimizer folds into zero matched rows. */ + if (query_terms.empty()) + { + tdb_ft_info_t *empty = new tdb_ft_info_t(); + empty->please = const_cast<_ft_vft *>(&tdb_ft_vft); + empty->could_you = &tdb_ft_vft_ext; + empty->handler = this; + empty->keynr = inx; + empty->current_idx = 0; + empty->current_rank = 0.0f; + empty->match_count = 0; + DBUG_RETURN(reinterpret_cast(empty)); + } + + int64_t total_docs = 0, total_words = 0; + fts_load_meta(stmt_txn, share->cf, inx, &total_docs, &total_words); + double avgdl = total_docs > 0 ? (double)total_words / (double)total_docs : BM25_DEFAULT_AVGDL; + if (total_docs == 0) total_docs = BM25_MIN_TOTAL_DOCS; /* avoid division by zero */ + /* We precompute 1/avgdl so the per-posting BM25 loop multiplies instead of + dividing. Divisions are expensive on modern CPUs (~20 cycles vs ~5 + for a multiply) and this runs per term per matched document. 
*/ + const double inv_avgdl = 1.0 / avgdl; + + /* For each query term, prefix-scan the FTS CF to gather postings and score */ + std::unordered_map doc_scores; + std::unordered_map doc_required_hits; + uint num_required = 0; + + /* Open one iterator over the FTS CF and reuse it across every query + term -- iterator construction does an O(num_sstables) merge-heap + build, so doing it per term made the heap work scale linearly with + term count. All terms in this MATCH AGAINST live in the same + index, so a single iterator is reseeked for each term. */ + tidesdb_iter_t *shared_it = NULL; + { + int sirc = tdb_iter_new_blocking(ha_thd(), stmt_txn, share->idx_cfs[inx], &shared_it); + if (sirc != TDB_SUCCESS) shared_it = NULL; + } + + for (auto &qt : query_terms) + { + if (qt.yesno > FTS_TERM_NEUTRAL) num_required++; + + /* fts_build_key truncates inserted terms to FTS_MAX_TERM_BYTES, so on-disk + keys never carry more than that. The query term lives in a std::string + that has no such cap (a 512-character CJK token packs ~1.5 KB of UTF-8), + and copying it raw into the 514-byte stack prefix would overrun. */ + size_t qlen = qt.term.size(); + if (qlen > FTS_MAX_TERM_BYTES) qlen = FTS_MAX_TERM_BYTES; + + uchar prefix[FTS_TERM_LEN_PREFIX + FTS_MAX_TERM_BYTES]; + uint prefix_len = 0; + int2store(prefix, (uint16)qlen); + prefix_len += FTS_TERM_LEN_PREFIX; + memcpy(prefix + prefix_len, qt.term.data(), qlen); + prefix_len += (uint)qlen; + + struct posting_entry + { + std::string pk; + uint16 tf; + uint32 doc_len; + }; + std::vector postings; + + if (!shared_it) continue; + tidesdb_iter_t *it = shared_it; + + if (qt.trunc) + { + /* Wildcard search keys are sorted by [2B term_len][term][pk], + so terms of different lengths are in different regions. + We iterate over each possible term length from the prefix + length up to max_word_len, seeking directly to [len][prefix] + for each bucket. This is O(max_word_len) seeks, each precise. 
*/ + uint min_len = (uint)qlen; + uint max_len = (uint)srv_fts_max_word_len; + if (max_len > FTS_MAX_TERM_BYTES) max_len = FTS_MAX_TERM_BYTES; + + for (uint tlen = min_len; tlen <= max_len; tlen++) + { + uchar seek[FTS_TERM_LEN_PREFIX + FTS_MAX_TERM_BYTES]; + int2store(seek, (uint16)tlen); + memcpy(seek + FTS_TERM_LEN_PREFIX, qt.term.data(), qlen); + uint seek_len = FTS_TERM_LEN_PREFIX + (uint)qlen; + + tidesdb_iter_seek(it, seek, seek_len); + while (tidesdb_iter_valid(it)) + { + uint8_t *ik = NULL; + size_t iks = 0; + uint8_t *iv = NULL; + size_t ivs = 0; + if (tidesdb_iter_key_value(it, &ik, &iks, &iv, &ivs) != TDB_SUCCESS) break; + + if (iks < FTS_TERM_LEN_PREFIX) break; + uint16 stored_len = uint2korr(ik); + if (stored_len != tlen) break; + + if (iks < (size_t)(FTS_TERM_LEN_PREFIX + stored_len)) break; + if (memcmp(ik + FTS_TERM_LEN_PREFIX, qt.term.data(), qlen) != 0) break; + + uint pk_off = FTS_TERM_LEN_PREFIX + stored_len; + if (iks <= pk_off) + { + tidesdb_iter_next(it); + continue; + } + std::string pk((char *)(ik + pk_off), iks - pk_off); + + if (ivs >= FTS_VALUE_LEN) + postings.push_back({pk, (uint16)uint2korr(iv), + (uint32)uint4korr(iv + FTS_VALUE_DOC_LEN_OFFSET)}); + + tidesdb_iter_next(it); + } + } + } + else + { + tidesdb_iter_seek(it, prefix, prefix_len); + /* exact-match path (non-truncated) */ + while (tidesdb_iter_valid(it)) + { + uint8_t *ik = NULL; + size_t iks = 0; + uint8_t *iv = NULL; + size_t ivs = 0; + if (tidesdb_iter_key_value(it, &ik, &iks, &iv, &ivs) != TDB_SUCCESS) break; + + if (iks < prefix_len || memcmp(ik, prefix, prefix_len) != 0) break; + std::string pk((char *)(ik + prefix_len), iks - prefix_len); + + if (ivs >= FTS_VALUE_LEN) + postings.push_back({pk, (uint16)uint2korr(iv), + (uint32)uint4korr(iv + FTS_VALUE_DOC_LEN_OFFSET)}); + tidesdb_iter_next(it); + } + } + + uint32 df = (uint32)postings.size(); + double idf = std::log(((double)total_docs - (double)df + BM25_IDF_EPSILON) / + ((double)df + BM25_IDF_EPSILON) + + 
BM25_IDF_NONNEG_SHIFT); + const double k1 = srv_fts_bm25_k1, b = srv_fts_bm25_b; + + /* Pre-fold per-term constants so the inner loop is one MAD + one + divide + one multiply, instead of recomputing k1*(1-b) and + k1*b*inv_avgdl on every posting. */ + const double idf_x_k1_plus_1 = idf * (k1 + BM25_TF_SATURATION_BOOST); + const double k1_one_minus_b = k1 * (BM25_LENGTH_NORM_BASE - b); + const double k1_b_inv_avgdl = k1 * b * inv_avgdl; + + /* Reserve approximate growth so the per-posting unordered_map + insert doesn't rehash; modest win on terms with many matches. */ + if (qt.yesno >= FTS_TERM_NEUTRAL) + { + doc_scores.reserve(doc_scores.size() + postings.size()); + if (qt.yesno > FTS_TERM_NEUTRAL) + doc_required_hits.reserve(doc_required_hits.size() + postings.size()); + } + + for (auto &p : postings) + { + double denom = (double)p.tf + k1_one_minus_b + k1_b_inv_avgdl * (double)p.doc_len; + double score = ((double)p.tf * idf_x_k1_plus_1) / denom; + + if (qt.yesno < FTS_TERM_NEUTRAL) + { + /* excluded term! 
we remove from results */ + doc_scores.erase(p.pk); + } + else + { + doc_scores[p.pk] += score; + if (qt.yesno > FTS_TERM_NEUTRAL) doc_required_hits[p.pk]++; + } + } + } + + if (shared_it) tidesdb_iter_free(shared_it); + + /* bool mode -- we filter docs that don't match all required terms */ + if (num_required > 0) + { + for (auto it = doc_scores.begin(); it != doc_scores.end();) + { + auto rh = doc_required_hits.find(it->first); + if (rh == doc_required_hits.end() || rh->second < num_required) + it = doc_scores.erase(it); + else + ++it; + } + } + + tdb_ft_info_t *info = new tdb_ft_info_t(); + info->please = const_cast<_ft_vft *>(&tdb_ft_vft); + info->could_you = &tdb_ft_vft_ext; + info->handler = this; + info->keynr = inx; + info->current_idx = 0; + info->current_rank = 0.0f; + info->match_count = 0; + + for (auto &kv : doc_scores) + { + const auto &pk_str = kv.first; + auto &score = kv.second; + tdb_fts_result_t r; + r.pk_len = (uint)pk_str.size(); + r.pk = (uchar *)my_malloc(PSI_NOT_INSTRUMENTED, r.pk_len, MYF(0)); + if (!r.pk) continue; + memcpy(r.pk, pk_str.data(), r.pk_len); + r.rank = (float)score; + info->results.push_back(r); + } + + /* For any phrase terms in the query, we fetch each + candidate row, re-tokenize the document, and check for the exact + phrase as a consecutive word sequence. We remove non-matching candidates. */ + bool has_phrases = false; + std::vector phrases; + for (auto &qt : query_terms) + { + if (qt.is_phrase) + { + has_phrases = true; + phrases.push_back(&qt); + } + } + + if (has_phrases && !info->results.empty()) + { + CHARSET_INFO *vcs = table->key_info[inx].key_part[0].field->charset(); + std::vector verified; + /* Tokenize each candidate once and check all phrases against the + single token vector, so an M-phrase query doesn't tokenize each + doc M times. 
*/ + std::vector doc_tokens; + for (auto &r : info->results) + { + int err = fetch_row_by_pk(stmt_txn, r.pk, r.pk_len, table->record[0]); + if (err) continue; + + doc_tokens.clear(); + fts_extract_and_tokenize(table, &table->key_info[inx], table->record[0], vcs, + doc_tokens); + + bool all_phrases_match = true; + for (auto *ph : phrases) + { + if (!fts_phrase_in_tokens(doc_tokens, ph->phrase_words)) + { + all_phrases_match = false; + break; + } + } + + if (all_phrases_match) + verified.push_back(r); + else + my_free(r.pk); /* free PK of non-matching result */ + } + info->results = std::move(verified); + } + + std::sort(info->results.begin(), info->results.end(), + [](const tdb_fts_result_t &a, const tdb_fts_result_t &b) { return a.rank > b.rank; }); + + info->match_count = (ulonglong)info->results.size(); + + DBUG_RETURN(reinterpret_cast(info)); +} + +int ha_tidesdb::ft_read(uchar *buf) +{ + DBUG_ENTER("ha_tidesdb::ft_read"); + + if (cached_thd_ && thd_killed(cached_thd_)) DBUG_RETURN(HA_ERR_ABORTED_BY_USER); + + tdb_ft_info_t *info = reinterpret_cast(ft_handler); + if (!info) + { + table->status = STATUS_NOT_FOUND; + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + + { + int erc = ensure_stmt_txn(); + if (erc) DBUG_RETURN(erc); + } + + while (info->current_idx < info->results.size()) + { + tdb_fts_result_t &r = info->results[info->current_idx]; + info->current_rank = r.rank; + + int err = fetch_row_by_pk(stmt_txn, r.pk, r.pk_len, buf); + if (err == HA_ERR_KEY_NOT_FOUND) + { + info->current_idx++; + continue; /* skip stale entry */ + } + if (err) + { + table->status = STATUS_NOT_FOUND; + DBUG_RETURN(err); + } + + info->current_idx++; + table->status = 0; + DBUG_RETURN(0); + } + + table->status = STATUS_NOT_FOUND; + DBUG_RETURN(HA_ERR_END_OF_FILE); +} + +int ha_tidesdb::extra(enum ha_extra_function operation) +{ + switch (operation) + { + case HA_EXTRA_KEYREAD: + keyread_only_ = true; + break; + case HA_EXTRA_NO_KEYREAD: + keyread_only_ = false; + break; + case 
HA_EXTRA_WRITE_CAN_REPLACE: + /* REPLACE INTO -- write_row may skip the dup-check and let + tidesdb_txn_put overwrite silently. Only safe when there + are no secondary indexes (otherwise old index entries must + still be cleaned up via delete+reinsert). */ + write_can_replace_ = true; + break; + case HA_EXTRA_INSERT_WITH_UPDATE: + /* INSERT ON DUPLICATE KEY UPDATE -- the server needs write_row + to return HA_ERR_FOUND_DUPP_KEY so it can switch to update_row. */ + break; + case HA_EXTRA_WRITE_CANNOT_REPLACE: + write_can_replace_ = false; + break; + case HA_EXTRA_PREPARE_FOR_DROP: + /* Table is about to be dropped -- skip fsync overhead */ + break; + default: + break; + } + return 0; +} + +/* ******************** Locking ******************** */ + +/* + Lazy txn creation. Gets the per-connection TidesDB txn (shared by + all handler objects on this connection). The txn spans the entire + BEGIN...COMMIT block, not just one statement. + + Returns 0 when stmt_txn is already set or was bound successfully, + and HA_ERR_OUT_OF_MEM when get_or_create_trx() yields no trx. +*/ +int ha_tidesdb::ensure_stmt_txn() +{ + if (stmt_txn) return 0; + + THD *thd = cached_thd_ ? cached_thd_ : ha_thd(); + + /* Isolation resolution mirrors the external_lock path: + DDL -> READ_COMMITTED (avoids unbounded + read-set growth across a long scan). + autocommit single-stmt DML -> READ_COMMITTED (no concurrent + modification within the same txn). + multi-statement txn -> session isolation so write-write + conflict detection stays active. + Prefer the per-statement cache populated by external_lock; fall + back to the live THD call only when external_lock hasn't run yet + (e.g. some DDL callbacks). 
*/ + int sql_cmd; + bool is_autocommit; + if (cached_stmt_shape_valid_) + { + sql_cmd = cached_sql_cmd_; + is_autocommit = cached_is_autocommit_; + } + else + { + sql_cmd = thd_sql_command(thd); + is_autocommit = !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN); + } + bool is_ddl = + (sql_cmd == SQLCOM_ALTER_TABLE || sql_cmd == SQLCOM_CREATE_INDEX || + sql_cmd == SQLCOM_DROP_INDEX || sql_cmd == SQLCOM_TRUNCATE || sql_cmd == SQLCOM_OPTIMIZE || + sql_cmd == SQLCOM_CREATE_TABLE || sql_cmd == SQLCOM_DROP_TABLE); + tidesdb_isolation_level_t effective_iso; + if (is_ddl || is_autocommit) + effective_iso = TDB_ISOLATION_READ_COMMITTED; + else + /* Honour session isolation regardless of pessimistic_locking. + Lock manager and library OCC compose -- the locks serialise + hot-row write contention, and OCC continues to enforce the + session's chosen isolation semantics (snapshot reads under + SNAPSHOT, read-set tracking under REPEATABLE_READ, full SSI + under SERIALIZABLE). Earlier revisions silently downgraded + to READ_COMMITTED here, which broke higher isolation levels + when pessimistic_locking was on. */ + effective_iso = resolve_effective_isolation( + thd, share ? share->isolation_level : TDB_ISOLATION_SNAPSHOT); + tidesdb_trx_t *trx = get_or_create_trx(thd, ht, effective_iso); + if (!trx) return HA_ERR_OUT_OF_MEM; + + stmt_txn = trx->txn; + return 0; +} + +int ha_tidesdb::external_lock(THD *thd, int lock_type) +{ + DBUG_ENTER("ha_tidesdb::external_lock"); + + if (lock_type != F_UNLCK) + { + /* Lock path (any lock_type other than F_UNLCK). + We resolve the per-statement THD shape once and cache it so that + ensure_stmt_txn reads the cache instead of re-calling + thd_sql_command() and thd_test_options(). The same shape picks + the isolation level below, and the function returns + HA_ERR_OUT_OF_MEM if get_or_create_trx() yields no trx. 
*/ + int sql_cmd = thd_sql_command(thd); + bool is_ddl = (sql_cmd == SQLCOM_ALTER_TABLE || sql_cmd == SQLCOM_CREATE_INDEX || + sql_cmd == SQLCOM_DROP_INDEX || sql_cmd == SQLCOM_TRUNCATE || + sql_cmd == SQLCOM_OPTIMIZE || sql_cmd == SQLCOM_CREATE_TABLE || + sql_cmd == SQLCOM_DROP_TABLE); + bool is_autocommit = !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN); + + cached_sql_cmd_ = sql_cmd; + cached_is_autocommit_ = is_autocommit; + cached_stmt_shape_valid_ = true; + stmt_is_update_or_delete_ = (sql_cmd == SQLCOM_UPDATE || sql_cmd == SQLCOM_UPDATE_MULTI || + sql_cmd == SQLCOM_DELETE || sql_cmd == SQLCOM_DELETE_MULTI); + + /* Anchor the per-statement back-pressure deadline so a multi-call + statement charges all its waits against the same budget. */ + { + ulong bp_ms = tdb_backpressure_timeout_ms(thd); + if (bp_ms > 0) + { + tdb_stmt_bp_deadline_ = + std::chrono::steady_clock::now() + std::chrono::milliseconds(bp_ms); + tdb_stmt_bp_deadline_valid_ = true; + } + } + + tidesdb_isolation_level_t effective_iso; + if (is_ddl || is_autocommit) + effective_iso = TDB_ISOLATION_READ_COMMITTED; + else + effective_iso = resolve_effective_isolation( + thd, share ? share->isolation_level : TDB_ISOLATION_SNAPSHOT); + tidesdb_trx_t *trx = get_or_create_trx(thd, ht, effective_iso); + if (!trx) DBUG_RETURN(HA_ERR_OUT_OF_MEM); + + stmt_txn = trx->txn; + stmt_txn_dirty = false; + stmt_has_write_lock_ |= (lock_type == F_WRLCK); + + /* We cache THD and trx pointers for fast access in hot paths + (index_read_map, update_row, delete_row, ensure_stmt_txn). + Eliminates ha_thd() virtual dispatch and thd_get_ha_data() + hash lookup on every row operation. */ + cached_thd_ = thd; + cached_trx_ = trx; + + trans_register_ha(thd, false, ht, 0); + + if (!is_autocommit) trans_register_ha(thd, true, ht, 0); + } + else + { + /* For multi-statement transactions (BEGIN...COMMIT), the txn stays + the same across statements. 
Preserve scan_iter and dup_iter_cache + across READ-ONLY statements so the next statement can reuse them + (avoids O(sstables) merge-heap rebuild). + After WRITE statements, iterators must be invalidated because + new txn ops (puts/deletes) are not visible to iterators created + before those ops were added. For autocommit, always free. */ + bool in_multi_stmt = + cached_stmt_shape_valid_ + ? !cached_is_autocommit_ + : (bool)thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN); + if (!in_multi_stmt || stmt_txn_dirty) + { + if (scan_iter) + { + tidesdb_iter_free(scan_iter); + scan_iter = NULL; + scan_iter_cf_ = NULL; + scan_iter_txn_ = NULL; + } + if (dup_iter_count_ > 0) free_dup_iter_cache(); + } + + /* We bump update_time once per write-statement for information_schema. + We use cached_time_ if available to avoid another time() syscall. */ + if (stmt_txn_dirty && share) + share->update_time.store(cached_time_valid_ ? cached_time_ : time(0), + std::memory_order_relaxed); + + /* We invalidate all per-statement caches so the next statement + picks up any changes (key rotation, session variable changes, + clock advance). */ + enc_key_ver_valid_ = false; + cached_time_valid_ = false; + cached_thdvars_valid_ = false; + + stmt_txn = NULL; + stmt_txn_dirty = false; + stmt_has_write_lock_ = false; + stmt_is_update_or_delete_ = false; + tdb_stmt_bp_deadline_valid_ = false; + cached_thd_ = NULL; + cached_trx_ = NULL; + + /* We invalidate statement shape cache last so the above checks still + see it. */ + cached_stmt_shape_valid_ = false; + } + + DBUG_RETURN(0); +} + +THR_LOCK_DATA **ha_tidesdb::store_lock(THD *thd, THR_LOCK_DATA **to, enum thr_lock_type lock_type) +{ + /* With lock_count()=0 MariaDB skips THR_LOCK entirely. + store_lock is still called for informational purposes but we + do not push into the 'to' array (same pattern as InnoDB). + + However, we use this callback to detect locking reads + (SELECT ... FOR UPDATE, SELECT ... 
IN SHARE MODE) and + data-modifying statements. MariaDB calls store_lock() before + external_lock(), so we can set stmt_has_write_lock_ here for + the pessimistic row lock path. + + InnoDB uses store_lock() to set m_prebuilt->select_lock_type + to LOCK_S/LOCK_X for these cases. We emulate this by detecting + the same lock_type values and setting our write-lock flag. + + We flag locking READS (SELECT ... FOR UPDATE, SELECT ... IN + SHARE MODE) so the pessimistic row lock path in index_read_map() + acquires locks for serialization. + + SELECT ... FOR UPDATE passes lock_type >= TL_FIRST_WRITE. + SELECT ... IN SHARE MODE passes TL_READ_WITH_SHARED_LOCKS. + + Inside stored procedures, thd_sql_command() returns SQLCOM_CALL + (not SQLCOM_SELECT), so we cannot filter by SQL command. + Instead we use the lock_type directly. This means UPDATE/DELETE + statements also set stmt_has_write_lock_=true, but that's OK + because we removed lock acquisition from update_row()/delete_row() + - only index_read_map() acquires pessimistic locks now, and only + for PK exact matches (the SELECT ... FOR UPDATE pattern). + + The 'to' array is returned unchanged. */ + if (lock_type == TL_READ_WITH_SHARED_LOCKS || lock_type >= TL_FIRST_WRITE) + { + stmt_has_write_lock_ = true; + } + return to; +} + +/* ******************** Online DDL ******************** */ + +/* + Classify ALTER TABLE operations into INSTANT / INPLACE / COPY. 
+ + INSTANT metadata-only changes (.frm rewrite, no engine work): + rename column/index, change default, change table options, + ADD COLUMN, DROP COLUMN (row format is self-describing via + the ROW_HEADER_MAGIC header written by serialize_row) + INPLACE add/drop secondary indexes (create/drop CFs, populate) + COPY column type changes, PK changes +*/ +enum_alter_inplace_result ha_tidesdb::check_if_supported_inplace_alter( + TABLE *altered_table, Alter_inplace_info *ha_alter_info) +{ + DBUG_ENTER("ha_tidesdb::check_if_supported_inplace_alter"); + + alter_table_operations flags = ha_alter_info->handler_flags; + + /* Operations that are pure metadata (INSTANT). + ADD/DROP COLUMN is instant because the packed row format includes + a header with the stored null_bytes and field_count, so + deserialize_row adapts to rows written with any prior schema. */ + static const alter_table_operations TIDESDB_INSTANT = + ALTER_COLUMN_NAME | ALTER_RENAME_COLUMN | ALTER_CHANGE_COLUMN_DEFAULT | + ALTER_COLUMN_DEFAULT | ALTER_COLUMN_OPTION | ALTER_CHANGE_CREATE_OPTION | + ALTER_DROP_CHECK_CONSTRAINT | ALTER_VIRTUAL_GCOL_EXPR | ALTER_RENAME | ALTER_RENAME_INDEX | + ALTER_INDEX_IGNORABILITY | ALTER_ADD_COLUMN | ALTER_DROP_COLUMN | + ALTER_STORED_COLUMN_ORDER | ALTER_VIRTUAL_COLUMN_ORDER; + + /* Operations we can do inplace (add/drop secondary indexes) */ + static const alter_table_operations TIDESDB_INPLACE_INDEX = + ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX | ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX | + ALTER_ADD_UNIQUE_INDEX | ALTER_DROP_UNIQUE_INDEX | ALTER_ADD_INDEX | ALTER_DROP_INDEX | + ALTER_INDEX_ORDER; + + /* If only instant operations, return INSTANT */ + if (!(flags & ~TIDESDB_INSTANT)) DBUG_RETURN(HA_ALTER_INPLACE_INSTANT); + + /* If only instant + index operations, return INPLACE with no lock. + TidesDB handles all concurrency via MVCC internally -- the index + population scan runs inside its own transaction and does not need + server-level MDL blocking. 
*/ + if (!(flags & ~(TIDESDB_INSTANT | TIDESDB_INPLACE_INDEX))) + { + /**** Changing PK requires full rebuild */ + if (flags & (ALTER_ADD_PK_INDEX | ALTER_DROP_PK_INDEX)) + { + ha_alter_info->unsupported_reason = "TidesDB cannot change PRIMARY KEY inplace"; + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + /* FULLTEXT and SPATIAL indexes ride ALTER_ADD_INDEX but cannot be + populated by the inplace builder -- the per-row loops in + inplace_alter_table skip them, so the CF would stay empty and + later MATCH AGAINST / MBRWithin would silently return no rows + against pre-existing data. Forcing COPY routes every row + through write_row, which knows how to maintain these CFs. */ + if (ha_alter_info->index_add_count > 0) + { + for (uint a = 0; a < ha_alter_info->index_add_count; a++) + { + uint key_num = ha_alter_info->index_add_buffer[a]; + KEY *new_key = &ha_alter_info->key_info_buffer[key_num]; + if (is_fts_index(new_key)) + { + ha_alter_info->unsupported_reason = "TidesDB cannot add FULLTEXT index inplace"; + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + if (is_spatial_index(new_key)) + { + ha_alter_info->unsupported_reason = "TidesDB cannot add SPATIAL index inplace"; + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + } + DBUG_RETURN(HA_ALTER_INPLACE_NO_LOCK); + } + + /* Everything else requires COPY */ + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); +} + +/* + Create CFs for newly added indexes. + Called with shared MDL lock (concurrent DML is allowed). +*/ +bool ha_tidesdb::prepare_inplace_alter_table(TABLE *altered_table, + Alter_inplace_info *ha_alter_info) +{ + DBUG_ENTER("ha_tidesdb::prepare_inplace_alter_table"); + + ha_tidesdb_inplace_ctx *ctx; + try + { + ctx = new ha_tidesdb_inplace_ctx(); + } + catch (...) 
+ { + DBUG_RETURN(true); + } + ha_alter_info->handler_ctx = ctx; + + tidesdb_column_family_config_t cfg = build_cf_config(TDB_TABLE_OPTIONS(table)); + + std::string base_cf = share->cf_name; + + if (ha_alter_info->index_add_count > 0) + { + for (uint a = 0; a < ha_alter_info->index_add_count; a++) + { + uint key_num = ha_alter_info->index_add_buffer[a]; + KEY *new_key = &ha_alter_info->key_info_buffer[key_num]; + + if (new_key->flags & HA_NOSAME && + altered_table->s->primary_key < altered_table->s->keys && + key_num == altered_table->s->primary_key) + continue; + + std::string idx_cf = base_cf + CF_INDEX_INFIX + new_key->name.str; + + tidesdb_drop_column_family(tdb_global, idx_cf.c_str()); + + tidesdb_column_family_config_t idx_cfg = cfg; + ha_index_option_struct *iopts = new_key->option_struct; + if (iopts) idx_cfg.use_btree = iopts->use_btree ? 1 : 0; + + int rc = tidesdb_create_column_family(tdb_global, idx_cf.c_str(), &idx_cfg); + if (rc != TDB_SUCCESS) + { + sql_print_error("[TIDESDB] inplace ADD INDEX: failed to create CF '%s' (err=%d)", + idx_cf.c_str(), rc); + my_error(ER_INTERNAL_ERROR, MYF(0), "[TIDESDB] failed to create index CF"); + DBUG_RETURN(true); + } + + tidesdb_column_family_t *icf = tidesdb_get_column_family(tdb_global, idx_cf.c_str()); + if (!icf) + { + sql_print_error("[TIDESDB] inplace ADD INDEX: CF '%s' not found after create", + idx_cf.c_str()); + my_error(ER_INTERNAL_ERROR, MYF(0), "[TIDESDB] index CF not found after create"); + DBUG_RETURN(true); + } + + ctx->add_cfs.push_back(icf); + ctx->add_cf_names.push_back(idx_cf); + ctx->add_key_nums.push_back(key_num); + } + } + + if (ha_alter_info->index_drop_count > 0) + { + for (uint d = 0; d < ha_alter_info->index_drop_count; d++) + { + KEY *old_key = ha_alter_info->index_drop_buffer[d]; + uint old_key_num = (uint)(old_key - table->key_info); + if (old_key_num < share->idx_cf_names.size() && + !share->idx_cf_names[old_key_num].empty()) + { + 
ctx->drop_cf_names.push_back(share->idx_cf_names[old_key_num]); + } + } + } + + DBUG_RETURN(false); +} + +/* + Inplace phase -- we populate newly added indexes by scanning the table. + Called with no MDL lock blocking (HA_ALTER_INPLACE_NO_LOCK). +*/ +bool ha_tidesdb::inplace_alter_table(TABLE *altered_table, Alter_inplace_info *ha_alter_info) +{ + DBUG_ENTER("ha_tidesdb::inplace_alter_table"); + + ha_tidesdb_inplace_ctx *ctx = static_cast(ha_alter_info->handler_ctx); + + if (!ctx || ctx->add_cfs.empty()) + DBUG_RETURN(false); /* Nothing to populate (drop-only or instant) */ + + /* We mark all columns readable on the altered table since we read + fields via make_sort_key_part during index key construction. */ + MY_BITMAP *old_map = tmp_use_all_columns(altered_table, &altered_table->read_set); + + /* We do a full table scan to populate the new secondary indexes. + We use the altered_table's key_info for building index keys, + since that matches the new key numbering. */ + + /* We always use READ_COMMITTED for index population. The scan reads + potentially millions of rows; higher isolation levels would track + each key in the read-set, causing unbounded memory growth. Index + builds are DDL and never need OCC conflict detection. 
*/ + tidesdb_txn_t *txn = NULL; + int rc = tidesdb_txn_begin_with_isolation(tdb_global, TDB_ISOLATION_READ_COMMITTED, &txn); + if (rc != TDB_SUCCESS || !txn) + { + sql_print_error("[TIDESDB] inplace ADD INDEX: txn_begin failed (err=%d)", rc); + my_error(ER_INTERNAL_ERROR, MYF(0), "[TIDESDB] failed to begin txn for index build"); + tmp_restore_column_map(&altered_table->read_set, old_map); + DBUG_RETURN(true); + } + + tidesdb_iter_t *iter = NULL; + rc = tdb_iter_new_blocking(ha_thd(), txn, share->cf, &iter); + if (rc != TDB_SUCCESS || !iter) + { + tidesdb_txn_free(txn); + sql_print_error("[TIDESDB] inplace ADD INDEX: iter_new failed (err=%d)", rc); + my_error(ER_INTERNAL_ERROR, MYF(0), "[TIDESDB] failed to create iterator for index build"); + tmp_restore_column_map(&altered_table->read_set, old_map); + DBUG_RETURN(true); + } + tidesdb_iter_seek_to_first(iter); + + ha_rows rows_processed = 0; + + /* For UNIQUE indexes, we track seen index-column prefixes to detect + duplicates. If a duplicate is found we must abort the ALTER. + unordered_set gives O(1) amortized lookup vs O(log n) for std::set, + which matters for tables with millions of rows. */ + std::vector idx_is_unique(ctx->add_cfs.size(), false); + std::vector> idx_seen(ctx->add_cfs.size()); + for (uint a = 0; a < ctx->add_cfs.size(); a++) + { + uint key_num = ctx->add_key_nums[a]; + KEY *ki = &altered_table->key_info[key_num]; + if (ki->flags & HA_NOSAME) idx_is_unique[a] = true; + } + + /* We remember the last data key so we can seek directly to it after + a batch commit (O(n²)). 
*/ + uchar last_data_key[DATA_KEY_BUF_LEN]; + size_t last_data_key_len = 0; + + while (tidesdb_iter_valid(iter)) + { + uint8_t *key_data = NULL; + size_t key_size = 0; + uint8_t *val_data = NULL; + size_t val_size = 0; + + if (tidesdb_iter_key_value(iter, &key_data, &key_size, &val_data, &val_size) != TDB_SUCCESS) + { + tidesdb_iter_next(iter); + continue; + } + + if (key_size < KEY_NAMESPACE_LEN || key_data[0] != KEY_NS_DATA) + { + tidesdb_iter_next(iter); + continue; + } + + if (key_size <= sizeof(last_data_key)) + { + memcpy(last_data_key, key_data, key_size); + last_data_key_len = key_size; + } + + const uchar *pk = key_data + KEY_NAMESPACE_LEN; + uint pk_len = (uint)(key_size - KEY_NAMESPACE_LEN); + + /* We decode the row into table->record[0]. The field pointers from + altered_table->key_info will be temporarily repointed (via + move_field_offset) to read from this buffer. */ + if (share->has_blobs || share->encrypted) + { + std::string row_data((const char *)val_data, val_size); + deserialize_row(table->record[0], row_data); + } + else + { + deserialize_row(table->record[0], (const uchar *)val_data, val_size); + } + + /* For each newly added index, build the index entry key. + altered_table->key_info fields have ptr into altered_table->record[0], + but the data lives in table->record[0]. We compute ptdiff to + rebase field pointers to read from the correct buffer. + Key format matches make_comparable_key()= [null_byte] + sort_string. 
*/ + my_ptrdiff_t ptdiff = (my_ptrdiff_t)(table->record[0] - altered_table->record[0]); + + for (uint a = 0; a < ctx->add_cfs.size(); a++) + { + uint key_num = ctx->add_key_nums[a]; + KEY *ki = &altered_table->key_info[key_num]; + + /* FULLTEXT and SPATIAL indexes use different population paths */ + if (is_fts_index(ki)) continue; + if (is_spatial_index(ki)) continue; + + uchar ik[SEC_IDX_KEY_BUF_LEN]; + uint pos = 0; + for (uint p = 0; p < ki->user_defined_key_parts; p++) + { + KEY_PART_INFO *kp = &ki->key_part[p]; + Field *field = kp->field; + + field->move_field_offset(ptdiff); + if (field->real_maybe_null()) + { + if (field->is_null()) + { + ik[pos++] = SORT_KEY_NULL; + bzero(ik + pos, kp->length); + pos += kp->length; + field->move_field_offset(-ptdiff); + continue; + } + ik[pos++] = SORT_KEY_NOT_NULL; + } + field->sort_string(ik + pos, kp->length); + field->move_field_offset(-ptdiff); + pos += kp->length; + } + + if (idx_is_unique[a]) + { + std::string prefix((const char *)ik, pos); + if (!idx_seen[a].insert(prefix).second) + { + tidesdb_iter_free(iter); + tidesdb_txn_rollback(txn); + tidesdb_txn_free(txn); + tmp_restore_column_map(&altered_table->read_set, old_map); + my_error(ER_DUP_ENTRY, MYF(0), "?", altered_table->key_info[key_num].name.str); + DBUG_RETURN(true); + } + } + + memcpy(ik + pos, pk, pk_len); + pos += pk_len; + + rc = tdb_txn_put_blocking(ha_thd(), txn, ctx->add_cfs[a], ik, pos, &tdb_empty_val, + sizeof(tdb_empty_val), TIDESDB_TTL_NONE); + if (rc != TDB_SUCCESS) + { + /* A per-row put failure leaves a hole in the new index. + Continuing would let commit_inplace_alter_table report + success on an index that silently lacks rows, matching + the failure mode the batch-commit guard below also + refuses to ship. Abort the ALTER instead. 
*/ + sql_print_error( + "[TIDESDB] inplace ADD INDEX: put failed for key %u (err=%d), " + "aborting to avoid a partial index", + key_num, rc); + tidesdb_iter_free(iter); + tidesdb_txn_rollback(txn); + tidesdb_txn_free(txn); + tmp_restore_column_map(&altered_table->read_set, old_map); + my_error(ER_INTERNAL_ERROR, MYF(0), + "[TIDESDB] per-row put failed during index build"); + DBUG_RETURN(true); + } + } + + rows_processed++; + + /* We check for KILL signal periodically so the user can cancel + long-running index builds via KILL . */ + if ((rows_processed % TIDESDB_INDEX_BUILD_BATCH) == 0 && thd_killed(ha_thd())) + { + tidesdb_iter_free(iter); + tidesdb_txn_rollback(txn); + tidesdb_txn_free(txn); + tmp_restore_column_map(&altered_table->read_set, old_map); + my_error(ER_QUERY_INTERRUPTED, MYF(0)); + DBUG_RETURN(true); + } + + if (rows_processed % TIDESDB_INDEX_BUILD_BATCH == 0) + { + { + int crc = tidesdb_txn_commit(txn); + if (crc != TDB_SUCCESS) + { + /* A failed batch commit drops this batch of index entries. + Carrying on would finish the build and report success + with an index that is silently missing rows, so abort + the ALTER instead. */ + sql_print_error( + "[TIDESDB] inplace ADD INDEX: batch commit failed rc=%d, " + "aborting to avoid a partial index", + crc); + tidesdb_iter_free(iter); + tidesdb_txn_rollback(txn); + tidesdb_txn_free(txn); + tmp_restore_column_map(&altered_table->read_set, old_map); + my_error(ER_INTERNAL_ERROR, MYF(0), + "[TIDESDB] batch commit failed during index build"); + DBUG_RETURN(true); + } + } + tidesdb_iter_free(iter); + + /* We reset the txn with READ_COMMITTED -- index builds + don't need snapshot consistency across batches. 
*/ + int rrc = tidesdb_txn_reset(txn, TDB_ISOLATION_READ_COMMITTED); + if (rrc != TDB_SUCCESS) + { + sql_print_warning( + "[TIDESDB] inplace ADD INDEX: tidesdb_txn_reset failed (rc=%d), " + "falling back to free+begin", + rrc); + tidesdb_txn_free(txn); + txn = NULL; + rc = tidesdb_txn_begin_with_isolation(tdb_global, TDB_ISOLATION_READ_COMMITTED, + &txn); + if (rc != TDB_SUCCESS || !txn) + { + sql_print_error("[TIDESDB] inplace ADD INDEX: batch txn_begin failed"); + my_error(ER_INTERNAL_ERROR, MYF(0), + "[TIDESDB] batch txn failed during index build"); + tmp_restore_column_map(&altered_table->read_set, old_map); + DBUG_RETURN(true); + } + } + iter = NULL; + rc = tdb_iter_new_blocking(ha_thd(), txn, share->cf, &iter); + if (rc != TDB_SUCCESS || !iter) + { + tidesdb_txn_free(txn); + my_error(ER_INTERNAL_ERROR, MYF(0), + "[TIDESDB] batch iter failed during index build"); + tmp_restore_column_map(&altered_table->read_set, old_map); + DBUG_RETURN(true); + } + int src = tidesdb_iter_seek(iter, last_data_key, last_data_key_len); + if (src != TDB_SUCCESS) + { + sql_print_warning("[TIDESDB] inplace ADD INDEX: iter_seek failed rc=%d", src); + break; /* end scan gracefully */ + } + if (tidesdb_iter_valid(iter)) tidesdb_iter_next(iter); + continue; /* Don't call iter_next again */ + } + + tidesdb_iter_next(iter); + } + + tidesdb_iter_free(iter); + + rc = tidesdb_txn_commit(txn); + if (rc != TDB_SUCCESS) tidesdb_txn_rollback(txn); + tidesdb_txn_free(txn); + + if (rc != TDB_SUCCESS) + { + sql_print_error("[TIDESDB] inplace ADD INDEX: final commit failed (err=%d)", rc); + my_error(ER_INTERNAL_ERROR, MYF(0), "[TIDESDB] final commit failed during index build"); + tmp_restore_column_map(&altered_table->read_set, old_map); + DBUG_RETURN(true); + } + tmp_restore_column_map(&altered_table->read_set, old_map); + DBUG_RETURN(false); +} + +/* + Commit or rollback the inplace ALTER. + On commit drop old index CFs, update share->idx_cfs for new table shape. 
+ On rollback drop newly created CFs. +*/ +bool ha_tidesdb::commit_inplace_alter_table(TABLE *altered_table, Alter_inplace_info *ha_alter_info, + bool commit) +{ + DBUG_ENTER("ha_tidesdb::commit_inplace_alter_table"); + + ha_tidesdb_inplace_ctx *ctx = static_cast(ha_alter_info->handler_ctx); + + ha_alter_info->group_commit_ctx = NULL; + + if (!ctx) DBUG_RETURN(false); + + /* We free any cached iterators before dropping CFs. The connection's + scan_iter and dup_iter_cache_ may hold merge-heap references to + SSTables in CFs about to be dropped. */ + if (scan_iter) + { + tidesdb_iter_free(scan_iter); + scan_iter = NULL; + scan_iter_cf_ = NULL; + scan_iter_txn_ = NULL; + } + free_dup_iter_cache(); + + if (!commit) + { + /* Rollback, we drop any CFs we created for new indexes */ + for (const auto &cf_name : ctx->add_cf_names) + tidesdb_drop_column_family(tdb_global, cf_name.c_str()); + DBUG_RETURN(false); + } + + /* Commit, we drop CFs for removed indexes */ + for (const auto &cf_name : ctx->drop_cf_names) + { + int rc = tidesdb_drop_column_family(tdb_global, cf_name.c_str()); + if (rc != TDB_SUCCESS && rc != TDB_ERR_NOT_FOUND) + sql_print_warning("[TIDESDB] commit ALTER: failed to drop CF '%s' (err=%d)", + cf_name.c_str(), rc); + } + + /* We rebuild share->idx_cfs and idx_cf_names based on the new table's keys. + Since we hold exclusive MDL, no other handler is using the share. 
*/ + lock_shared_ha_data(); + share->idx_cfs.clear(); + share->idx_cf_names.clear(); + + uint new_pk = altered_table->s->primary_key; + for (uint i = 0; i < altered_table->s->keys; i++) + { + if (new_pk != MAX_KEY && i == new_pk) + { + share->idx_cfs.push_back(NULL); + share->idx_cf_names.push_back(""); + continue; + } + std::string idx_name; + tidesdb_column_family_t *icf = resolve_idx_cf( + tdb_global, share->cf_name, altered_table->key_info[i].name.str, idx_name); + share->idx_cfs.push_back(icf); + share->idx_cf_names.push_back(idx_name); + } + + for (uint i = 0; i < altered_table->s->keys; i++) + { + share->idx_comp_key_len[i] = comparable_key_length(&altered_table->key_info[i]); + share->idx_is_fts[i] = is_fts_index(&altered_table->key_info[i]); + share->idx_is_spatial[i] = is_spatial_index(&altered_table->key_info[i]); + } + + share->idx_cover.assign(altered_table->s->keys, + std::vector(altered_table->s->fields, false)); + for (uint i = 0; i < altered_table->s->keys; i++) + { + const KEY *ki = &altered_table->key_info[i]; + for (uint p = 0; p < ki->user_defined_key_parts; p++) + { + uint fnr = ki->key_part[p].fieldnr; + if (fnr > 0 && fnr - 1 < altered_table->s->fields) share->idx_cover[i][fnr - 1] = true; + } + if (altered_table->s->primary_key != MAX_KEY && i != altered_table->s->primary_key) + { + const KEY *pk_key = &altered_table->key_info[altered_table->s->primary_key]; + for (uint p = 0; p < pk_key->user_defined_key_parts; p++) + { + uint fnr = pk_key->key_part[p].fieldnr; + if (fnr > 0 && fnr - 1 < altered_table->s->fields) + share->idx_cover[i][fnr - 1] = true; + } + } + } + share->num_secondary_indexes = 0; + for (uint i = 0; i < share->idx_cfs.size(); i++) + if (share->idx_cfs[i]) share->num_secondary_indexes++; + + /* If table options changed (SYNC_MODE, COMPRESSION, BLOOM_FPR, etc.), + we apply them to the live CF(s) so they take effect immediately instead + of only being persisted in the .frm. 
*/ + if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) + { + tidesdb_column_family_config_t cfg = build_cf_config(TDB_TABLE_OPTIONS(altered_table)); + + /* Main data CF */ + if (share->cf) + { + int rc = tidesdb_cf_update_runtime_config(share->cf, &cfg, 1); + if (rc != TDB_SUCCESS) + sql_print_warning( + "[TIDESDB] ALTER: failed to update runtime config for " + "data CF '%s' (err=%d)", + share->cf_name.c_str(), rc); + } + + for (uint i = 0; i < share->idx_cfs.size(); i++) + { + if (share->idx_cfs[i]) + { + tidesdb_column_family_config_t idx_cfg = cfg; + if (i < altered_table->s->keys && altered_table->key_info[i].option_struct) + { + ha_index_option_struct *iopts = altered_table->key_info[i].option_struct; + idx_cfg.use_btree = iopts->use_btree ? 1 : 0; + } + + int rc = tidesdb_cf_update_runtime_config(share->idx_cfs[i], &idx_cfg, 1); + if (rc != TDB_SUCCESS) + sql_print_warning( + "[TIDESDB] ALTER: failed to update runtime config for " + "index CF '%s' (err=%d)", + share->idx_cf_names[i].c_str(), rc); + } + } + + if (TDB_TABLE_OPTIONS(altered_table)) + { + uint iso_idx = TDB_TABLE_OPTIONS(altered_table)->isolation_level; + if (iso_idx < array_elements(tdb_isolation_map)) + share->isolation_level = (tidesdb_isolation_level_t)tdb_isolation_map[iso_idx]; + share->default_ttl = TDB_TABLE_OPTIONS(altered_table)->ttl; + share->has_ttl = (share->default_ttl > 0 || share->ttl_field_idx >= 0); + share->encrypted = TDB_TABLE_OPTIONS(altered_table)->encrypted; + if (share->encrypted) + share->encryption_key_id = + (uint)TDB_TABLE_OPTIONS(altered_table)->encryption_key_id; + } + } + + share->stats_refresh_us.store(0, std::memory_order_relaxed); + unlock_shared_ha_data(); + + /* We update .frm in schema CF after ALTER. When discover_table is + registered MariaDB may skip writing .frm to disk, so prefer the + in-memory image from the altered TABLE_SHARE. 
*/ + if (altered_table->s->frm_image) + schema_cf_store_frm(table->s->path.str, altered_table->s->frm_image->str, + altered_table->s->frm_image->length); + else + schema_cf_store_frm(table->s->path.str); + + DBUG_RETURN(false); +} + +/* + Tell MariaDB whether changing table options requires a rebuild. + For TidesDB, changing options like SYNC_MODE, TTL, etc. is always + compatible -- the .frm is rewritten and re-read on next open(). +*/ +bool ha_tidesdb::check_if_incompatible_data(HA_CREATE_INFO *create_info, uint table_changes) +{ + /* If only table options changed (not column types), data is compatible */ + if (table_changes == IS_EQUAL_YES) return COMPATIBLE_DATA_YES; + return COMPATIBLE_DATA_NO; +} + +/* ******************** rename_table (ALTER TABLE / RENAME) ******************** */ + +int ha_tidesdb::rename_table(const char *from, const char *to) +{ + DBUG_ENTER("ha_tidesdb::rename_table"); + + std::string old_cf = path_to_cf_name(from); + std::string new_cf = path_to_cf_name(to); + + /* If the destination CF already exists (stale from a previous ALTER), + drop it first so the rename can proceed. 
*/ + tidesdb_drop_column_family(tdb_global, new_cf.c_str()); + + int rc = tidesdb_rename_column_family(tdb_global, old_cf.c_str(), new_cf.c_str()); + if (rc != TDB_SUCCESS && rc != TDB_ERR_NOT_FOUND) + { + sql_print_error("[TIDESDB] Failed to rename CF '%s' -> '%s' (err=%d)", old_cf.c_str(), + new_cf.c_str(), rc); + DBUG_RETURN(tdb_rc_to_ha(rc, "rename_table")); + } + + { + std::string prefix = old_cf + CF_INDEX_INFIX; + char **names = NULL; + int count = 0; + if (tidesdb_list_column_families(tdb_global, &names, &count) == TDB_SUCCESS && names) + { + for (int i = 0; i < count; i++) + { + if (!names[i]) continue; + std::string cf_str(names[i]); + if (cf_str.compare(0, prefix.size(), prefix) == 0) + { + std::string suffix = cf_str.substr(prefix.size()); + std::string new_idx = new_cf + CF_INDEX_INFIX + suffix; + + tidesdb_drop_column_family(tdb_global, new_idx.c_str()); + rc = tidesdb_rename_column_family(tdb_global, cf_str.c_str(), new_idx.c_str()); + if (rc != TDB_SUCCESS && rc != TDB_ERR_NOT_FOUND) + sql_print_error("[TIDESDB] Failed to rename idx CF '%s' -> '%s' (err=%d)", + cf_str.c_str(), new_idx.c_str(), rc); + } + tidesdb_free(names[i]); + } + tidesdb_free(names); + } + } + + schema_cf_rename(from, to); + + DBUG_RETURN(0); +} + +/* ******************** delete_table (DROP TABLE) ******************** */ + +/* + Force-remove a directory tree from disk. Used as a safety net after + tidesdb_drop_column_family() because the library's internal + remove_directory() can fail silently (e.g. open fds from block cache, + mmap, or background workers). If stale SSTables survive, the next + CREATE TABLE with the same name inherits them -- catastrophic for + performance (bloom filters pass on every SSTable since keys overlap). 
+*/ +static void force_remove_cf_dir(const std::string &cf_name) +{ + char dir[FN_REFLEN]; + const char sep[] = {FN_LIBCHAR, 0}; + strxnmov(dir, sizeof(dir) - 1, tdb_path.c_str(), sep, cf_name.c_str(), NullS); + + MY_STAT st; + if (!my_stat(dir, &st, MYF(0))) return; /* already gone */ + + /* my_rmtree() is MariaDB's portable recursive directory removal + (handles Windows, symlinks, read-only attrs, etc.). */ + if (my_rmtree(dir, MYF(0)) != 0) + sql_print_warning("[TIDESDB] force_remove_cf_dir failed for %s", dir); +} + +/* + Shared drop logic used by both the handlerton callback (hton->drop_table) + and the handler method (ha_tidesdb::delete_table). Drops the main data CF + and all secondary index CFs, then force-removes their directories. + Returns 0 on success. +*/ +static int tidesdb_drop_table_impl(const char *path) +{ + if (!tdb_global) return 0; + + /* Replica mode is read-only against the object store, so the library + rejects tidesdb_drop_column_family with TDB_ERR_READONLY. MariaDB's + own init/upgrade paths invoke drop_table on stale system tables and + repeatedly trigger that rejection, which surfaces as scary [ERROR] + lines in the server log even though the work is genuinely a no-op + for a replica. Skip the library call entirely on replicas and let + the local directory cleanup (if any) be driven by the next sync. */ + if (srv_replica_mode) + { + sql_print_information( + "[TIDESDB] drop_table skipped on replica for '%s' (replica is read-only)", path); + return 0; + } + + std::string cf_name = ha_tidesdb::path_to_cf_name(path); + + /* We collect secondary index CF names before dropping so we can + force-remove their directories afterwards. 
*/ + std::vector idx_cf_names; + { + std::string prefix = cf_name + CF_INDEX_INFIX; + char **names = NULL; + int count = 0; + if (tidesdb_list_column_families(tdb_global, &names, &count) == TDB_SUCCESS && names) + { + for (int i = 0; i < count; i++) + { + if (!names[i]) continue; + if (strncmp(names[i], prefix.c_str(), prefix.size()) == 0) + idx_cf_names.push_back(names[i]); + tidesdb_free(names[i]); + } + tidesdb_free(names); + } + } + + int rc = tidesdb_drop_column_family(tdb_global, cf_name.c_str()); + if (rc != TDB_SUCCESS && rc != TDB_ERR_NOT_FOUND) + { + sql_print_error("[TIDESDB] Failed to drop CF '%s' (err=%d)", cf_name.c_str(), rc); + return rc; + } + + for (const auto &idx_name : idx_cf_names) + tidesdb_drop_column_family(tdb_global, idx_name.c_str()); + + force_remove_cf_dir(cf_name); + for (const auto &idx_name : idx_cf_names) force_remove_cf_dir(idx_name); + + schema_cf_delete(path); + + return 0; +} + +/* + Handlerton-level drop_table callback. MariaDB 12.x calls hton->drop_table + instead of handler::delete_table. Must return 0 on success, not -1. +*/ +static int tidesdb_hton_drop_table(handlerton *, const char *path) +{ + return tidesdb_drop_table_impl(path); +} + +/* + Extract the database name from a directory path handed to drop_database. + The server passes something like "./test/" or "/var/lib/mysql/test/"; + we strip trailing separators and return the final path component. +*/ +static std::string tidesdb_path_to_db_name(const char *path) +{ + if (!path) return std::string(); + std::string p(path); + while (!p.empty() && (p.back() == FN_LIBCHAR || p.back() == '/')) p.pop_back(); + size_t slash = p.find_last_of("/\\"); + if (slash != std::string::npos) p = p.substr(slash + 1); + return p; +} + +/* + Handlerton-level drop_database callback. MariaDB calls this when the + server-side DROP DATABASE has finished removing .frm files from the db + directory. 
Without this hook, TidesDB column families whose .frm was
+  already unlinked (and any object-store-mode entries in schema_cf) would
+  outlive the database and accumulate on disk.
+
+  We enumerate every CF whose name starts with "__" (the prefix
+  path_to_cf_name builds for a table in that database -- which also
+  captures all "db__tbl__idx_*" secondary-index CFs) and drop each.
+*/
+static void tidesdb_hton_drop_database(handlerton *, char *path)
+{
+    if (!tdb_global || !path) return; /* no TidesDB handle or no path: nothing to do */
+
+    /* Same rationale as tidesdb_drop_table_impl -- replica mode is
+       read-only and the library rejects every drop with TDB_ERR_READONLY,
+       so skip the call rather than spamming the log. */
+    if (srv_replica_mode)
+    {
+        sql_print_information(
+            "[TIDESDB] drop_database skipped on replica for '%s' (replica is read-only)", path);
+        return;
+    }
+
+    std::string db = tidesdb_path_to_db_name(path);
+    if (db.empty()) return; /* path held no usable component; an empty prefix would match every CF */
+
+    std::string prefix = db + CF_DB_TABLE_SEP; /* database name followed by the db/table separator.
+       NOTE(review): a plain prefix match also hits CFs of any other database
+       whose own name starts with this prefix (dropping "test" would match a
+       database called "test__x") -- confirm path_to_cf_name makes that
+       impossible, e.g. by escaping the separator. */
+
+    std::vector to_drop; /* names of the CFs selected for removal */
+    {
+        char **names = NULL;
+        int count = 0;
+        if (tidesdb_list_column_families(tdb_global, &names, &count) == TDB_SUCCESS && names)
+        {
+            for (int i = 0; i < count; i++)
+            {
+                if (!names[i]) continue;
+                if (strncmp(names[i], prefix.c_str(), prefix.size()) == 0)
+                    to_drop.emplace_back(names[i]);
+                tidesdb_free(names[i]); /* every listed name is released, matched or not */
+            }
+            tidesdb_free(names);
+        }
+    }
+
+    for (const auto &cf_name : to_drop)
+    {
+        int rc = tidesdb_drop_column_family(tdb_global, cf_name.c_str());
+        if (rc != TDB_SUCCESS && rc != TDB_ERR_NOT_FOUND)
+            sql_print_warning("[TIDESDB] drop_database: failed to drop CF '%s' (err=%d)",
+                              cf_name.c_str(), rc);
+        force_remove_cf_dir(cf_name); /* attempted even when the drop above failed */
+    }
+
+    /* We clean up schema CF entries for this database (object-store mode).
+       No-op when schema_cf is NULL (local-only mode). */
+    schema_cf_delete_db(db);
+
+    /* The count reported below is the number of CFs selected for removal,
+       including any whose drop produced a warning above. */
+    if (!to_drop.empty())
+        sql_print_information("[TIDESDB] drop_database: removed %zu column famil%s for '%s'",
+                              to_drop.size(), to_drop.size() == 1 ? "y" : "ies", db.c_str());
+}
+/* Handler-level DROP TABLE entry point: forwards the table path to
+   tidesdb_drop_table_impl() and returns its result unchanged. */
+int ha_tidesdb::delete_table(const char *name)
+{
+    DBUG_ENTER("ha_tidesdb::delete_table");
+    DBUG_RETURN(tidesdb_drop_table_impl(name));
+}
+
+/* ******************** Status variables (SHOW GLOBAL STATUS LIKE 'tidesdb%') ********************
+ */
+
+/* Static holders for status variable values. Populated by the SHOW_FUNC
+   callback which queries tidesdb_get_db_stats / tidesdb_get_cache_stats.
+   These are global (not per-connection) since they reflect database-wide state.
+
+   In this file the assignments are made by tidesdb_refresh_status_vars()
+   and the holders are exported by address through
+   tidesdb_status_variables[].
+   NOTE(review): that table registers plain pointers, not SHOW_FUNC
+   entries; the callback mentioned above is not visible here -- confirm
+   where tidesdb_refresh_status_vars() is invoked from. */
+static long long srv_stat_column_families;
+static long long srv_stat_global_seq;
+static long long srv_stat_memtable_bytes;
+static long long srv_stat_txn_memory_bytes;
+static long long srv_stat_memory_limit;
+static long long srv_stat_memory_pressure;
+static long long srv_stat_total_sstables;
+static long long srv_stat_open_sstables;
+static long long srv_stat_data_size_bytes;
+static long long srv_stat_immutable_memtables;
+static long long srv_stat_flush_pending;
+static long long srv_stat_flush_queue;
+static long long srv_stat_compaction_queue;
+static long long srv_stat_cache_entries;
+static long long srv_stat_cache_bytes;
+static long long srv_stat_cache_hits;
+static long long srv_stat_cache_misses;
+static double srv_stat_cache_hit_rate; /* stored already multiplied by PERCENT_SCALE */
+static long long srv_stat_cache_partitions;
+/* Tombstone aggregates are forward-declared near the top of this file so
+   tidesdb_show_status can read them directly. Their definitions live up
+   there.
*/
+/* Engine version. The string and the numeric form are exported as the
+   tidesdb_version / tidesdb_version_hex status variables and are also
+   passed to maria_declare_plugin at the bottom of this file. Keep the
+   two macros in step when bumping the version. */
+#define TIDESQL_VERSION_STR "4.5.4"
+#define TIDESQL_VERSION_HEX 0x40504 /* one byte per component: 04.05.04.
+   NOTE(review): this value is also used as the plugin descriptor's
+   version field, which the server conventionally reads as
+   (major << 8 | minor) -- confirm how 0x40504 is rendered in
+   INFORMATION_SCHEMA.PLUGINS. */
+
+static const char *srv_stat_version = TIDESQL_VERSION_STR;
+static long long srv_stat_version_hex = TIDESQL_VERSION_HEX;
+/* Status variable table handed to the server through the plugin
+   declaration. Each entry is { name, address of the static holder,
+   value type }; the holders are filled in by
+   tidesdb_refresh_status_vars() or, for the backpressure and lock
+   counters, by code outside this part of the file. */
+static struct st_mysql_show_var tidesdb_status_variables[] = {
+    {"tidesdb_version", (char *)&srv_stat_version, SHOW_CHAR_PTR}, /* version identification */
+    {"tidesdb_version_hex", (char *)&srv_stat_version_hex, SHOW_LONGLONG},
+    {"tidesdb_column_families", (char *)&srv_stat_column_families, SHOW_LONGLONG}, /* database-wide stats */
+    {"tidesdb_global_sequence", (char *)&srv_stat_global_seq, SHOW_LONGLONG},
+    {"tidesdb_memtable_bytes", (char *)&srv_stat_memtable_bytes, SHOW_LONGLONG},
+    {"tidesdb_txn_memory_bytes", (char *)&srv_stat_txn_memory_bytes, SHOW_LONGLONG},
+    {"tidesdb_memory_limit", (char *)&srv_stat_memory_limit, SHOW_LONGLONG},
+    {"tidesdb_memory_pressure", (char *)&srv_stat_memory_pressure, SHOW_LONGLONG},
+    {"tidesdb_total_sstables", (char *)&srv_stat_total_sstables, SHOW_LONGLONG},
+    {"tidesdb_open_sstables", (char *)&srv_stat_open_sstables, SHOW_LONGLONG},
+    {"tidesdb_data_size_bytes", (char *)&srv_stat_data_size_bytes, SHOW_LONGLONG},
+    {"tidesdb_immutable_memtables", (char *)&srv_stat_immutable_memtables, SHOW_LONGLONG},
+    {"tidesdb_flush_pending", (char *)&srv_stat_flush_pending, SHOW_LONGLONG},
+    {"tidesdb_flush_queue", (char *)&srv_stat_flush_queue, SHOW_LONGLONG},
+    {"tidesdb_compaction_queue", (char *)&srv_stat_compaction_queue, SHOW_LONGLONG},
+    {"tidesdb_cache_entries", (char *)&srv_stat_cache_entries, SHOW_LONGLONG}, /* cache stats */
+    {"tidesdb_cache_bytes", (char *)&srv_stat_cache_bytes, SHOW_LONGLONG},
+    {"tidesdb_cache_hits", (char *)&srv_stat_cache_hits, SHOW_LONGLONG},
+    {"tidesdb_cache_misses", (char *)&srv_stat_cache_misses, SHOW_LONGLONG},
+    {"tidesdb_cache_hit_rate", (char *)&srv_stat_cache_hit_rate, SHOW_DOUBLE},
+    {"tidesdb_cache_partitions", (char *)&srv_stat_cache_partitions, SHOW_LONGLONG},
+    {"tidesdb_total_tombstones", (char *)&srv_stat_total_tombstones, SHOW_LONGLONG}, /* tombstone aggregates */
+    {"tidesdb_tombstone_ratio", (char *)&srv_stat_tombstone_ratio, SHOW_DOUBLE},
+    {"tidesdb_max_sst_tombstone_density", (char *)&srv_stat_max_sst_density, SHOW_DOUBLE},
+    {"tidesdb_max_sst_tombstone_density_level", (char *)&srv_stat_max_sst_density_level,
+     SHOW_LONGLONG},
+    {"tidesdb_backpressure_waits", (char *)&srv_stat_backpressure_waits, SHOW_LONGLONG}, /* defined elsewhere in the file */
+    {"tidesdb_backpressure_wait_us", (char *)&srv_stat_backpressure_wait_us, SHOW_LONGLONG},
+    {"tidesdb_lock_waits", (char *)&srv_stat_lock_waits, SHOW_LONGLONG}, /* row-lock counters, defined elsewhere */
+    {"tidesdb_lock_wait_us", (char *)&srv_stat_lock_wait_us, SHOW_LONGLONG},
+    {"tidesdb_lock_deadlocks", (char *)&srv_stat_lock_deadlocks, SHOW_LONGLONG},
+    {"tidesdb_lock_timeouts", (char *)&srv_stat_lock_timeouts, SHOW_LONGLONG},
+    {"tidesdb_lock_held", (char *)&srv_stat_lock_held, SHOW_LONGLONG},
+    {"tidesdb_lock_entries", (char *)&srv_stat_lock_entries, SHOW_LONGLONG},
+    {"tidesdb_lock_entry_recycles", (char *)&srv_stat_lock_entry_recycles, SHOW_LONGLONG},
+    {"tidesdb_lock_chain_max", (char *)&srv_stat_lock_chain_max, SHOW_LONGLONG},
+    {NullS, NullS, SHOW_ULONG}}; /* terminator entry: NULL name */
+
+/* Refresh the static status variables from live tidesdb stats. Cost is
+   paid by the caller (SHOW ENGINE STATUS / SHOW GLOBAL STATUS), never on
+   the write path.
*/
+static void tidesdb_refresh_status_vars()
+{
+    if (!tdb_global) return; /* no database handle: the holders keep their previous values */
+
+    tidesdb_db_stats_t db_st;
+    memset(&db_st, 0, sizeof(db_st)); /* zeroed first because the call's result is not checked */
+    tidesdb_get_db_stats(tdb_global, &db_st);
+
+    tidesdb_cache_stats_t cache_st;
+    memset(&cache_st, 0, sizeof(cache_st)); /* same: an unfilled struct reads as all zeros */
+    tidesdb_get_cache_stats(tdb_global, &cache_st);
+
+    srv_stat_column_families = db_st.num_column_families;
+    srv_stat_global_seq = (long long)db_st.global_seq;
+    srv_stat_memtable_bytes = (long long)db_st.total_memtable_bytes;
+    srv_stat_txn_memory_bytes = (long long)db_st.txn_memory_bytes;
+    srv_stat_memory_limit = (long long)db_st.resolved_memory_limit;
+    srv_stat_memory_pressure = db_st.memory_pressure_level;
+    srv_stat_total_sstables = db_st.total_sstable_count;
+    srv_stat_open_sstables = db_st.num_open_sstables;
+    srv_stat_data_size_bytes = (long long)db_st.total_data_size_bytes;
+    srv_stat_immutable_memtables = db_st.total_immutable_count;
+    srv_stat_flush_pending = db_st.flush_pending_count;
+    srv_stat_flush_queue = (long long)db_st.flush_queue_size;
+    srv_stat_compaction_queue = (long long)db_st.compaction_queue_size;
+    srv_stat_cache_entries = (long long)cache_st.total_entries;
+    srv_stat_cache_bytes = (long long)cache_st.total_bytes;
+    srv_stat_cache_hits = (long long)cache_st.hits;
+    srv_stat_cache_misses = (long long)cache_st.misses;
+    srv_stat_cache_hit_rate = cache_st.hit_rate * PERCENT_SCALE; /* library value scaled by PERCENT_SCALE */
+    srv_stat_cache_partitions = (long long)cache_st.num_partitions;
+
+    /* Tombstone aggregates -- we walk every CF once, summing total_tombstones
+       and tracking the worst single-SSTable density.
+       tidesdb_db_stats_t does not surface tombstone counters, so the CF
+       list is iterated here. SHOW GLOBAL STATUS reads the resulting
+       statics.
+       If the CF list cannot be obtained the four tombstone holders are
+       left at their previous values. */
+    char **cf_names = NULL;
+    int cf_count = 0;
+    if (tidesdb_list_column_families(tdb_global, &cf_names, &cf_count) == TDB_SUCCESS && cf_names)
+    {
+        uint64_t total_tomb = 0, total_keys = 0;
+        double max_density = 0.0;
+        int max_density_level = 0; /* level reported alongside the highest density seen */
+        for (int i = 0; i < cf_count; i++)
+        {
+            if (!cf_names[i]) continue;
+            tidesdb_column_family_t *cf = tidesdb_get_column_family(tdb_global, cf_names[i]);
+            if (!cf) continue; /* CF could not be looked up: it contributes nothing */
+            tidesdb_stats_t *st = NULL;
+            if (tidesdb_get_stats(cf, &st) == TDB_SUCCESS && st)
+            {
+                total_tomb += st->total_tombstones;
+                total_keys += st->total_keys;
+                if (st->max_sst_density > max_density)
+                {
+                    max_density = st->max_sst_density;
+                    max_density_level = st->max_sst_density_level;
+                }
+                tidesdb_free_stats(st);
+            }
+        }
+        for (int i = 0; i < cf_count; i++) tidesdb_free(cf_names[i]); /* NOTE(review): NULL entries
+           are passed to tidesdb_free here, whereas the other enumeration
+           loops in this file skip them -- confirm tidesdb_free accepts NULL */
+        tidesdb_free(cf_names);
+
+        srv_stat_total_tombstones = (long long)total_tomb;
+        srv_stat_tombstone_ratio = total_keys > 0 ? (double)total_tomb / (double)total_keys : 0.0; /* 0.0 when no keys were counted */
+        srv_stat_max_sst_density = max_density;
+        srv_stat_max_sst_density_level = (long long)max_density_level;
+    }
+}
+
+/* ******************** Plugin declaration ******************** */
+
+static struct st_mysql_storage_engine tidesdb_storage_engine = {MYSQL_HANDLERTON_INTERFACE_VERSION};
+/* Plugin descriptor. The trailing comments name the role each
+   positional initializer plays. */
+maria_declare_plugin(tidesdb){MYSQL_STORAGE_ENGINE_PLUGIN, /* plugin type */
+                              &tidesdb_storage_engine,     /* type-specific descriptor */
+                              "TidesDB",                   /* plugin name */
+                              "TidesDB",                   /* author */
+                              "LSM-tree engine with ACID transactions, MVCC concurrency, "
+                              "secondary/spatial/full-text/vector indexes, and encryption",
+                              PLUGIN_LICENSE_GPL,
+                              tidesdb_init_func,   /* init callback, defined elsewhere in this file */
+                              tidesdb_deinit_func, /* deinit callback, defined elsewhere in this file */
+                              TIDESQL_VERSION_HEX, /* numeric version */
+                              tidesdb_status_variables,
+                              tidesdb_system_variables,
+                              TIDESQL_VERSION_STR, /* version string */
+                              MariaDB_PLUGIN_MATURITY_GAMMA} maria_declare_plugin_end;
diff --git a/storage/tidesdb/ha_tidesdb.h b/storage/tidesdb/ha_tidesdb.h
new file mode 100644
index 0000000000000..694954cf4a9da
--- /dev/null
+++ b/storage/tidesdb/ha_tidesdb.h
@@ -0,0 +1,1013 @@
+/*
+  Copyright (c) 2026 TidesDB Corp.
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +/* my_global.h MUST be included before handler.h / my_base.h: handler.h pulls + in server headers that use typedefs (ulonglong, int64, sql_mode_t, ...) + defined by my_global.h. A wrong order breaks the build on MariaDB 11.4+ + with missing-declaration errors. The IncludeCategories rule in .clang-format + pins my_global.h to sort first so the formatter preserves this order. */ +#include "my_global.h" + +#include "handler.h" +#include "my_base.h" +#include "thr_lock.h" + +extern "C" +{ +#include +} + +/* Mirror constants for the library's TDB_DEFAULT_* values defined in + . We don't include that header directly because it + leaks a `realloc` macro that conflicts with MariaDB's String::realloc() + method. Keep these in sync with src/tidesdb.h on every library bump -- + sysvar defaults reference the TIDESQL_* names so drift is caught here + rather than scattered across the sysvar declarations. 
*/ +static constexpr unsigned long long TIDESQL_DEFAULT_WRITE_BUFFER_SIZE = 64ULL * 1024 * 1024; +static constexpr unsigned long long TIDESQL_DEFAULT_SYNC_INTERVAL_US = 128000; +static constexpr unsigned long long TIDESQL_DEFAULT_KLOG_VALUE_THRESHOLD = 512; +static constexpr unsigned long long TIDESQL_DEFAULT_LEVEL_SIZE_RATIO = 10; +static constexpr unsigned long long TIDESQL_DEFAULT_MIN_LEVELS = 1; +static constexpr unsigned long long TIDESQL_DEFAULT_DIVIDING_LEVEL_OFFSET = 1; +static constexpr unsigned long long TIDESQL_DEFAULT_INDEX_SAMPLE_RATIO = 1; +static constexpr unsigned long long TIDESQL_DEFAULT_BLOCK_INDEX_PREFIX_LEN = 16; +static constexpr unsigned long long TIDESQL_DEFAULT_MIN_DISK_SPACE = 100ULL * 1024 * 1024; + +/* Key namespace prefixes (first byte of every TidesDB key) */ +static constexpr uint8_t KEY_NS_META = 0x00; +static constexpr uint8_t KEY_NS_DATA = 0x01; + +/* Size of the namespace prefix that every TidesDB key starts with. */ +static constexpr uint KEY_NAMESPACE_LEN = 1; + +/* Buffer size for a data CF key, namespace byte + comparable PK + 1 byte slack. + Used by every site that builds KEY_NS_DATA + pk via build_data_key. */ +static constexpr uint DATA_KEY_BUF_LEN = KEY_NAMESPACE_LEN + MAX_KEY_LENGTH + 1; + +/* Buffer size for a secondary-index CF entry key, comparable index-column + bytes (up to MAX_KEY_LENGTH) + appended PK bytes (up to MAX_KEY_LENGTH) + + 2 bytes of slack that covers VARBINARY length-byte overflow emitted + by make_comparable_key. */ +static constexpr uint SEC_IDX_KEY_BUF_LEN = (MAX_KEY_LENGTH * 2) + 2; + +/* Number of doubles in a 2-D minimum bounding rectangle. Always four + (xmin, ymin, xmax, ymax); used for the on-disk spatial value layout + and the in-memory query-MBR cache on the handler. 
*/ +static constexpr uint SPATIAL_MBR_DIMS = 4; + +/* CF naming */ +static constexpr const char CF_INDEX_INFIX[] = "__idx_"; + +/* Reserved CF for schema discovery (object store mode only) */ +static constexpr const char SCHEMA_CF_NAME[] = "__tidesql_schema"; + +/* Hidden primary key size (tables without explicit PK) */ +static constexpr size_t HIDDEN_PK_SIZE = sizeof(uint64_t); + +/* Maximum number of secondary indexes we support */ +static constexpr uint MAX_TIDESDB_KEYS = MAX_KEY; + +/* Cost model constants for the optimizer */ +static constexpr double TIDESDB_COST_SEQ_READ = 0.00005; +static constexpr double TIDESDB_COST_KEY_READ = 0.00003; +static constexpr double TIDESDB_COST_RANGE_SETUP = 0.0001; +static constexpr double TIDESDB_DEFAULT_READ_AMP = 1.0; + +/* Stats cache refresh interval (microseconds) */ +static constexpr long long TIDESDB_STATS_REFRESH_US = 2000000LL; /* 2 seconds */ + +/* Minimum stats.records to avoid optimizer edge cases with 0 rows */ +static constexpr ha_rows TIDESDB_MIN_STATS_RECORDS = 2; + +/* scan_time() -- split the opaque cost returned by tidesdb_range_cost + between MariaDB's I/O and CPU cost buckets. LSM scans are mostly + block-read bound, so 90% I/O / 10% CPU matches observed profiles. */ +static constexpr double TIDESDB_SCAN_IO_WEIGHT = 0.9; +static constexpr double TIDESDB_SCAN_CPU_WEIGHT = 0.1; + +/* records_in_range() fallbacks when we can't get a useful estimate. */ +static constexpr ha_rows TIDESDB_RIR_DEFAULT_EST = 10; /* no share available */ +static constexpr ha_rows TIDESDB_RIR_UNKNOWN_DENOM = 4; /* total/4 + 1 quarter fallback */ +static constexpr double TIDESDB_RIR_FRACTION_UNRELIABLE = 0.8; /* fall back to rec_per_key */ + +/* Range-width multiplier applied to rec_per_key when tidesdb_range_cost + returned an unreliably high fraction (memtable-only data, narrow range + indistinguishable from full scan). 
Typical OLTP ranges span tens of + key values; 20 keeps the estimate tight while still being vastly + better than the full ratio. */ +static constexpr ha_rows TIDESDB_RIR_RANGE_RPK_MULTIPLIER = 20; + +/* Cap the rec_per_key range fallback at total / N so it never claims + more than this fraction of the table. */ +static constexpr ha_rows TIDESDB_RIR_RANGE_CAP_DENOM = 2; + +/* Sentinel bytes for building full-range bounds that pass through + tidesdb_range_cost or seek primitives. KEY_INF_HI_BYTE fills upper + bound buffers with 0xFF. KEY_INF_LO_BYTE seeds the smallest possible + first byte for secondary-index lower bounds (primary uses KEY_NS_DATA). */ +static constexpr uint8_t KEY_INF_HI_BYTE = 0xFF; +static constexpr uint8_t KEY_INF_LO_BYTE = 0x00; + +/* Row format constants. Every row written by serialize_row carries the + header [ROW_HEADER_MAGIC][null_bytes(2 LE)][field_count(2 LE)] for a + total of ROW_HEADER_SIZE bytes; deserialize_row reads them back to + support instant ADD/DROP COLUMN. */ +static constexpr uchar ROW_HEADER_MAGIC = 0xFE; +static constexpr uint ROW_HEADER_SIZE = 5; + +/* Length prefix Field::pack writes ahead of a wide VARCHAR payload. + Two bytes covers VARCHAR above 255 chars; narrower columns use a + single-byte prefix. */ +static constexpr uint FIELD_VARCHAR_LEN_PREFIX = 2; + +/* Sign-bit XOR mask used to translate a signed integer's MSB into + sortable form (and back). Big-endian sort keys flip this bit so + negative values sort below positive ones lexicographically. */ +static constexpr uint8_t INT_SORT_SIGN_FLIP_MASK = 0x80; + +/* MariaDB packed-field widths used by sort-key decoders. */ +static constexpr uint DATE_PACK_LEN = 3; +static constexpr uint DATETIME_MAX_PACK_LEN = 8; + +/* Sysvar enum index for tidesdb_object_store_backend. 0 = LOCAL, 1 = S3. 
*/
+static constexpr uint OBJSTORE_BACKEND_LOCAL = 0;
+static constexpr uint OBJSTORE_BACKEND_S3 = 1;
+
+/* Separator that joins db and table names when forming a TidesDB CF name
+   from a MariaDB path (e.g. "test/foo" -> "test__foo"). Centralized so
+   path_to_cf_name, schema_cf, and discover stay in sync. */
+static constexpr const char CF_DB_TABLE_SEP[] = "__";
+
+/* Schema CF key encoding: db_name, then one SCHEMA_CF_KEY_SEP byte, then
+   table_name, with no trailing NUL.
+   The null byte separator is unambiguous because MariaDB identifiers
+   cannot contain NUL. Used by schema_cf_key, schema_cf_key_from_path,
+   the discover prefix builders, and the schema_cf_ensure_databases scan. */
+static constexpr char SCHEMA_CF_KEY_SEP = '\0';
+
+/* MariaDB temp-table marker character. Internal temp/exchange tables
+   carry one or more '#' in their on-disk name (e.g. "#sql-..."); we
+   substitute '_' so the resulting CF name remains valid. */
+static constexpr char MARIADB_TEMP_NAME_MARKER = '#';
+static constexpr char MARIADB_TEMP_NAME_REPLACEMENT = '_';
+
+/* Relative-path prefix that MariaDB prepends to table paths handed
+   to handler callbacks ("./db/table"). schema_cf_key_from_path and
+   path_to_cf_name strip it before extracting db/table components. */
+static constexpr const char MARIADB_REL_PATH_PREFIX[] = "./";
+static constexpr size_t MARIADB_REL_PATH_PREFIX_LEN = 2; /* length of "./" without the NUL; keep in step with the line above */
+
+/* MariaDB sort-key null-indicator bytes prepended to nullable key parts
+   in make_comparable_key. Convention 0 sorts NULLs first under memcmp,
+   1 marks a present value. */
+static constexpr uchar SORT_KEY_NULL = 0;
+static constexpr uchar SORT_KEY_NOT_NULL = 1;
+
+/* Slot indices into the 4-double MBR layout used by spatial_qmbr_ and
+   tdb_mbr_t-shaped buffers. Order matches the on-disk SPATIAL_MBR_VALUE_LEN
+   layout [xmin, ymin, xmax, ymax].
*/ +static constexpr uint MBR_XMIN_IDX = 0; +static constexpr uint MBR_YMIN_IDX = 1; +static constexpr uint MBR_XMAX_IDX = 2; +static constexpr uint MBR_YMAX_IDX = 3; + +/* Inclusive bounds of the full 64-bit Hilbert value space. Used when a + spatial query has no decomposable cells (e.g. HA_READ_MBR_DISJOINT) and + we have to scan the entire curve. */ +static constexpr uint64_t HILBERT_RANGE_FULL_LO = 0; +static constexpr uint64_t HILBERT_RANGE_FULL_HI = UINT64_MAX; + +/* Minimum number of point ranges in a multi-range request before our + custom MRR path takes over from MariaDB's default implementation. + Single-range plans bypass MRR so pessimistic row locking still + engages on the index_read_map fast path. */ +static constexpr uint MRR_ACCEPT_MIN_RANGES = 2; + +/* Selectivity values used in info() / analyze() for index rec_per_key. + UNIQUE exactly one row per distinct value. FLOOR smallest plausible + estimate so the optimizer never sees rec_per_key=0 (treated as "unknown"). */ +static constexpr ulong REC_PER_KEY_UNIQUE = 1; +static constexpr ulong REC_PER_KEY_FLOOR = 1; + +/* Divisor used to compute the centroid of an MBR ((min + max) / 2) when + building a Hilbert spatial index key. The centroid is the point that + feeds hilbert_xy2d_64 -- the MBR corners themselves are stored in the + value, not the key. */ +static constexpr double MBR_CENTROID_DIV = 2.0; + +/* Multiplier used to convert a 0..1 ratio (cache hit rate, etc.) into + a percentage for human-readable status output. */ +static constexpr double PERCENT_SCALE = 100.0; + +/* First row id assigned to a freshly created (or fully truncated) + hidden-PK table. Row ids are one-based so that "0" remains a clean + sentinel for "no row id yet" / "uninitialized". */ +static constexpr uint64_t HIDDEN_PK_FIRST_ROW_ID = 1; + +/* Inclusive bounds of a probability / cost fraction in [0, 1]. 
Used to + clamp tidesdb_range_cost ratios in records_in_range so floating-point + noise from the cost estimator can't push the fraction outside its + semantic range. */ +static constexpr double FRACTION_MIN = 0.0; +static constexpr double FRACTION_MAX = 1.0; + +/* Read-amplification value reported when TidesDB has not yet collected + enough statistics to compute a real read_amp. 1.0 means "one disk op + per logical op" -- the optimistic baseline that won't penalize plans. */ +static constexpr double READ_AMP_NONE = 1.0; + +/* Per-document delta values for fts_update_meta when maintaining the + FTS metadata row alongside DML. ADD/DEL track whether a document + was inserted or removed; word-count deltas use the matching sign. */ +static constexpr int FTS_DOC_DELTA_ADD = 1; +static constexpr int FTS_DOC_DELTA_DEL = -1; + +/* mkdir mode used when the discover_table callback creates a missing + database directory under datadir. */ +static constexpr int TIDESDB_DB_DIR_MODE = 0755; + +/* Default ENCRYPTION_KEY_ID applied when a table is opened with + encryption enabled but no explicit key id is provided. Mirrors the + default in the ENCRYPTION_KEY_ID HA_TOPTION_NUMBER declaration. */ +static constexpr uint TIDESDB_DEFAULT_ENCRYPTION_KEY_ID = 1; + +/* Sentinel value stored in TidesDB_share::ttl_field_idx when no TTL + column is configured for the table. Valid TTL field indexes are + non-negative; >= 0 implies a TTL_COL column is present. */ +static constexpr int TIDESDB_TTL_FIELD_NONE = -1; + +/* Fallback divisor when rec_per_key is unset for a non-unique secondary + index in info(). Estimate is total_records / N, biasing toward more + selective lookups (10 ~= one decimal order of magnitude). */ +static constexpr ha_rows STATS_REC_PER_KEY_FALLBACK_DIVISOR = 10; + +/* IEEE-754 double-precision bit layout used by the spatial code's + lexicographic-orderable encoding. 
The sign bit is bit 63 of the 64-bit + representation; LEX_UINT32_HI_SHIFT extracts the high 32 bits after + sign-flipping for big-endian comparison. */ +static constexpr uint64_t IEEE754_DOUBLE_SIGN_MASK = (uint64_t)1 << 63; +static constexpr uint LEX_UINT32_HI_SHIFT = 32; + +/* Number of bits per byte for shift-based byte (de)serialization in the + spatial encoder/decoder loops. Equivalent to CHAR_BIT on POSIX. */ +static constexpr uint BITS_PER_BYTE = 8; + +/* yesno flag values used by the FTS boolean-mode parser to mark each + query term as required (`+term`), excluded (`-term`), or neutral + (just `term`). Compared with `> 0` and `< 0` in the BM25 reducer. */ +static constexpr int FTS_TERM_REQUIRED = 1; +static constexpr int FTS_TERM_EXCLUDED = -1; +static constexpr int FTS_TERM_NEUTRAL = 0; + +/* Operator characters recognized by fts_parse_boolean for queries + issued in `IN BOOLEAN MODE`. These are part of the MariaDB FTS + query DSL, not arbitrary punctuation. */ +static constexpr char FTS_BOOL_OP_REQUIRED = '+'; +static constexpr char FTS_BOOL_OP_EXCLUDED = '-'; +static constexpr char FTS_BOOL_OP_PHRASE = '"'; +static constexpr char FTS_BOOL_OP_TRUNC = '*'; + +/* BM25 (Okapi / Robertson Walker) ranking formula constants. + Used in ft_init_ext to score postings. IDF uses the Lucene + smoothed form, log((N - df + EPS) / (df + EPS) + SHIFT). TF + normalization uses (tf * (k1 + BOOST)) / (tf + k1 * (BASE - b + + b * dl / avgdl)). */ +static constexpr double BM25_IDF_EPSILON = 0.5; +static constexpr double BM25_IDF_NONNEG_SHIFT = 1.0; +static constexpr double BM25_TF_SATURATION_BOOST = 1.0; +static constexpr double BM25_LENGTH_NORM_BASE = 1.0; +/* Fallback average document length when the FTS metadata reports + zero total documents. A value of 1.0 collapses the length + normalization term to neutral so scoring still proceeds. */ +static constexpr double BM25_DEFAULT_AVGDL = 1.0; +/* Floor for total_docs in the IDF denominator. 
Guards std::log + from a divide-by-zero when no documents have been indexed yet. */ +static constexpr int64_t BM25_MIN_TOTAL_DOCS = 1; + +/* Inplace index builds rows between mid-txn commits and between + thd_killed polls. */ +static constexpr ha_rows TIDESDB_INDEX_BUILD_BATCH = 100; + +/* Bulk DML ops between mid-txn commits during start_bulk_insert / + start_bulk_update / start_bulk_delete. Counts both the primary put + and each secondary-index put. */ +static constexpr ha_rows TIDESDB_BULK_INSERT_BATCH_OPS = 500; + +/* Encryption */ +static constexpr uint TIDESDB_ENC_IV_LEN = 16; +static constexpr uint TIDESDB_ENC_KEY_LEN = 32; + +/* Bytes of key-version prefix on every encrypted row blob. The on-disk + layout is the 4-byte little-endian key version, then the IV, then the + ciphertext, so a row always decrypts under the exact key version it was + written with and survives an encryption key rotation. */ +static constexpr uint TIDESDB_ENC_VERSION_LEN = 4; + +/* Bloom filter FPR conversion (table option stores parts per 10000) */ +static constexpr double TIDESDB_BLOOM_FPR_DIVISOR = 10000.0; + +/* Tombstone density trigger conversion (table option stores parts per + 10000; library config is a 0.0..1.0 ratio). */ +static constexpr double TIDESDB_TOMBSTONE_DENSITY_DIVISOR = 10000.0; + +/* Skip list probability conversion (table option stores percentage) */ +static constexpr float TIDESDB_SKIP_LIST_PROB_DIV = 100.0f; + +/* TTL sentinel value meaning no expiration */ +static constexpr time_t TIDESDB_TTL_NONE = (time_t)-1; + +/* Default block cache size (bytes) */ +static constexpr ulonglong TIDESDB_DEFAULT_BLOCK_CACHE = 256ULL * 1024 * 1024; /* 256M */ + +/* + TidesDB_share -- shared state for one table, visible to all handler objects. 
+*/
+class TidesDB_share : public Handler_share
+{
+   public:
+    /* Main data CF */
+    tidesdb_column_family_t *cf;
+    std::string cf_name;
+
+    /* Primary key info */
+    bool has_user_pk;
+    uint pk_index; /* MariaDB key number of the PK (usually 0) */
+    uint pk_key_len;
+
+    /* Hidden PK row-id generator (used when has_user_pk == false) */
+    std::atomic next_row_id;
+
+    /* In-memory AUTO_INCREMENT counter (avoids index_last() per INSERT).
+       Seeded once from index_last() at open time; incremented atomically. */
+    std::atomic auto_inc_val{0};
+
+    /* Per-table isolation level (from CREATE TABLE options) */
+    tidesdb_isolation_level_t isolation_level;
+
+    /* TTL support */
+    ulonglong default_ttl; /* table-level default TTL in seconds (0 = none) */
+    int ttl_field_idx;     /* field index of TTL_COL column (-1 = none) */
+
+    /* Data-at-rest encryption */
+    bool encrypted;
+    uint encryption_key_id;      /* ENCRYPTION_KEY_ID table option (default 1) */
+    uint encryption_key_version; /* cached latest key version */
+
+    /* Cached table shape flags (set once at open time) */
+    bool has_blobs;
+    bool has_ttl;
+    uint num_secondary_indexes; /* count of non-NULL secondary index CFs */
+    size_t cached_row_est{0};   /* cached serialize_row size estimate for non-BLOB tables */
+
+    /* Field indices of BLOB/TEXT columns -- populated at open() when
+       has_blobs is true. serialize_row iterates only these instead of
+       scanning all fields for the BLOB_FLAG. */
+    std::vector blob_field_indices;
+
+    /* Per-field plan for the serialize/deserialize hot path.
+       Built once at open() so the row loops avoid per-row recomputation
+       of `f->ptr - table->record[0]` and skip the Field::pack/unpack
+       vtable dispatch for fields whose pack() is the default memcpy.
+
+       memcpy_ok is true when the field's pack format is exactly
+       `pack_length()` bytes of memcpy (the Field::pack default, used by
+       all integer, FLOAT/DOUBLE, fixed DATETIME/DATE/TIME/TIMESTAMP,
+       YEAR, ENUM, SET, BIT and NEWDECIMAL types). CHAR/VARCHAR/BLOB/
+       VARBINARY/GEOMETRY/JSON keep the slow path because their pack()
+       trims trailing spaces or emits a length prefix.
+
+       maybe_null is cached so the loop branches off a single bool
+       instead of calling Field::real_maybe_null() per row.
+       NOTE(review): the member comment below says the cached value is
+       f->maybe_null() and explicitly NOT real_maybe_null(); the two
+       descriptions disagree -- confirm against open() which call the
+       cached flag stands in for.
+
+       src_off is the field's offset within table->record[0] -- the loops
+       still rebase by ptrdiff at runtime so the same plan serves reads
+       and writes that target record[1] too. */
+    struct field_plan_t
+    {
+        uint32 src_off;  /* offset within table->record[0] */
+        uint16 pack_len; /* f->pack_length(), used when memcpy_ok */
+        bool memcpy_ok;  /* true -> inline memcpy; false -> Field::pack */
+        bool maybe_null; /* cached f->maybe_null() (NOT real_maybe_null) */
+    };
+    std::vector field_plan;
+    bool has_no_nullable{false}; /* presumably true when no field in the plan is nullable -- TODO confirm in open() */
+    uint8 null_bytes_cached{0}; /* cached table->s->null_bytes */
+    uint16 fields_cached{0};    /* cached table->s->fields */
+
+    /* Cached scan_time range cost (refreshed every TIDESDB_STATS_REFRESH_US) */
+    std::atomic cached_scan_cost{0.0};
+    std::atomic scan_cost_time{0};
+
+    /* records_in_range needs a full-range cost as the normalizer; without
+       a cache it recomputes that for every probe of every alternative
+       plan. Stored per CF -- one atomic for the data CF, one array per
+       secondary index -- refreshed with the same TIDESDB_STATS_REFRESH_US
+       window. std::atomic is not move-constructible so the
+       per-index storage uses a fixed-size array owned by a unique_ptr
+       and sized in open(), rather than a resizable container.
+       A stale read just produces a slightly stale estimate. */
+    std::atomic cached_pk_full_cost{0.0};
+    std::atomic cached_pk_full_cost_time{0};
+    std::unique_ptr[]> cached_idx_full_cost;
+    std::unique_ptr[]> cached_idx_full_cost_time;
+    uint cached_idx_full_cost_n{0}; /* presumably the element count of the two arrays above -- TODO confirm */
+
+    /* Table timestamps for information_schema.TABLES */
+    time_t create_time{0};       /* from .frm stat at first open */
+    std::atomic update_time{0}; /* bumped on DML (write/update/delete) */
+
+    /* Cached stats -- avoid expensive tidesdb_get_stats per statement.
+       Refreshed at most every 2 seconds; read with relaxed atomics. */
+    std::atomic cached_records{0};
+    std::atomic cached_data_size{0};     /* total_data_size from CF */
+    std::atomic cached_idx_data_size{0}; /* sum of secondary CF sizes */
+    std::atomic cached_mean_rec_len{0};  /* avg_key_size + avg_value_size */
+    std::atomic stats_refresh_us{0};     /* presumably the time of the last refresh, in microseconds -- TODO confirm */
+    std::atomic cached_read_amp{1.0};    /* read amplification factor */
+
+    /* Precomputed comparable key length per index (avoids per-row recomputation) */
+    uint idx_comp_key_len[MAX_KEY];
+
+    /* Precomputed index-type flags (avoid ki->algorithm dereference per row
+       in DML secondary-index loops). Populated at open() and refreshed
+       during online DDL. */
+    bool idx_is_fts[MAX_KEY];
+    bool idx_is_spatial[MAX_KEY];
+
+    /* Cached rec_per_key for secondary indexes (populated by ANALYZE TABLE).
+       0 = not yet computed, use heuristic; >0 = sampled value. */
+    std::atomic cached_rec_per_key[MAX_KEY];
+
+    /* Secondary index CFs (one per secondary key) */
+    std::vector idx_cfs;
+    std::vector idx_cf_names;
+
+    /* Per-index covered-field map used by try_keyread_from_index. For each
+       index i, idx_cover[i][c] == true when field `c` can be
+       reconstructed from the index key bytes (i.e. field is in the index's
+       key parts or -- for secondary indexes -- in the PK parts appended
+       to the key). Replaces the O(read_set_bits * (pk_parts + idx_parts))
+       nested scan the old code did on every covered read. */
+    std::vector> idx_cover;
+
+    TidesDB_share();
+    ~TidesDB_share();
+};
+
+/*
+  Context passed between Online DDL phases (prepare -> inplace -> commit).
+  Holds the new/dropped CF pointers so commit can finalize atomically.
+*/ +class ha_tidesdb_inplace_ctx : public inplace_alter_handler_ctx +{ + public: + /* CFs created for newly added indexes (populated during inplace phase) */ + std::vector add_cfs; + std::vector add_cf_names; + std::vector add_key_nums; /* position in new key_info */ + + /* CF names to drop for removed indexes (dropped during commit phase) */ + std::vector drop_cf_names; + + virtual ~ha_tidesdb_inplace_ctx() + { + } +}; + +/* Pessimistic lock mode. Shared is read-intent and compatible with itself, + exclusive is write-intent and conflicts with everything. Declared here + because tidesdb_trx_t carries a waiting_on_mode field. */ +enum tdb_lock_mode_t +{ + TDB_LOCK_MODE_S = 0, + TDB_LOCK_MODE_X = 1, +}; + +/* Per-txn accumulator entry for one FTS index's metadata key. The + plugin folds the per-row delta_docs and delta_words contributions + from every write_row / update_row / delete_row in a transaction here + and writes one combined update at commit time, so the FTS meta key + does not become a write-write serialisation point under concurrent + writers and a long statement does not produce N read-modify-writes + on the same key. */ +struct fts_meta_delta_t +{ + tidesdb_column_family_t *data_cf; + uint keynr; + int64_t doc_delta; + int64_t word_delta; +}; + +/* + Per-connection TidesDB transaction context. + Stored via thd_set_ha_data(); shared by all handler objects on the + same connection. The TidesDB txn spans the entire BEGIN...COMMIT + block (or a single auto-commit statement). +*/ +struct tidesdb_trx_t +{ + tidesdb_txn_t *txn{nullptr}; + bool dirty{false}; /* true once any DML uses txn */ + bool stmt_savepoint_active{false}; /* true while a "stmt" savepoint exists */ + bool needs_reset{false}; /* true after commit/rollback; cleared after txn_reset */ + tidesdb_isolation_level_t isolation_level{TDB_ISOLATION_REPEATABLE_READ}; + uint64_t txn_generation{0}; + + /* Plugin-level row lock state for this txn. 
The lock manager supports + shared (read-intent) and exclusive (write-intent) modes; multiple S + holders coexist on the same lock, X blocks any other holder, and a + new S blocks while an X is queued so writers are never starved by a + stream of readers. Locks are acquired from write_row, fetch_row_by_pk, + and iter_read_current depending on session isolation and write intent, + and released en masse at commit or rollback. */ + + struct tdb_lock_request_t *held_locks_head{nullptr}; + + /* What this txn is currently waiting for, published as two fields the + deadlock walker can read lock-free from other partitions without ever + dereferencing a request struct. Lock entries themselves are never + freed at runtime (find_or_create recycles empty slots in place), so + waiting_on_lock is always a safe pointer to follow. The writing + thread stores waiting_on_mode before waiting_on_lock with release + ordering, and walkers load waiting_on_lock with acquire then read the + mode, so a walker that sees a non-null lock pointer also sees the + matching mode. */ + std::atomic waiting_on_lock{nullptr}; + tdb_lock_mode_t waiting_on_mode{TDB_LOCK_MODE_S}; + + /* Per-statement FTS meta deltas, applied before tidesdb_commit hands + the txn to the library so the meta update lands in the same commit + as the row writes that produced it. */ + std::vector fts_meta_pending; + bool fts_meta_dirty{false}; +}; + +/* + ha_tidesdb -- per-connection handler object. +*/ +class ha_tidesdb : public handler +{ + TidesDB_share *share; + + /* Points into the per-connection tidesdb_trx_t::txn. + Set in external_lock(), cleared in external_lock(F_UNLCK). 
*/ + tidesdb_txn_t *stmt_txn; + bool stmt_txn_dirty; /* true once any DML uses stmt_txn */ + + /* Scan / index-scan state (iterator lives on stmt_txn when available) */ + tidesdb_txn_t *scan_txn; + tidesdb_iter_t *scan_iter; + tidesdb_column_family_t *scan_cf_; /* CF for lazy iterator creation */ + tidesdb_column_family_t *scan_iter_cf_; /* CF the cached scan_iter was created for */ + tidesdb_txn_t *scan_iter_txn_; /* txn the cached scan_iter was created on */ + uint64_t scan_iter_txn_gen_; /* txn_generation when scan_iter was created */ + bool idx_pk_exact_done_; /* deferred seek after PK exact */ + enum scan_dir_t + { + DIR_NONE, + DIR_FORWARD, + DIR_BACKWARD + } scan_dir_; + std::string last_row; /* keeps BLOB data alive for record[0] */ + std::string last_row2; /* keeps BLOB data alive for record[1] */ + + /* Spatial index scan state */ + bool spatial_scan_active_{false}; + enum ha_rkey_function spatial_mode_ + { + HA_READ_KEY_EXACT + }; + double spatial_qmbr_[SPATIAL_MBR_DIMS]{}; /* query MBR (xmin, ymin, xmax, ymax) */ + + /* Hilbert range decomposition are sorted non-overlapping [lo, hi] ranges + covering the query box. spatial_range_idx_ tracks which range we're + currently scanning. */ + std::vector> spatial_ranges_; /* {lo, hi} */ + size_t spatial_range_idx_{0}; + + /* Spatial scan continuation -- scans Hilbert range with MBR post-filter */ + int spatial_scan_next(uchar *buf); + + /* Current row's PK key bytes (without namespace prefix). + Fixed buffer eliminates std::string heap allocation per row. */ + uchar current_pk_buf_[MAX_KEY_LENGTH]; + uint current_pk_len_; + + /* Reusable buffer for serialize_row (retains heap capacity) */ + std::string row_buf_; + + /* Cached comparable search key from index_read_map for index_next_same */ + uchar idx_search_comp_[MAX_KEY_LENGTH]; + uint idx_search_comp_len_; + + /* True when index_read_map landed on a partial-PK exact prefix scan and + defers iteration to index_next. 
index_next's PK branch must then + re-validate the prefix after each step, the same way the secondary + branch and index_next_same already do, or it would walk off the + prefix and return unrelated rows. */ + bool pk_partial_exact_active_{false}; + + /* Reusable buffers for secondary index key construction in update_row. + Avoids heap allocation per row and keeps the stack frame small. */ + uchar upd_old_ik_[SEC_IDX_KEY_BUF_LEN]; + uchar upd_new_ik_[SEC_IDX_KEY_BUF_LEN]; + + /* Cached dup-check iterators for UNIQUE secondary indexes. + tidesdb_iter_new() is O(num_sstables) -- caching avoids rebuilding + the merge heap on every INSERT for tables with unique indexes. */ + tidesdb_iter_t *dup_iter_cache_[MAX_KEY]; + tidesdb_txn_t *dup_iter_txn_[MAX_KEY]; /* txn each was created on */ + uint64_t dup_iter_txn_gen_[MAX_KEY]; /* txn_generation when created */ + uint dup_iter_count_; /* number of slots populated */ + + /* Reusable buffer for tidesdb_txn_get values -- avoids malloc/free per + point-lookup. Retains heap capacity across calls. */ + std::string get_val_buf_; + + /* Separate encryption output buffer so row_buf_ retains its heap + capacity across rows. serialize_row writes plaintext into row_buf_ + and the encrypted blob into enc_buf_. */ + std::string enc_buf_; + + /* Per-statement cached encryption key version -- avoids calling + encryption_key_get_latest_version() on every single row write. */ + uint cached_enc_key_ver_; + bool enc_key_ver_valid_; + + /* Per-statement cached time(NULL) -- avoids the vDSO/syscall on every + row for TTL computation. 1-second granularity is sufficient for TTL. */ + time_t cached_time_; + bool cached_time_valid_; + + /* Per-statement cached THDVAR lookups -- avoids the indirect + thd_get_ha_data + offset computation on every row. 
*/ + ulonglong cached_sess_ttl_; + bool cached_skip_unique_; + bool cached_single_delete_primary_; + bool cached_thdvars_valid_; + + /* Write-lock mode -- set when store_lock detects FOR UPDATE / write intent. + Used to decide whether to acquire row locks in index_read_map. */ + bool stmt_has_write_lock_; + + /* True for UPDATE / DELETE statements -- set in external_lock(F_WRLCK) + from cached_sql_cmd_. iter_read_current uses this to skip the + per-row X lock during ICP filtering; update_row / delete_row + reacquire on the row they actually mutate. SELECT ... FOR UPDATE + leaves this false so the locking-cursor contract is preserved. */ + bool stmt_is_update_or_delete_{false}; + + /* Cached "is this scan on the primary key" flag. Set once in index_init + so the navigation methods (index_next/prev/first/last/next_same) skip + the per-row `share->has_user_pk && active_index == share->pk_index` + recomputation. */ + bool is_pk_; + + /* Cached last tidesdb_iter_new failure for the current scan CF/txn. + When non-zero and the scan_cf_/scan_txn haven't changed, ensure_scan_iter + returns the prior error immediately instead of retrying + re-logging. */ + int scan_iter_last_err_; + tidesdb_column_family_t *scan_iter_last_err_cf_; + tidesdb_txn_t *scan_iter_last_err_txn_; + + /* Handler mirrors of share->has_blobs / share->encrypted. Per-row + fetches and scans branch on these; reading them from a handler member + avoids the shared-memory dereference that dominates when the L1 line + for `share` isn't already hot. */ + bool has_blobs_; + bool encrypted_; + + /* Cached bounds of table->record[1] so the BLOB path of fetch_row_by_pk + and iter_read_current can classify `buf` against record[0] vs record[1] + without dereferencing `table->record[1]` and `table->s->reclength` on + every row. 
*/ + const uchar *record1_lo_; + const uchar *record1_hi_; + + /* Cached per-statement THD query shape so ensure_stmt_txn() and + external_lock() don't each re-evaluate thd_sql_command() and + thd_test_options(). Populated by external_lock(F_WRLCK/F_RDLCK); + invalidated by external_lock(F_UNLCK) along with the other per-stmt + caches. */ + int cached_sql_cmd_; + bool cached_is_autocommit_; + bool cached_stmt_shape_valid_; + + /* Cached per-statement pointers to avoid repeated hash lookups. + Set in external_lock(lock), cleared in external_lock(F_UNLCK). + InnoDB caches these as m_user_thd / m_prebuilt->trx. */ + THD *cached_thd_; /* avoids ha_thd() virtual dispatch */ + tidesdb_trx_t *cached_trx_; /* avoids thd_get_ha_data() hash lookup */ + + /* Bulk DML state. The ops counter is shared across insert/update/delete + bulk modes since only one can be active at a time and they all use the + same TIDESDB_BULK_INSERT_BATCH_OPS threshold. */ + bool in_bulk_insert_; + bool in_bulk_update_; + bool in_bulk_delete_; + ha_rows bulk_insert_ops_; /* ops buffered since last mid-txn commit */ + + /* Auto-compact-after-range-delete tracking. When the session var + tidesdb_compact_after_range_delete_min_rows is non-zero, delete_row + updates these fields with the comparable PK bytes of each deleted + row, and end_bulk_delete fires tidesdb_compact_range over the + observed [min_pk, max_pk] range if the deleted-row count meets the + threshold. Cleared on start_bulk_delete and on each + cached-THDVAR refresh. */ + ulonglong cached_compact_after_range_delete_min_rows_; + ha_rows bulk_delete_rows_; + std::string bulk_delete_min_pk_; + std::string bulk_delete_max_pk_; + + /* Multi-Range Read state. We accept MRR when every range the optimizer + hands us is UNIQUE_RANGE|EQ_RANGE (i.e. the WHERE col IN (...) case on + a full key) and fall back to the default MRR->read_range_first path for + everything else. 
Accepted ranges are buffered + sorted by comparable + key bytes so the LSM sees a monotone stream of seeks. */ + struct tdb_mrr_entry + { + std::string comp_key; /* comparable PK / index bytes */ + range_id_t ptr; /* value returned to caller as *range_info */ + }; + bool mrr_custom_active_; + bool mrr_no_assoc_; + uint mrr_keyno_; + std::vector mrr_entries_; + size_t mrr_next_idx_; + + /* Covering-index mode (HA_EXTRA_KEYREAD) */ + bool keyread_only_; + bool write_can_replace_; /* true during REPLACE INTO (HA_EXTRA_WRITE_CAN_REPLACE) */ + + /* private helpers + */ + int ensure_stmt_txn(); /* lazy txn creation on first data access */ + TidesDB_share *get_share(); + const std::string &serialize_row(const uchar *buf); + void deserialize_row(uchar *buf, const uchar *data, size_t len); + void deserialize_row(uchar *buf, const std::string &row); + + /* Build memcmp-comparable key bytes into out[]; returns byte count */ + uint make_comparable_key(KEY *key_info, const uchar *record, uint num_parts, uchar *out); + + /* Convert key_copy-format search key directly to comparable bytes */ + uint key_copy_to_comparable(KEY *key_info, const uchar *key_buf, uint key_len, uchar *out); + + /* Build PK bytes from a record buffer into out[]; returns byte count */ + uint pk_from_record(const uchar *record, uchar *out); + + /* Build KEY_NS_DATA + pk into out[]; returns byte count */ + static uint build_data_key(const uchar *pk, uint pk_len, uchar *out) + { + out[0] = KEY_NS_DATA; + memcpy(out + KEY_NAMESPACE_LEN, pk, pk_len); + return pk_len + KEY_NAMESPACE_LEN; + } + + /* Build a secondary-index entry key into out[]; returns byte count */ + uint sec_idx_key(uint idx, const uchar *record, uchar *out); + + /* Fetch a row by its PK bytes into buf; sets current_pk + last_row */ + int fetch_row_by_pk(tidesdb_txn_t *txn, const uchar *pk, uint pk_len, uchar *buf); + + /* Compute the absolute TTL timestamp for a row being written. 
+ Reads per-row TTL_COL value if present, else uses table default. + Returns -1 (no expiration) or a future Unix timestamp. */ + time_t compute_row_ttl(const uchar *buf); + + /* Read current iterator entry (data-CF), decode row into buf. + Returns 0 or HA_ERR_END_OF_FILE / HA_ERR_KEY_NOT_FOUND. */ + int iter_read_current(uchar *buf); + + /* Lazily create scan_iter from scan_cf_ when first needed */ + int ensure_scan_iter(); + + /* Try to decode record from secondary index key (keyread-only) */ + bool try_keyread_from_index(const uint8_t *ik, size_t iks, uint idx, uchar *buf); + + /* Evaluate pushed index condition on a secondary-index entry before + the expensive PK point-lookup. Decodes the index key columns into + buf and calls handler_index_cond_check(). + Returns CHECK_POS -- condition satisfied, proceed with PK lookup + CHECK_NEG -- condition not satisfied, skip this entry + CHECK_OUT_OF_RANGE -- past end of scan range + CHECK_ABORTED_BY_USER -- query killed */ + check_result_t icp_check_secondary(const uint8_t *ik, size_t iks, uint idx, uchar *buf); + + /* Reverse a single integer sort-key part back to native little-endian + at `to` (destination byte pointer computed once by the caller). + Returns true on success, false for unsupported sort_len. */ + static bool decode_int_sort_key(const uint8_t *src, uint sort_len, bool is_signed, uchar *to); + + /* Extended sort-key decoder -- handles integers, DATE, DATETIME, + TIMESTAMP, YEAR, and fixed-length CHAR/BINARY. Returns true on + success, false for unsupported types. Used by covering index + reads and ICP evaluation to avoid PK point-lookups. */ + static bool decode_sort_key_part(const uint8_t *src, uint sort_len, Field *f, uchar *buf); + + /* Free all cached dup-check iterators */ + void free_dup_iter_cache(); + + /* Commit the current txn mid-statement when a bulk op crosses the batch + threshold, then reset it to READ_COMMITTED for the next batch. Shared + between bulk INSERT/UPDATE/DELETE. 
Invalidates cached iterators. + Returns 0 on success, handler error code on fatal failure. */ + int maybe_bulk_commit(tidesdb_trx_t *trx); + + /* Recover hidden-PK counter by scanning the CF */ + void recover_counters(); + + public: + ha_tidesdb(handlerton *hton, TABLE_SHARE *table_arg); + ~ha_tidesdb() override = default; + + ulonglong table_flags() const override + { + return HA_BINLOG_STMT_CAPABLE | HA_BINLOG_ROW_CAPABLE | HA_NULL_IN_KEY | + HA_PRIMARY_KEY_IN_READ_INDEX | HA_TABLE_SCAN_ON_INDEX | HA_CAN_VIRTUAL_COLUMNS | + HA_FAST_KEY_READ | HA_REC_NOT_IN_SEQ | HA_CAN_SQL_HANDLER | + HA_REQUIRES_KEY_COLUMNS_FOR_DELETE | HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | + HA_ONLINE_ANALYZE | HA_CAN_ONLINE_BACKUPS | HA_CONCURRENT_OPTIMIZE | + HA_CAN_TABLES_WITHOUT_ROLLBACK | HA_CAN_FULLTEXT | HA_CAN_FULLTEXT_EXT | + HA_CAN_GEOMETRY | HA_CAN_RTREEKEYS | HA_CAN_EXPORT; + } + + ulong index_flags(uint idx, uint part, bool all_parts) const override; + + const char *index_type(uint key_number) override; + + uint max_supported_record_length() const override + { + return HA_MAX_REC_LENGTH; + } + uint max_supported_keys() const override + { + return MAX_TIDESDB_KEYS; + } + uint max_supported_key_parts() const override + { + return MAX_REF_PARTS; + } + uint max_supported_key_length() const override + { + return MAX_KEY_LENGTH; + } + + IO_AND_CPU_COST keyread_time(uint index, ulong ranges, ha_rows rows, ulonglong blocks) override + { + /* Index read -- each point lookup touches read_amp levels. + Range scans amortize the merge-heap cost across rows. */ + IO_AND_CPU_COST cost; + cost.io = 0; + double amp = share ? share->cached_read_amp.load(std::memory_order_relaxed) + : TIDESDB_DEFAULT_READ_AMP; + cost.cpu = + (double)rows * TIDESDB_COST_KEY_READ * amp + (double)ranges * TIDESDB_COST_RANGE_SETUP; + return cost; + } + + IO_AND_CPU_COST rnd_pos_time(ha_rows rows) override + { + /* Random position lookup -- each is a point-get through LSM levels. 
+ More expensive than sequential due to read amplification. */ + IO_AND_CPU_COST cost; + cost.io = 0; + double amp = share ? share->cached_read_amp.load(std::memory_order_relaxed) + : TIDESDB_DEFAULT_READ_AMP; + cost.cpu = (double)rows * TIDESDB_COST_SEQ_READ * amp; + return cost; + } + + /* Convert a MariaDB table path to a TidesDB column family name */ + static std::string path_to_cf_name(const char *path); + + /* DDL */ + int open(const char *name, int mode, uint test_if_locked) override; + int close(void) override; + int create(const char *name, TABLE *form, HA_CREATE_INFO *create_info) override; + int delete_table(const char *name) override; + int rename_table(const char *from, const char *to) override; + + /* Full table scan */ + int rnd_init(bool scan) override; + int rnd_end() override; + int rnd_next(uchar *buf) override; + int rnd_pos(uchar *buf, uchar *pos) override; + void position(const uchar *record) override; + + /* Index scan */ + int index_init(uint idx, bool sorted) override; + int index_end() override; + int index_read_map(uchar *buf, const uchar *key, key_part_map keypart_map, + enum ha_rkey_function find_flag) override; + int index_next(uchar *buf) override; + int index_prev(uchar *buf) override; + int index_first(uchar *buf) override; + int index_last(uchar *buf) override; + int index_next_same(uchar *buf, const uchar *key, uint keylen) override; + + /* DML */ + int write_row(const uchar *buf) override; + int update_row(const uchar *old_data, const uchar *new_data) override; + int delete_row(const uchar *buf) override; + int delete_all_rows(void) override; + + /* Full-text search */ + int ft_init() override; + void ft_end() override; + FT_INFO *ft_init_ext(uint flags, uint inx, String *key) override; + int ft_read(uchar *buf) override; + + /* Bulk insert hint (LOAD DATA, multi-row INSERT) */ + void start_bulk_insert(ha_rows rows, uint flags) override; + int end_bulk_insert() override; + + /* Bulk UPDATE / DELETE hints -- let multi-row 
UPDATE/DELETE share the + same mid-txn commit batching as bulk INSERT so long statements don't + blow past TDB_MAX_TXN_OPS or balloon txn memory. */ + bool start_bulk_update() override; + int end_bulk_update() override; + int bulk_update_row(const uchar *old_data, const uchar *new_data, + ha_rows *dup_key_found) override; + bool start_bulk_delete() override; + int end_bulk_delete() override; + + /* Index Condition Pushdown (ICP) */ + Item *idx_cond_push(uint keyno, Item *idx_cond) override; + + /* Multi-Range Read (MRR). We opt into a custom implementation for + point-only range sequences and defer to the base handler for + everything else by leaving HA_MRR_USE_DEFAULT_IMPL set. */ + ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, void *seq_init_param, + uint n_ranges, uint *bufsz, uint *mrr_mode, ha_rows limit, + Cost_estimate *cost) override; + int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param, uint n_ranges, uint mrr_mode, + HANDLER_BUFFER *buf) override; + int multi_range_read_next(range_id_t *range_info) override; + + /* AUTO_INCREMENT -- O(1) atomic counter */ + void get_auto_increment(ulonglong offset, ulonglong increment, ulonglong nb_desired_values, + ulonglong *first_value, ulonglong *nb_reserved_values) override; + + /* Reset the in-memory auto-increment counter so `TRUNCATE TABLE t` and + `ALTER TABLE t AUTO_INCREMENT=N` take effect. Base default is a no-op, + which left TidesDB's cached counter running past TRUNCATE -- the next + INSERT would return a stale value instead of restarting at 1 (or N). 
*/ + int reset_auto_increment(ulonglong value) override; + + /* Stats / Maintenance */ + int info(uint flag) override; + int analyze(THD *thd, HA_CHECK_OPT *check_opt) override; + int optimize(THD *thd, HA_CHECK_OPT *check_opt) override; + int check(THD *thd, HA_CHECK_OPT *check_opt) override; + int repair(THD *thd, HA_CHECK_OPT *check_opt) override; + ha_rows records_in_range(uint inx, const key_range *min_key, const key_range *max_key, + page_range *pages) override; + int extra(enum ha_extra_function operation) override; + + private: + public: + protected: + IO_AND_CPU_COST scan_time() override; + + public: + /* Locking -- TidesDB handles concurrency via MVCC internally. + lock_count()=0 bypasses MariaDB's THR_LOCK. */ + uint lock_count(void) const override + { + return 0; + } + int external_lock(THD *thd, int lock_type) override; + THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to, enum thr_lock_type lock_type) override; + + /* Online DDL -- instant metadata, inplace indexes, copy for columns */ + enum_alter_inplace_result check_if_supported_inplace_alter( + TABLE *altered_table, Alter_inplace_info *ha_alter_info) override; + bool prepare_inplace_alter_table(TABLE *altered_table, + Alter_inplace_info *ha_alter_info) override; + bool inplace_alter_table(TABLE *altered_table, Alter_inplace_info *ha_alter_info) override; + bool commit_inplace_alter_table(TABLE *altered_table, Alter_inplace_info *ha_alter_info, + bool commit) override; + bool check_if_incompatible_data(HA_CREATE_INFO *create_info, uint table_changes) override; +}; diff --git a/storage/tidesdb/libtidesdb/external/ini.c b/storage/tidesdb/libtidesdb/external/ini.c new file mode 100644 index 0000000000000..08333cf0e1dcb --- /dev/null +++ b/storage/tidesdb/libtidesdb/external/ini.c @@ -0,0 +1,328 @@ +/* inih -- simple .INI file parser + +SPDX-License-Identifier: BSD-3-Clause + +Copyright (C) 2009-2025, Ben Hoyt + +inih is released under the New BSD license (see LICENSE.txt). 
Go to the project +home page for more info: + +https://github.com/benhoyt/inih + +*/ + +#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS) +#define _CRT_SECURE_NO_WARNINGS +#endif + +#include +#include +#include + +#include "ini.h" + +#if !INI_USE_STACK +#if INI_CUSTOM_ALLOCATOR +#include +void* ini_malloc(size_t size); +void ini_free(void* ptr); +void* ini_realloc(void* ptr, size_t size); +#else +#include +#define ini_malloc malloc +#define ini_free free +#define ini_realloc realloc +#endif +#endif + +#define MAX_SECTION 50 +#define MAX_NAME 50 + +/* Used by ini_parse_string() to keep track of string parsing state. */ +typedef struct { + const char* ptr; + size_t num_left; +} ini_parse_string_ctx; + +/* Strip whitespace chars off end of given string, in place. end must be a + pointer to the NUL terminator at the end of the string. Return s. */ +static char* ini_rstrip(char* s, char* end) +{ + while (end > s && isspace((unsigned char)(*--end))) + *end = '\0'; + return s; +} + +/* Return pointer to first non-whitespace char in given string. */ +static char* ini_lskip(const char* s) +{ + while (*s && isspace((unsigned char)(*s))) + s++; + return (char*)s; +} + +/* Return pointer to first char (of chars) or inline comment in given string, + or pointer to NUL at end of string if neither found. Inline comment must + be prefixed by a whitespace character to register as a comment. */ +static char* ini_find_chars_or_comment(const char* s, const char* chars) +{ +#if INI_ALLOW_INLINE_COMMENTS + int was_space = 0; + while (*s && (!chars || !strchr(chars, *s)) && + !(was_space && strchr(INI_INLINE_COMMENT_PREFIXES, *s))) { + was_space = isspace((unsigned char)(*s)); + s++; + } +#else + while (*s && (!chars || !strchr(chars, *s))) { + s++; + } +#endif + return (char*)s; +} + +/* Similar to strncpy, but ensures dest (size bytes) is + NUL-terminated, and doesn't pad with NULs. 
*/ +static char* ini_strncpy0(char* dest, const char* src, size_t size) +{ + /* Could use strncpy internally, but it causes gcc warnings (see issue #91) */ + size_t i; + for (i = 0; i < size - 1 && src[i]; i++) + dest[i] = src[i]; + dest[i] = '\0'; + return dest; +} + +/* See documentation in header file. */ +int ini_parse_stream(ini_reader reader, void* stream, ini_handler handler, + void* user) +{ + /* Uses a fair bit of stack (use heap instead if you need to) */ +#if INI_USE_STACK + char line[INI_MAX_LINE]; + size_t max_line = INI_MAX_LINE; +#else + char* line; + size_t max_line = INI_INITIAL_ALLOC; +#endif +#if INI_ALLOW_REALLOC && !INI_USE_STACK + char* new_line; +#endif + char section[MAX_SECTION] = ""; +#if INI_ALLOW_MULTILINE + char prev_name[MAX_NAME] = ""; +#endif + + size_t offset; + char* start; + char* end; + char* name; + char* value; + int lineno = 0; + int error = 0; + char abyss[16]; /* Used to consume input when a line is too long. */ + size_t abyss_len; + +#if !INI_USE_STACK + line = (char*)ini_malloc(INI_INITIAL_ALLOC); + if (!line) { + return -2; + } +#endif + +#if INI_HANDLER_LINENO +#define HANDLER(u, s, n, v) handler(u, s, n, v, lineno) +#else +#define HANDLER(u, s, n, v) handler(u, s, n, v) +#endif + + /* Scan through stream line by line */ + while (reader(line, (int)max_line, stream) != NULL) { + offset = strlen(line); + +#if INI_ALLOW_REALLOC && !INI_USE_STACK + while (max_line < INI_MAX_LINE && + offset == max_line - 1 && line[offset - 1] != '\n') { + max_line *= 2; + if (max_line > INI_MAX_LINE) + max_line = INI_MAX_LINE; + new_line = ini_realloc(line, max_line); + if (!new_line) { + ini_free(line); + return -2; + } + line = new_line; + if (reader(line + offset, (int)(max_line - offset), stream) == NULL) + break; + offset += strlen(line + offset); + } +#endif + + lineno++; + + /* If line exceeded INI_MAX_LINE bytes, discard till end of line. 
*/ + if (offset == max_line - 1 && line[offset - 1] != '\n') { + while (reader(abyss, sizeof(abyss), stream) != NULL) { + if (!error) + error = lineno; + abyss_len = strlen(abyss); + if (abyss_len > 0 && abyss[abyss_len - 1] == '\n') + break; + } + } + + start = line; +#if INI_ALLOW_BOM + if (lineno == 1 && (unsigned char)start[0] == 0xEF && + (unsigned char)start[1] == 0xBB && + (unsigned char)start[2] == 0xBF) { + start += 3; + } +#endif + start = ini_rstrip(ini_lskip(start), line + offset); + + if (strchr(INI_START_COMMENT_PREFIXES, *start)) { + /* Start-of-line comment */ + } +#if INI_ALLOW_MULTILINE + else if (*prev_name && *start && start > line) { +#if INI_ALLOW_INLINE_COMMENTS + end = ini_find_chars_or_comment(start, NULL); + *end = '\0'; + ini_rstrip(start, end); +#endif + /* Non-blank line with leading whitespace, treat as continuation + of previous name's value (as per Python configparser). */ + if (!HANDLER(user, section, prev_name, start) && !error) + error = lineno; + } +#endif + else if (*start == '[') { + /* A "[section]" line */ + end = ini_find_chars_or_comment(start + 1, "]"); + if (*end == ']') { + *end = '\0'; + ini_strncpy0(section, start + 1, sizeof(section)); +#if INI_ALLOW_MULTILINE + *prev_name = '\0'; +#endif +#if INI_CALL_HANDLER_ON_NEW_SECTION + if (!HANDLER(user, section, NULL, NULL) && !error) + error = lineno; +#endif + } + else if (!error) { + /* No ']' found on section line */ + error = lineno; + } + } + else if (*start) { + /* Not a comment, must be a name[=:]value pair */ + end = ini_find_chars_or_comment(start, "=:"); + if (*end == '=' || *end == ':') { + *end = '\0'; + name = ini_rstrip(start, end); + value = end + 1; +#if INI_ALLOW_INLINE_COMMENTS + end = ini_find_chars_or_comment(value, NULL); + *end = '\0'; +#endif + value = ini_lskip(value); + ini_rstrip(value, end); + +#if INI_ALLOW_MULTILINE + ini_strncpy0(prev_name, name, sizeof(prev_name)); +#endif + /* Valid name[=:]value pair found, call handler */ + if 
(!HANDLER(user, section, name, value) && !error) + error = lineno; + } + else { + /* No '=' or ':' found on name[=:]value line */ +#if INI_ALLOW_NO_VALUE + *end = '\0'; + name = ini_rstrip(start, end); + if (!HANDLER(user, section, name, NULL) && !error) + error = lineno; +#else + if (!error) + error = lineno; +#endif + } + } + +#if INI_STOP_ON_FIRST_ERROR + if (error) + break; +#endif + } + +#if !INI_USE_STACK + ini_free(line); +#endif + + return error; +} + +/* See documentation in header file. */ +int ini_parse_file(FILE* file, ini_handler handler, void* user) +{ + return ini_parse_stream((ini_reader)fgets, file, handler, user); +} + +/* See documentation in header file. */ +int ini_parse(const char* filename, ini_handler handler, void* user) +{ + FILE* file; + int error; + + file = fopen(filename, "r"); + if (!file) + return -1; + error = ini_parse_file(file, handler, user); + fclose(file); + return error; +} + +/* An ini_reader function to read the next line from a string buffer. This + is the fgets() equivalent used by ini_parse_string(). */ +static char* ini_reader_string(char* str, int num, void* stream) { + ini_parse_string_ctx* ctx = (ini_parse_string_ctx*)stream; + const char* ctx_ptr = ctx->ptr; + size_t ctx_num_left = ctx->num_left; + char* strp = str; + char c; + + if (ctx_num_left == 0 || num < 2) + return NULL; + + while (num > 1 && ctx_num_left != 0) { + c = *ctx_ptr++; + ctx_num_left--; + *strp++ = c; + if (c == '\n') + break; + num--; + } + + *strp = '\0'; + ctx->ptr = ctx_ptr; + ctx->num_left = ctx_num_left; + return str; +} + +/* See documentation in header file. */ +int ini_parse_string(const char* string, ini_handler handler, void* user) { + return ini_parse_string_length(string, strlen(string), handler, user); +} + +/* See documentation in header file. 
*/ +int ini_parse_string_length(const char* string, size_t length, + ini_handler handler, void* user) { + ini_parse_string_ctx ctx; + + ctx.ptr = string; + ctx.num_left = length; + return ini_parse_stream((ini_reader)ini_reader_string, &ctx, handler, + user); +} diff --git a/storage/tidesdb/libtidesdb/external/ini.h b/storage/tidesdb/libtidesdb/external/ini.h new file mode 100644 index 0000000000000..07aa7f48f0cdd --- /dev/null +++ b/storage/tidesdb/libtidesdb/external/ini.h @@ -0,0 +1,189 @@ +/* inih -- simple .INI file parser + +SPDX-License-Identifier: BSD-3-Clause + +Copyright (C) 2009-2025, Ben Hoyt + +inih is released under the New BSD license (see LICENSE.txt). Go to the project +home page for more info: + +https://github.com/benhoyt/inih + +*/ + +#ifndef INI_H +#define INI_H + +/* Make this header file easier to include in C++ code */ +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* Nonzero if ini_handler callback should accept lineno parameter. */ +#ifndef INI_HANDLER_LINENO +#define INI_HANDLER_LINENO 0 +#endif + +/* Visibility symbols, required for Windows DLLs */ +#ifndef INI_API +#if defined _WIN32 || defined __CYGWIN__ +# ifdef INI_SHARED_LIB +# ifdef INI_SHARED_LIB_BUILDING +# define INI_API __declspec(dllexport) +# else +# define INI_API __declspec(dllimport) +# endif +# else +# define INI_API +# endif +#else +# if defined(__GNUC__) && __GNUC__ >= 4 +# define INI_API __attribute__ ((visibility ("default"))) +# else +# define INI_API +# endif +#endif +#endif + +/* Typedef for prototype of handler function. + + Note that even though the value parameter has type "const char*", the user + may cast to "char*" and modify its content, as the value is not used again + after the call to ini_handler. This is not true of section and name -- + those must not be modified. 
+*/ +#if INI_HANDLER_LINENO +typedef int (*ini_handler)(void* user, const char* section, + const char* name, const char* value, + int lineno); +#else +typedef int (*ini_handler)(void* user, const char* section, + const char* name, const char* value); +#endif + +/* Typedef for prototype of fgets-style reader function. */ +typedef char* (*ini_reader)(char* str, int num, void* stream); + +/* Parse given INI-style file. May have [section]s, name=value pairs + (whitespace stripped), and comments starting with ';' (semicolon). Section + is "" if name=value pair parsed before any section heading. name:value + pairs are also supported as a concession to Python's configparser. + + For each name=value pair parsed, call handler function with given user + pointer as well as section, name, and value (data only valid for duration + of handler call). Handler should return nonzero on success, zero on error. + + Returns 0 on success, line number of first error on parse error (doesn't + stop on first error), -1 on file open error, or -2 on memory allocation + error (only when INI_USE_STACK is zero). +*/ +INI_API int ini_parse(const char* filename, ini_handler handler, void* user); + +/* Same as ini_parse(), but takes a FILE* instead of filename. This doesn't + close the file when it's finished -- the caller must do that. */ +INI_API int ini_parse_file(FILE* file, ini_handler handler, void* user); + +/* Same as ini_parse(), but takes an ini_reader function pointer instead of + filename. Used for implementing custom or string-based I/O (see also + ini_parse_string). */ +INI_API int ini_parse_stream(ini_reader reader, void* stream, ini_handler handler, + void* user); + +/* Same as ini_parse(), but takes a zero-terminated string with the INI data + instead of a file. Useful for parsing INI data from a network socket or + which is already in memory. 
*/ +INI_API int ini_parse_string(const char* string, ini_handler handler, void* user); + +/* Same as ini_parse_string(), but takes a string and its length, avoiding + strlen(). Useful for parsing INI data from a network socket or which is + already in memory, or interfacing with C++ std::string_view. */ +INI_API int ini_parse_string_length(const char* string, size_t length, ini_handler handler, void* user); + +/* Nonzero to allow multi-line value parsing, in the style of Python's + configparser. If allowed, ini_parse() will call the handler with the same + name for each subsequent line parsed. */ +#ifndef INI_ALLOW_MULTILINE +#define INI_ALLOW_MULTILINE 1 +#endif + +/* Nonzero to allow a UTF-8 BOM sequence (0xEF 0xBB 0xBF) at the start of + the file. See https://github.com/benhoyt/inih/issues/21 */ +#ifndef INI_ALLOW_BOM +#define INI_ALLOW_BOM 1 +#endif + +/* Chars that begin a start-of-line comment. Per Python configparser, allow + both ; and # comments at the start of a line by default. */ +#ifndef INI_START_COMMENT_PREFIXES +#define INI_START_COMMENT_PREFIXES ";#" +#endif + +/* Nonzero to allow inline comments (with valid inline comment characters + specified by INI_INLINE_COMMENT_PREFIXES). Set to 0 to turn off and match + Python 3.2+ configparser behaviour. */ +#ifndef INI_ALLOW_INLINE_COMMENTS +#define INI_ALLOW_INLINE_COMMENTS 1 +#endif +#ifndef INI_INLINE_COMMENT_PREFIXES +#define INI_INLINE_COMMENT_PREFIXES ";" +#endif + +/* Nonzero to use stack for line buffer, zero to use heap (malloc/free). */ +#ifndef INI_USE_STACK +#define INI_USE_STACK 1 +#endif + +/* Maximum line length for any line in INI file (stack or heap). Note that + this must be 3 more than the longest line (due to '\r', '\n', and '\0'). */ +#ifndef INI_MAX_LINE +#define INI_MAX_LINE 200 +#endif + +/* Nonzero to allow heap line buffer to grow via realloc(), zero for a + fixed-size buffer of INI_MAX_LINE bytes. Only applies if INI_USE_STACK is + zero. 
*/ +#ifndef INI_ALLOW_REALLOC +#define INI_ALLOW_REALLOC 0 +#endif + +/* Initial size in bytes for heap line buffer. Only applies if INI_USE_STACK + is zero. */ +#ifndef INI_INITIAL_ALLOC +#define INI_INITIAL_ALLOC 200 +#endif + +/* Stop parsing on first error (default is to keep parsing). */ +#ifndef INI_STOP_ON_FIRST_ERROR +#define INI_STOP_ON_FIRST_ERROR 0 +#endif + +/* Nonzero to call the handler at the start of each new section (with + name and value NULL). Default is to only call the handler on + each name=value pair. */ +#ifndef INI_CALL_HANDLER_ON_NEW_SECTION +#define INI_CALL_HANDLER_ON_NEW_SECTION 0 +#endif + +/* Nonzero to allow a name without a value (no '=' or ':' on the line) and + call the handler with value NULL in this case. Default is to treat + no-value lines as an error. */ +#ifndef INI_ALLOW_NO_VALUE +#define INI_ALLOW_NO_VALUE 0 +#endif + +/* Nonzero to use custom ini_malloc, ini_free, and ini_realloc memory + allocation functions (INI_USE_STACK must also be 0). These functions must + have the same signatures as malloc/free/realloc and behave in a similar + way. ini_realloc is only needed if INI_ALLOW_REALLOC is set. */ +#ifndef INI_CUSTOM_ALLOCATOR +#define INI_CUSTOM_ALLOCATOR 0 +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* INI_H */ diff --git a/storage/tidesdb/libtidesdb/external/uthash.h b/storage/tidesdb/libtidesdb/external/uthash.h new file mode 100644 index 0000000000000..32a6513206ac5 --- /dev/null +++ b/storage/tidesdb/libtidesdb/external/uthash.h @@ -0,0 +1,1335 @@ +/* +Copyright (c) 2003-2025, Troy D. Hanson https://troydhanson.github.io/uthash/ +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef UTHASH_H +#define UTHASH_H + +#define UTHASH_VERSION 2.3.0 + +#include <stddef.h> /* ptrdiff_t */ +#include <stdlib.h> /* exit */ +#include <string.h> /* memcmp, memset, strlen */ + +#if defined(HASH_NO_STDINT) && HASH_NO_STDINT +/* The user doesn't have <stdint.h>, and must figure out their own way + to provide definitions for uint8_t and uint32_t. */ +#else +#include <stdint.h> /* uint8_t, uint32_t */ +#endif + +/* These macros use decltype or the earlier __typeof GNU extension. + As decltype is only available in newer compilers (VS2010 or gcc 4.3+ + when compiling c++ source) this code uses whatever method is needed + or, for VS2008 where neither is available, uses casting workarounds. 
*/ +#if !defined(DECLTYPE) && !defined(NO_DECLTYPE) +#if defined(_MSC_VER) /* MS compiler */ +#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */ +#define DECLTYPE(x) (decltype(x)) +#else /* VS2008 or older (or VS2010 in C mode) */ +#define NO_DECLTYPE +#endif +#elif defined(__MCST__) /* Elbrus C Compiler */ +#define DECLTYPE(x) (__typeof(x)) +#elif defined(__BORLANDC__) || defined(__ICCARM__) || defined(__LCC__) || defined(__WATCOMC__) +#define NO_DECLTYPE +#else /* GNU, Sun and other compilers */ +#define DECLTYPE(x) (__typeof(x)) +#endif +#endif + +#ifdef NO_DECLTYPE +#define DECLTYPE(x) +#define DECLTYPE_ASSIGN(dst, src) \ + do \ + { \ + char** _da_dst = (char**)(&(dst)); \ + *_da_dst = (char*)(src); \ + } while (0) +#else +#define DECLTYPE_ASSIGN(dst, src) \ + do \ + { \ + (dst) = DECLTYPE(dst)(src); \ + } while (0) +#endif + +#ifndef uthash_malloc +#define uthash_malloc(sz) malloc(sz) /* malloc fcn */ +#endif +#ifndef uthash_free +#define uthash_free(ptr, sz) free(ptr) /* free fcn */ +#endif +#ifndef uthash_bzero +#define uthash_bzero(a, n) memset(a, '\0', n) +#endif +#ifndef uthash_strlen +#define uthash_strlen(s) strlen(s) +#endif + +#ifndef HASH_FUNCTION +#define HASH_FUNCTION(keyptr, keylen, hashv) HASH_JEN(keyptr, keylen, hashv) +#endif + +#ifndef HASH_KEYCMP +#define HASH_KEYCMP(a, b, n) memcmp(a, b, n) +#endif + +#ifndef uthash_noexpand_fyi +#define uthash_noexpand_fyi(tbl) /* can be defined to log noexpand */ +#endif +#ifndef uthash_expand_fyi +#define uthash_expand_fyi(tbl) /* can be defined to log expands */ +#endif + +#ifndef HASH_NONFATAL_OOM +#define HASH_NONFATAL_OOM 0 +#endif + +#if HASH_NONFATAL_OOM +/* malloc failures can be recovered from */ + +#ifndef uthash_nonfatal_oom +#define uthash_nonfatal_oom(obj) \ + do \ + { \ + } while (0) /* non-fatal OOM error */ +#endif + +#define HASH_RECORD_OOM(oomed) \ + do \ + { \ + (oomed) = 1; \ + } while (0) +#define IF_HASH_NONFATAL_OOM(x) x + +#else +/* malloc failures result 
in lost memory, hash tables are unusable */ + +#ifndef uthash_fatal +#define uthash_fatal(msg) exit(-1) /* fatal OOM error */ +#endif + +#define HASH_RECORD_OOM(oomed) uthash_fatal("out of memory") +#define IF_HASH_NONFATAL_OOM(x) + +#endif + +/* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS 32U /* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS_LOG2 5U /* lg2 of initial number of buckets */ +#define HASH_BKT_CAPACITY_THRESH 10U /* expand when bucket count reaches */ + +/* calculate the element whose hash handle address is hhp */ +#define ELMT_FROM_HH(tbl, hhp) ((void*)(((char*)(hhp)) - ((tbl)->hho))) +/* calculate the hash handle from element address elp */ +#define HH_FROM_ELMT(tbl, elp) ((UT_hash_handle*)(void*)(((char*)(elp)) + ((tbl)->hho))) + +#define HASH_ROLLBACK_BKT(hh, head, itemptrhh) \ + do \ + { \ + struct UT_hash_handle* _hd_hh_item = (itemptrhh); \ + unsigned _hd_bkt; \ + HASH_TO_BKT(_hd_hh_item->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + (head)->hh.tbl->buckets[_hd_bkt].count++; \ + _hd_hh_item->hh_next = NULL; \ + _hd_hh_item->hh_prev = NULL; \ + } while (0) + +#define HASH_VALUE(keyptr, keylen, hashv) \ + do \ + { \ + HASH_FUNCTION(keyptr, keylen, hashv); \ + } while (0) + +#define HASH_FIND_BYHASHVALUE(hh, head, keyptr, keylen, hashval, out) \ + do \ + { \ + (out) = NULL; \ + if (head) \ + { \ + unsigned _hf_bkt; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _hf_bkt); \ + if (HASH_BLOOM_TEST((head)->hh.tbl, hashval)) \ + { \ + HASH_FIND_IN_BKT((head)->hh.tbl, hh, (head)->hh.tbl->buckets[_hf_bkt], keyptr, \ + keylen, hashval, out); \ + } \ + } \ + } while (0) + +#define HASH_FIND(hh, head, keyptr, keylen, out) \ + do \ + { \ + (out) = NULL; \ + if (head) \ + { \ + unsigned _hf_hashv; \ + HASH_VALUE(keyptr, keylen, _hf_hashv); \ + HASH_FIND_BYHASHVALUE(hh, head, keyptr, keylen, _hf_hashv, out); \ + } \ + } while (0) + +#ifdef HASH_BLOOM +#define HASH_BLOOM_BITLEN (1UL << HASH_BLOOM) +#define 
HASH_BLOOM_BYTELEN \ + (HASH_BLOOM_BITLEN / 8UL) + (((HASH_BLOOM_BITLEN % 8UL) != 0UL) ? 1UL : 0UL) +#define HASH_BLOOM_MAKE(tbl, oomed) \ + do \ + { \ + (tbl)->bloom_nbits = HASH_BLOOM; \ + (tbl)->bloom_bv = (uint8_t*)uthash_malloc(HASH_BLOOM_BYTELEN); \ + if (!(tbl)->bloom_bv) \ + { \ + HASH_RECORD_OOM(oomed); \ + } \ + else \ + { \ + uthash_bzero((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ + (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE; \ + } \ + } while (0) + +#define HASH_BLOOM_FREE(tbl) \ + do \ + { \ + uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ + } while (0) + +#define HASH_BLOOM_BITSET(bv, idx) (bv[(idx) / 8U] |= (1U << ((idx) % 8U))) +#define HASH_BLOOM_BITTEST(bv, idx) ((bv[(idx) / 8U] & (1U << ((idx) % 8U))) != 0) + +#define HASH_BLOOM_ADD(tbl, hashv) \ + HASH_BLOOM_BITSET((tbl)->bloom_bv, ((hashv) & (uint32_t)((1UL << (tbl)->bloom_nbits) - 1U))) + +#define HASH_BLOOM_TEST(tbl, hashv) \ + HASH_BLOOM_BITTEST((tbl)->bloom_bv, ((hashv) & (uint32_t)((1UL << (tbl)->bloom_nbits) - 1U))) + +#else +#define HASH_BLOOM_MAKE(tbl, oomed) +#define HASH_BLOOM_FREE(tbl) +#define HASH_BLOOM_ADD(tbl, hashv) +#define HASH_BLOOM_TEST(tbl, hashv) 1 +#define HASH_BLOOM_BYTELEN 0U +#endif + +#define HASH_MAKE_TABLE(hh, head, oomed) \ + do \ + { \ + (head)->hh.tbl = (UT_hash_table*)uthash_malloc(sizeof(UT_hash_table)); \ + if (!(head)->hh.tbl) \ + { \ + HASH_RECORD_OOM(oomed); \ + } \ + else \ + { \ + uthash_bzero((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head)->hh.tbl->tail = &((head)->hh); \ + (head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS; \ + (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2; \ + (head)->hh.tbl->hho = (char*)(&(head)->hh) - (char*)(head); \ + (head)->hh.tbl->buckets = (UT_hash_bucket*)uthash_malloc( \ + HASH_INITIAL_NUM_BUCKETS * sizeof(struct UT_hash_bucket)); \ + (head)->hh.tbl->signature = HASH_SIGNATURE; \ + if (!(head)->hh.tbl->buckets) \ + { \ + HASH_RECORD_OOM(oomed); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); 
\ + } \ + else \ + { \ + uthash_bzero((head)->hh.tbl->buckets, \ + HASH_INITIAL_NUM_BUCKETS * sizeof(struct UT_hash_bucket)); \ + HASH_BLOOM_MAKE((head)->hh.tbl, oomed); \ + IF_HASH_NONFATAL_OOM(if (oomed) { \ + uthash_free((head)->hh.tbl->buckets, \ + HASH_INITIAL_NUM_BUCKETS * sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + }) \ + } \ + } \ + } while (0) + +#define HASH_REPLACE_BYHASHVALUE_INORDER(hh, head, fieldname, keylen_in, hashval, add, replaced, \ + cmpfcn) \ + do \ + { \ + (replaced) = NULL; \ + HASH_FIND_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, replaced); \ + if (replaced) \ + { \ + HASH_DELETE(hh, head, replaced); \ + } \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, &((add)->fieldname), keylen_in, hashval, \ + add, cmpfcn); \ + } while (0) + +#define HASH_REPLACE_BYHASHVALUE(hh, head, fieldname, keylen_in, hashval, add, replaced) \ + do \ + { \ + (replaced) = NULL; \ + HASH_FIND_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, replaced); \ + if (replaced) \ + { \ + HASH_DELETE(hh, head, replaced); \ + } \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, add); \ + } while (0) + +#define HASH_REPLACE(hh, head, fieldname, keylen_in, add, replaced) \ + do \ + { \ + unsigned _hr_hashv; \ + HASH_VALUE(&((add)->fieldname), keylen_in, _hr_hashv); \ + HASH_REPLACE_BYHASHVALUE(hh, head, fieldname, keylen_in, _hr_hashv, add, replaced); \ + } while (0) + +#define HASH_REPLACE_INORDER(hh, head, fieldname, keylen_in, add, replaced, cmpfcn) \ + do \ + { \ + unsigned _hr_hashv; \ + HASH_VALUE(&((add)->fieldname), keylen_in, _hr_hashv); \ + HASH_REPLACE_BYHASHVALUE_INORDER(hh, head, fieldname, keylen_in, _hr_hashv, add, replaced, \ + cmpfcn); \ + } while (0) + +#define HASH_APPEND_LIST(hh, head, add) \ + do \ + { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail); \ + (head)->hh.tbl->tail->next = (add); \ + 
(head)->hh.tbl->tail = &((add)->hh); \ + } while (0) + +#define HASH_AKBI_INNER_LOOP(hh, head, add, cmpfcn) \ + do \ + { \ + do \ + { \ + if (cmpfcn(DECLTYPE(head)(_hs_iter), add) > 0) \ + { \ + break; \ + } \ + } while ((_hs_iter = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->next)); \ + } while (0) + +#ifdef NO_DECLTYPE +#undef HASH_AKBI_INNER_LOOP +#define HASH_AKBI_INNER_LOOP(hh, head, add, cmpfcn) \ + do \ + { \ + char* _hs_saved_head = (char*)(head); \ + do \ + { \ + DECLTYPE_ASSIGN(head, _hs_iter); \ + if (cmpfcn(head, add) > 0) \ + { \ + DECLTYPE_ASSIGN(head, _hs_saved_head); \ + break; \ + } \ + DECLTYPE_ASSIGN(head, _hs_saved_head); \ + } while ((_hs_iter = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->next)); \ + } while (0) +#endif + +#if HASH_NONFATAL_OOM + +#define HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, oomed) \ + do \ + { \ + if (!(oomed)) \ + { \ + unsigned _ha_bkt; \ + (head)->hh.tbl->num_items++; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt], hh, &(add)->hh, oomed); \ + if (oomed) \ + { \ + HASH_ROLLBACK_BKT(hh, head, &(add)->hh); \ + HASH_DELETE_HH(hh, head, &(add)->hh); \ + (add)->hh.tbl = NULL; \ + uthash_nonfatal_oom(add); \ + } \ + else \ + { \ + HASH_BLOOM_ADD((head)->hh.tbl, hashval); \ + HASH_EMIT_KEY(hh, head, keyptr, keylen_in); \ + } \ + } \ + else \ + { \ + (add)->hh.tbl = NULL; \ + uthash_nonfatal_oom(add); \ + } \ + } while (0) + +#else + +#define HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, oomed) \ + do \ + { \ + unsigned _ha_bkt; \ + (head)->hh.tbl->num_items++; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt], hh, &(add)->hh, oomed); \ + HASH_BLOOM_ADD((head)->hh.tbl, hashval); \ + HASH_EMIT_KEY(hh, head, keyptr, keylen_in); \ + } while (0) + +#endif + +#define HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, keyptr, keylen_in, hashval, add, cmpfcn) \ + do \ + { \ + 
IF_HASH_NONFATAL_OOM(int _ha_oomed = 0;) \ + (add)->hh.hashv = (hashval); \ + (add)->hh.key = (char*)(keyptr); \ + (add)->hh.keylen = (unsigned)(keylen_in); \ + if (!(head)) \ + { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = NULL; \ + HASH_MAKE_TABLE(hh, add, _ha_oomed); \ + IF_HASH_NONFATAL_OOM(if (!_ha_oomed) { ) \ + (head) = (add); \ + IF_HASH_NONFATAL_OOM( \ + }) \ + } \ + else \ + { \ + void* _hs_iter = (head); \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_AKBI_INNER_LOOP(hh, head, add, cmpfcn); \ + if (_hs_iter) \ + { \ + (add)->hh.next = _hs_iter; \ + if (((add)->hh.prev = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->prev)) \ + { \ + HH_FROM_ELMT((head)->hh.tbl, (add)->hh.prev)->next = (add); \ + } \ + else \ + { \ + (head) = (add); \ + } \ + HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->prev = (add); \ + } \ + else \ + { \ + HASH_APPEND_LIST(hh, head, add); \ + } \ + } \ + HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, _ha_oomed); \ + HASH_FSCK(hh, head, "HASH_ADD_KEYPTR_BYHASHVALUE_INORDER"); \ + } while (0) + +#define HASH_ADD_KEYPTR_INORDER(hh, head, keyptr, keylen_in, add, cmpfcn) \ + do \ + { \ + unsigned _hs_hashv; \ + HASH_VALUE(keyptr, keylen_in, _hs_hashv); \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, keyptr, keylen_in, _hs_hashv, add, cmpfcn); \ + } while (0) + +#define HASH_ADD_BYHASHVALUE_INORDER(hh, head, fieldname, keylen_in, hashval, add, cmpfcn) \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, &((add)->fieldname), keylen_in, hashval, add, \ + cmpfcn) + +#define HASH_ADD_INORDER(hh, head, fieldname, keylen_in, add, cmpfcn) \ + HASH_ADD_KEYPTR_INORDER(hh, head, &((add)->fieldname), keylen_in, add, cmpfcn) + +#define HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, keyptr, keylen_in, hashval, add) \ + do \ + { \ + IF_HASH_NONFATAL_OOM(int _ha_oomed = 0;) \ + (add)->hh.hashv = (hashval); \ + (add)->hh.key = (const void*)(keyptr); \ + (add)->hh.keylen = (unsigned)(keylen_in); \ + if (!(head)) \ + { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = 
NULL; \ + HASH_MAKE_TABLE(hh, add, _ha_oomed); \ + IF_HASH_NONFATAL_OOM(if (!_ha_oomed) { ) \ + (head) = (add); \ + IF_HASH_NONFATAL_OOM( \ + }) \ + } \ + else \ + { \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_APPEND_LIST(hh, head, add); \ + } \ + HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, _ha_oomed); \ + HASH_FSCK(hh, head, "HASH_ADD_KEYPTR_BYHASHVALUE"); \ + } while (0) + +#define HASH_ADD_KEYPTR(hh, head, keyptr, keylen_in, add) \ + do \ + { \ + unsigned _ha_hashv; \ + HASH_VALUE(keyptr, keylen_in, _ha_hashv); \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, keyptr, keylen_in, _ha_hashv, add); \ + } while (0) + +#define HASH_ADD_BYHASHVALUE(hh, head, fieldname, keylen_in, hashval, add) \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, add) + +#define HASH_ADD(hh, head, fieldname, keylen_in, add) \ + HASH_ADD_KEYPTR(hh, head, &((add)->fieldname), keylen_in, add) + +#define HASH_TO_BKT(hashv, num_bkts, bkt) \ + do \ + { \ + bkt = ((hashv) & ((num_bkts) - 1U)); \ + } while (0) + +/* delete "delptr" from the hash table. + * "the usual" patch-up process for the app-order doubly-linked-list. + * The use of _hd_hh_del below deserves special explanation. + * These used to be expressed using (delptr) but that led to a bug + * if someone used the same symbol for the head and deletee, like + * HASH_DELETE(hh,users,users); + * We want that to work, but by changing the head (users) below + * we were forfeiting our ability to further refer to the deletee (users) + * in the patch-up process. Solution: use scratch space to + * copy the deletee pointer, then the latter references are via that + * scratch pointer rather than through the repointed (users) symbol. 
+ */ +#define HASH_DELETE(hh, head, delptr) HASH_DELETE_HH(hh, head, &(delptr)->hh) + +#define HASH_DELETE_HH(hh, head, delptrhh) \ + do \ + { \ + const struct UT_hash_handle* _hd_hh_del = (delptrhh); \ + if ((_hd_hh_del->prev == NULL) && (_hd_hh_del->next == NULL)) \ + { \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl->buckets, \ + (head)->hh.tbl->num_buckets * sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head) = NULL; \ + } \ + else \ + { \ + unsigned _hd_bkt; \ + if (_hd_hh_del == (head)->hh.tbl->tail) \ + { \ + (head)->hh.tbl->tail = HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->prev); \ + } \ + if (_hd_hh_del->prev != NULL) \ + { \ + HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->prev)->next = _hd_hh_del->next; \ + } \ + else \ + { \ + DECLTYPE_ASSIGN(head, _hd_hh_del->next); \ + } \ + if (_hd_hh_del->next != NULL) \ + { \ + HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->next)->prev = _hd_hh_del->prev; \ + } \ + HASH_TO_BKT(_hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + HASH_DEL_IN_BKT((head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del); \ + (head)->hh.tbl->num_items--; \ + } \ + HASH_FSCK(hh, head, "HASH_DELETE_HH"); \ + } while (0) + +/* convenience forms of HASH_FIND/HASH_ADD/HASH_DEL */ +#define HASH_FIND_STR(head, findstr, out) \ + do \ + { \ + unsigned _uthash_hfstr_keylen = (unsigned)uthash_strlen(findstr); \ + HASH_FIND(hh, head, findstr, _uthash_hfstr_keylen, out); \ + } while (0) +#define HASH_ADD_STR(head, strfield, add) \ + do \ + { \ + unsigned _uthash_hastr_keylen = (unsigned)uthash_strlen((add)->strfield); \ + HASH_ADD(hh, head, strfield[0], _uthash_hastr_keylen, add); \ + } while (0) +#define HASH_REPLACE_STR(head, strfield, add, replaced) \ + do \ + { \ + unsigned _uthash_hrstr_keylen = (unsigned)uthash_strlen((add)->strfield); \ + HASH_REPLACE(hh, head, strfield[0], _uthash_hrstr_keylen, add, replaced); \ + } while (0) +#define HASH_FIND_INT(head, findint, out) HASH_FIND(hh, head, 
findint, sizeof(int), out) +#define HASH_ADD_INT(head, intfield, add) HASH_ADD(hh, head, intfield, sizeof(int), add) +#define HASH_REPLACE_INT(head, intfield, add, replaced) \ + HASH_REPLACE(hh, head, intfield, sizeof(int), add, replaced) +#define HASH_FIND_PTR(head, findptr, out) HASH_FIND(hh, head, findptr, sizeof(void*), out) +#define HASH_ADD_PTR(head, ptrfield, add) HASH_ADD(hh, head, ptrfield, sizeof(void*), add) +#define HASH_REPLACE_PTR(head, ptrfield, add, replaced) \ + HASH_REPLACE(hh, head, ptrfield, sizeof(void*), add, replaced) +#define HASH_DEL(head, delptr) HASH_DELETE(hh, head, delptr) + +/* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is defined. + * This is for uthash developer only; it compiles away if HASH_DEBUG isn't defined. + */ +#ifdef HASH_DEBUG +#include <stdio.h> /* fprintf, stderr */ +#define HASH_OOPS(...) \ + do \ + { \ + fprintf(stderr, __VA_ARGS__); \ + exit(-1); \ + } while (0) +#define HASH_FSCK(hh, head, where) \ + do \ + { \ + struct UT_hash_handle* _thh; \ + if (head) \ + { \ + unsigned _bkt_i; \ + unsigned _count = 0; \ + char* _prev; \ + for (_bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; ++_bkt_i) \ + { \ + unsigned _bkt_count = 0; \ + _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head; \ + _prev = NULL; \ + while (_thh) \ + { \ + if (_prev != (char*)(_thh->hh_prev)) \ + { \ + HASH_OOPS("%s: invalid hh_prev %p, actual %p\n", (where), \ + (void*)_thh->hh_prev, (void*)_prev); \ + } \ + _bkt_count++; \ + _prev = (char*)(_thh); \ + _thh = _thh->hh_next; \ + } \ + _count += _bkt_count; \ + if ((head)->hh.tbl->buckets[_bkt_i].count != _bkt_count) \ + { \ + HASH_OOPS("%s: invalid bucket count %u, actual %u\n", (where), \ + (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count); \ + } \ + } \ + if (_count != (head)->hh.tbl->num_items) \ + { \ + HASH_OOPS("%s: invalid hh item count %u, actual %u\n", (where), \ + (head)->hh.tbl->num_items, _count); \ + } \ + _count = 0; \ + _prev = NULL; \ + _thh = &(head)->hh; \ + while (_thh) 
\ + { \ + _count++; \ + if (_prev != (char*)_thh->prev) \ + { \ + HASH_OOPS("%s: invalid prev %p, actual %p\n", (where), (void*)_thh->prev, \ + (void*)_prev); \ + } \ + _prev = (char*)ELMT_FROM_HH((head)->hh.tbl, _thh); \ + _thh = (_thh->next ? HH_FROM_ELMT((head)->hh.tbl, _thh->next) : NULL); \ + } \ + if (_count != (head)->hh.tbl->num_items) \ + { \ + HASH_OOPS("%s: invalid app item count %u, actual %u\n", (where), \ + (head)->hh.tbl->num_items, _count); \ + } \ + } \ + } while (0) +#else +#define HASH_FSCK(hh, head, where) +#endif + +/* When compiled with -DHASH_EMIT_KEYS, length-prefixed keys are emitted to + * the descriptor to which this macro is defined for tuning the hash function. + * The app can #include <unistd.h> to get the prototype for write(2). */ +#ifdef HASH_EMIT_KEYS +#define HASH_EMIT_KEY(hh, head, keyptr, fieldlen) \ + do \ + { \ + unsigned _klen = fieldlen; \ + write(HASH_EMIT_KEYS, &_klen, sizeof(_klen)); \ + write(HASH_EMIT_KEYS, keyptr, (unsigned long)fieldlen); \ + } while (0) +#else +#define HASH_EMIT_KEY(hh, head, keyptr, fieldlen) +#endif + +/* The Bernstein hash function, used in Perl prior to v5.6. Note (x<<5+x)=x*33. 
*/ +#define HASH_BER(key, keylen, hashv) \ + do \ + { \ + unsigned _hb_keylen = (unsigned)keylen; \ + const unsigned char* _hb_key = (const unsigned char*)(key); \ + (hashv) = 0; \ + while (_hb_keylen-- != 0U) \ + { \ + (hashv) = (((hashv) << 5) + (hashv)) + *_hb_key++; \ + } \ + } while (0) + +/* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at + * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx + * (archive link: https://archive.is/Ivcan ) + */ +#define HASH_SAX(key, keylen, hashv) \ + do \ + { \ + unsigned _sx_i; \ + const unsigned char* _hs_key = (const unsigned char*)(key); \ + hashv = 0; \ + for (_sx_i = 0; _sx_i < keylen; _sx_i++) \ + { \ + hashv ^= (hashv << 5) + (hashv >> 2) + _hs_key[_sx_i]; \ + } \ + } while (0) +/* FNV-1a variation */ +#define HASH_FNV(key, keylen, hashv) \ + do \ + { \ + unsigned _fn_i; \ + const unsigned char* _hf_key = (const unsigned char*)(key); \ + (hashv) = 2166136261U; \ + for (_fn_i = 0; _fn_i < keylen; _fn_i++) \ + { \ + hashv = hashv ^ _hf_key[_fn_i]; \ + hashv = hashv * 16777619U; \ + } \ + } while (0) + +#define HASH_OAT(key, keylen, hashv) \ + do \ + { \ + unsigned _ho_i; \ + const unsigned char* _ho_key = (const unsigned char*)(key); \ + hashv = 0; \ + for (_ho_i = 0; _ho_i < keylen; _ho_i++) \ + { \ + hashv += _ho_key[_ho_i]; \ + hashv += (hashv << 10); \ + hashv ^= (hashv >> 6); \ + } \ + hashv += (hashv << 3); \ + hashv ^= (hashv >> 11); \ + hashv += (hashv << 15); \ + } while (0) + +#define HASH_JEN_MIX(a, b, c) \ + do \ + { \ + a -= b; \ + a -= c; \ + a ^= (c >> 13); \ + b -= c; \ + b -= a; \ + b ^= (a << 8); \ + c -= a; \ + c -= b; \ + c ^= (b >> 13); \ + a -= b; \ + a -= c; \ + a ^= (c >> 12); \ + b -= c; \ + b -= a; \ + b ^= (a << 16); \ + c -= a; \ + c -= b; \ + c ^= (b >> 5); \ + a -= b; \ + a -= c; \ + a ^= (c >> 3); \ + b -= c; \ + b -= a; \ + b ^= (a << 10); \ + c -= a; \ + c -= b; \ + c ^= (b >> 15); \ + } while (0) + +#define HASH_JEN(key, keylen, hashv) \ + do \ + { 
\ + unsigned _hj_i, _hj_j, _hj_k; \ + unsigned const char* _hj_key = (unsigned const char*)(key); \ + hashv = 0xfeedbeefu; \ + _hj_i = _hj_j = 0x9e3779b9u; \ + _hj_k = (unsigned)(keylen); \ + while (_hj_k >= 12U) \ + { \ + _hj_i += (_hj_key[0] + ((unsigned)_hj_key[1] << 8) + ((unsigned)_hj_key[2] << 16) + \ + ((unsigned)_hj_key[3] << 24)); \ + _hj_j += (_hj_key[4] + ((unsigned)_hj_key[5] << 8) + ((unsigned)_hj_key[6] << 16) + \ + ((unsigned)_hj_key[7] << 24)); \ + hashv += (_hj_key[8] + ((unsigned)_hj_key[9] << 8) + ((unsigned)_hj_key[10] << 16) + \ + ((unsigned)_hj_key[11] << 24)); \ + \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ + \ + _hj_key += 12; \ + _hj_k -= 12U; \ + } \ + hashv += (unsigned)(keylen); \ + switch (_hj_k) \ + { \ + case 11: \ + hashv += ((unsigned)_hj_key[10] << 24); /* FALLTHROUGH */ \ + case 10: \ + hashv += ((unsigned)_hj_key[9] << 16); /* FALLTHROUGH */ \ + case 9: \ + hashv += ((unsigned)_hj_key[8] << 8); /* FALLTHROUGH */ \ + case 8: \ + _hj_j += ((unsigned)_hj_key[7] << 24); /* FALLTHROUGH */ \ + case 7: \ + _hj_j += ((unsigned)_hj_key[6] << 16); /* FALLTHROUGH */ \ + case 6: \ + _hj_j += ((unsigned)_hj_key[5] << 8); /* FALLTHROUGH */ \ + case 5: \ + _hj_j += _hj_key[4]; /* FALLTHROUGH */ \ + case 4: \ + _hj_i += ((unsigned)_hj_key[3] << 24); /* FALLTHROUGH */ \ + case 3: \ + _hj_i += ((unsigned)_hj_key[2] << 16); /* FALLTHROUGH */ \ + case 2: \ + _hj_i += ((unsigned)_hj_key[1] << 8); /* FALLTHROUGH */ \ + case 1: \ + _hj_i += _hj_key[0]; /* FALLTHROUGH */ \ + default:; \ + } \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ + } while (0) + +/* The Paul Hsieh hash function */ +#undef get16bits +#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) || defined(_MSC_VER) || \ + defined(__BORLANDC__) || defined(__TURBOC__) +#define get16bits(d) (*((const uint16_t*)(d))) +#endif + +#if !defined(get16bits) +#define get16bits(d) \ + ((((uint32_t)(((const uint8_t*)(d))[1])) << 8) + (uint32_t)(((const uint8_t*)(d))[0])) +#endif +#define 
HASH_SFH(key, keylen, hashv) \ + do \ + { \ + unsigned const char* _sfh_key = (unsigned const char*)(key); \ + uint32_t _sfh_tmp, _sfh_len = (uint32_t)keylen; \ + \ + unsigned _sfh_rem = _sfh_len & 3U; \ + _sfh_len >>= 2; \ + hashv = 0xcafebabeu; \ + \ + /* Main loop */ \ + for (; _sfh_len > 0U; _sfh_len--) \ + { \ + hashv += get16bits(_sfh_key); \ + _sfh_tmp = ((uint32_t)(get16bits(_sfh_key + 2)) << 11) ^ hashv; \ + hashv = (hashv << 16) ^ _sfh_tmp; \ + _sfh_key += 2U * sizeof(uint16_t); \ + hashv += hashv >> 11; \ + } \ + \ + /* Handle end cases */ \ + switch (_sfh_rem) \ + { \ + case 3: \ + hashv += get16bits(_sfh_key); \ + hashv ^= hashv << 16; \ + hashv ^= (uint32_t)(_sfh_key[sizeof(uint16_t)]) << 18; \ + hashv += hashv >> 11; \ + break; \ + case 2: \ + hashv += get16bits(_sfh_key); \ + hashv ^= hashv << 11; \ + hashv += hashv >> 17; \ + break; \ + case 1: \ + hashv += *_sfh_key; \ + hashv ^= hashv << 10; \ + hashv += hashv >> 1; \ + break; \ + default:; \ + } \ + \ + /* Force "avalanching" of final 127 bits */ \ + hashv ^= hashv << 3; \ + hashv += hashv >> 5; \ + hashv ^= hashv << 4; \ + hashv += hashv >> 17; \ + hashv ^= hashv << 25; \ + hashv += hashv >> 6; \ + } while (0) + +/* iterate over items in a known bucket to find desired item */ +#define HASH_FIND_IN_BKT(tbl, hh, head, keyptr, keylen_in, hashval, out) \ + do \ + { \ + if ((head).hh_head != NULL) \ + { \ + DECLTYPE_ASSIGN(out, ELMT_FROM_HH(tbl, (head).hh_head)); \ + } \ + else \ + { \ + (out) = NULL; \ + } \ + while ((out) != NULL) \ + { \ + if ((out)->hh.hashv == (hashval) && (out)->hh.keylen == (keylen_in)) \ + { \ + if (HASH_KEYCMP((out)->hh.key, keyptr, keylen_in) == 0) \ + { \ + break; \ + } \ + } \ + if ((out)->hh.hh_next != NULL) \ + { \ + DECLTYPE_ASSIGN(out, ELMT_FROM_HH(tbl, (out)->hh.hh_next)); \ + } \ + else \ + { \ + (out) = NULL; \ + } \ + } \ + } while (0) + +/* add an item to a bucket */ +#define HASH_ADD_TO_BKT(head, hh, addhh, oomed) \ + do \ + { \ + UT_hash_bucket* _ha_head = 
&(head); \ + _ha_head->count++; \ + (addhh)->hh_next = _ha_head->hh_head; \ + (addhh)->hh_prev = NULL; \ + if (_ha_head->hh_head != NULL) \ + { \ + _ha_head->hh_head->hh_prev = (addhh); \ + } \ + _ha_head->hh_head = (addhh); \ + if ((_ha_head->count >= ((_ha_head->expand_mult + 1U) * HASH_BKT_CAPACITY_THRESH)) && \ + !(addhh)->tbl->noexpand) \ + { \ + HASH_EXPAND_BUCKETS(addhh, (addhh)->tbl, oomed); \ + IF_HASH_NONFATAL_OOM(if (oomed) { HASH_DEL_IN_BKT(head, addhh); }) \ + } \ + } while (0) + +/* remove an item from a given bucket */ +#define HASH_DEL_IN_BKT(head, delhh) \ + do \ + { \ + UT_hash_bucket* _hd_head = &(head); \ + _hd_head->count--; \ + if (_hd_head->hh_head == (delhh)) \ + { \ + _hd_head->hh_head = (delhh)->hh_next; \ + } \ + if ((delhh)->hh_prev) \ + { \ + (delhh)->hh_prev->hh_next = (delhh)->hh_next; \ + } \ + if ((delhh)->hh_next) \ + { \ + (delhh)->hh_next->hh_prev = (delhh)->hh_prev; \ + } \ + } while (0) + +/* Bucket expansion has the effect of doubling the number of buckets + * and redistributing the items into the new buckets. Ideally the + * items will distribute more or less evenly into the new buckets + * (the extent to which this is true is a measure of the quality of + * the hash function as it applies to the key domain). + * + * With the items distributed into more buckets, the chain length + * (item count) in each bucket is reduced. Thus by expanding buckets + * the hash keeps a bound on the chain length. This bounded chain + * length is the essence of how a hash provides constant time lookup. + * + * The calculation of tbl->ideal_chain_maxlen below deserves some + * explanation. First, keep in mind that we're calculating the ideal + * maximum chain length based on the *new* (doubled) bucket count. + * In fractions this is just n/b (n=number of items,b=new num buckets). + * Since the ideal chain length is an integer, we want to calculate + * ceil(n/b). 
We don't depend on floating point arithmetic in this + * hash, so to calculate ceil(n/b) with integers we could write + * + * ceil(n/b) = (n/b) + ((n%b)?1:0) + * + * and in fact a previous version of this hash did just that. + * But now we have improved things a bit by recognizing that b is + * always a power of two. We keep its base 2 log handy (call it lb), + * so now we can write this with a bit shift and logical AND: + * + * ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0) + * + */ +#define HASH_EXPAND_BUCKETS(hh, tbl, oomed) \ + do \ + { \ + unsigned _he_bkt; \ + unsigned _he_bkt_i; \ + struct UT_hash_handle *_he_thh, *_he_hh_nxt; \ + UT_hash_bucket *_he_new_buckets, *_he_newbkt; \ + _he_new_buckets = (UT_hash_bucket*)uthash_malloc(sizeof(struct UT_hash_bucket) * \ + (tbl)->num_buckets * 2U); \ + if (!_he_new_buckets) \ + { \ + HASH_RECORD_OOM(oomed); \ + } \ + else \ + { \ + uthash_bzero(_he_new_buckets, \ + sizeof(struct UT_hash_bucket) * (tbl)->num_buckets * 2U); \ + (tbl)->ideal_chain_maxlen = \ + ((tbl)->num_items >> ((tbl)->log2_num_buckets + 1U)) + \ + ((((tbl)->num_items & (((tbl)->num_buckets * 2U) - 1U)) != 0U) ? 
1U : 0U); \ + (tbl)->nonideal_items = 0; \ + for (_he_bkt_i = 0; _he_bkt_i < (tbl)->num_buckets; _he_bkt_i++) \ + { \ + _he_thh = (tbl)->buckets[_he_bkt_i].hh_head; \ + while (_he_thh != NULL) \ + { \ + _he_hh_nxt = _he_thh->hh_next; \ + HASH_TO_BKT(_he_thh->hashv, (tbl)->num_buckets * 2U, _he_bkt); \ + _he_newbkt = &(_he_new_buckets[_he_bkt]); \ + if (++(_he_newbkt->count) > (tbl)->ideal_chain_maxlen) \ + { \ + (tbl)->nonideal_items++; \ + if (_he_newbkt->count > \ + _he_newbkt->expand_mult * (tbl)->ideal_chain_maxlen) \ + { \ + _he_newbkt->expand_mult++; \ + } \ + } \ + _he_thh->hh_prev = NULL; \ + _he_thh->hh_next = _he_newbkt->hh_head; \ + if (_he_newbkt->hh_head != NULL) \ + { \ + _he_newbkt->hh_head->hh_prev = _he_thh; \ + } \ + _he_newbkt->hh_head = _he_thh; \ + _he_thh = _he_hh_nxt; \ + } \ + } \ + uthash_free((tbl)->buckets, (tbl)->num_buckets * sizeof(struct UT_hash_bucket)); \ + (tbl)->num_buckets *= 2U; \ + (tbl)->log2_num_buckets++; \ + (tbl)->buckets = _he_new_buckets; \ + (tbl)->ineff_expands = ((tbl)->nonideal_items > ((tbl)->num_items >> 1)) \ + ? ((tbl)->ineff_expands + 1U) \ + : 0U; \ + if ((tbl)->ineff_expands > 1U) \ + { \ + (tbl)->noexpand = 1; \ + uthash_noexpand_fyi(tbl); \ + } \ + uthash_expand_fyi(tbl); \ + } \ + } while (0) + +/* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */ +/* Note that HASH_SORT assumes the hash handle name to be hh. + * HASH_SRT was added to allow the hash handle name to be passed in. 
*/ +#define HASH_SORT(head, cmpfcn) HASH_SRT(hh, head, cmpfcn) +#define HASH_SRT(hh, head, cmpfcn) \ + do \ + { \ + unsigned _hs_i; \ + unsigned _hs_looping, _hs_nmerges, _hs_insize, _hs_psize, _hs_qsize; \ + struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail; \ + if (head != NULL) \ + { \ + _hs_insize = 1; \ + _hs_looping = 1; \ + _hs_list = &((head)->hh); \ + while (_hs_looping != 0U) \ + { \ + _hs_p = _hs_list; \ + _hs_list = NULL; \ + _hs_tail = NULL; \ + _hs_nmerges = 0; \ + while (_hs_p != NULL) \ + { \ + _hs_nmerges++; \ + _hs_q = _hs_p; \ + _hs_psize = 0; \ + for (_hs_i = 0; _hs_i < _hs_insize; ++_hs_i) \ + { \ + _hs_psize++; \ + _hs_q = ((_hs_q->next != NULL) ? HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) \ + : NULL); \ + if (_hs_q == NULL) \ + { \ + break; \ + } \ + } \ + _hs_qsize = _hs_insize; \ + while ((_hs_psize != 0U) || ((_hs_qsize != 0U) && (_hs_q != NULL))) \ + { \ + if (_hs_psize == 0U) \ + { \ + _hs_e = _hs_q; \ + _hs_q = \ + ((_hs_q->next != NULL) ? HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) \ + : NULL); \ + _hs_qsize--; \ + } \ + else if ((_hs_qsize == 0U) || (_hs_q == NULL)) \ + { \ + _hs_e = _hs_p; \ + if (_hs_p != NULL) \ + { \ + _hs_p = ((_hs_p->next != NULL) \ + ? HH_FROM_ELMT((head)->hh.tbl, _hs_p->next) \ + : NULL); \ + } \ + _hs_psize--; \ + } \ + else if ((cmpfcn(DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl, _hs_p)), \ + DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl, _hs_q)))) <= \ + 0) \ + { \ + _hs_e = _hs_p; \ + if (_hs_p != NULL) \ + { \ + _hs_p = ((_hs_p->next != NULL) \ + ? HH_FROM_ELMT((head)->hh.tbl, _hs_p->next) \ + : NULL); \ + } \ + _hs_psize--; \ + } \ + else \ + { \ + _hs_e = _hs_q; \ + _hs_q = \ + ((_hs_q->next != NULL) ? HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) \ + : NULL); \ + _hs_qsize--; \ + } \ + if (_hs_tail != NULL) \ + { \ + _hs_tail->next = \ + ((_hs_e != NULL) ? 
ELMT_FROM_HH((head)->hh.tbl, _hs_e) : NULL); \ + } \ + else \ + { \ + _hs_list = _hs_e; \ + } \ + if (_hs_e != NULL) \ + { \ + _hs_e->prev = \ + ((_hs_tail != NULL) ? ELMT_FROM_HH((head)->hh.tbl, _hs_tail) \ + : NULL); \ + } \ + _hs_tail = _hs_e; \ + } \ + _hs_p = _hs_q; \ + } \ + if (_hs_tail != NULL) \ + { \ + _hs_tail->next = NULL; \ + } \ + if (_hs_nmerges <= 1U) \ + { \ + _hs_looping = 0; \ + (head)->hh.tbl->tail = _hs_tail; \ + DECLTYPE_ASSIGN(head, ELMT_FROM_HH((head)->hh.tbl, _hs_list)); \ + } \ + _hs_insize *= 2U; \ + } \ + HASH_FSCK(hh, head, "HASH_SRT"); \ + } \ + } while (0) + +/* This function selects items from one hash into another hash. + * The end result is that the selected items have dual presence + * in both hashes. There is no copy of the items made; rather + * they are added into the new hash through a secondary + * hash handle that must be present in the structure. */ +#define HASH_SELECT(hh_dst, dst, hh_src, src, cond) \ + do \ + { \ + unsigned _src_bkt, _dst_bkt; \ + void *_last_elt = NULL, *_elt; \ + UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh = NULL; \ + ptrdiff_t _dst_hho = ((char*)(&(dst)->hh_dst) - (char*)(dst)); \ + if ((src) != NULL) \ + { \ + for (_src_bkt = 0; _src_bkt < (src)->hh_src.tbl->num_buckets; _src_bkt++) \ + { \ + for (_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head; _src_hh != NULL; \ + _src_hh = _src_hh->hh_next) \ + { \ + _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh); \ + if (cond(_elt)) \ + { \ + IF_HASH_NONFATAL_OOM(int _hs_oomed = 0;) \ + _dst_hh = (UT_hash_handle*)(void*)(((char*)_elt) + _dst_hho); \ + _dst_hh->key = _src_hh->key; \ + _dst_hh->keylen = _src_hh->keylen; \ + _dst_hh->hashv = _src_hh->hashv; \ + _dst_hh->prev = _last_elt; \ + _dst_hh->next = NULL; \ + if (_last_elt_hh != NULL) \ + { \ + _last_elt_hh->next = _elt; \ + } \ + if ((dst) == NULL) \ + { \ + DECLTYPE_ASSIGN(dst, _elt); \ + HASH_MAKE_TABLE(hh_dst, dst, _hs_oomed); \ + IF_HASH_NONFATAL_OOM(if (_hs_oomed) { \ +
uthash_nonfatal_oom(_elt); \ + (dst) = NULL; \ + continue; \ + }) \ + } \ + else \ + { \ + _dst_hh->tbl = (dst)->hh_dst.tbl; \ + } \ + HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt); \ + HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt], hh_dst, _dst_hh, \ + _hs_oomed); \ + (dst)->hh_dst.tbl->num_items++; \ + IF_HASH_NONFATAL_OOM(if (_hs_oomed) { \ + HASH_ROLLBACK_BKT(hh_dst, dst, _dst_hh); \ + HASH_DELETE_HH(hh_dst, dst, _dst_hh); \ + _dst_hh->tbl = NULL; \ + uthash_nonfatal_oom(_elt); \ + continue; \ + }) \ + HASH_BLOOM_ADD(_dst_hh->tbl, _dst_hh->hashv); \ + _last_elt = _elt; \ + _last_elt_hh = _dst_hh; \ + } \ + } \ + } \ + } \ + HASH_FSCK(hh_dst, dst, "HASH_SELECT"); \ + } while (0) + +#define HASH_CLEAR(hh, head) \ + do \ + { \ + if ((head) != NULL) \ + { \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl->buckets, \ + (head)->hh.tbl->num_buckets * sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head) = NULL; \ + } \ + } while (0) + +#define HASH_OVERHEAD(hh, head) \ + (((head) != NULL) ? ((size_t)(((head)->hh.tbl->num_items * sizeof(UT_hash_handle)) + \ + ((head)->hh.tbl->num_buckets * sizeof(UT_hash_bucket)) + \ + sizeof(UT_hash_table) + (HASH_BLOOM_BYTELEN))) \ + : 0U) + +#ifdef NO_DECLTYPE +#define HASH_ITER(hh, head, el, tmp) \ + for (((el) = (head)), \ + ((*(char**)(&(tmp))) = (char*)((head != NULL) ? (head)->hh.next : NULL)); \ + (el) != NULL; \ + ((el) = (tmp)), ((*(char**)(&(tmp))) = (char*)((tmp != NULL) ? (tmp)->hh.next : NULL))) +#else +#define HASH_ITER(hh, head, el, tmp) \ + for (((el) = (head)), ((tmp) = DECLTYPE(el)((head != NULL) ? (head)->hh.next : NULL)); \ + (el) != NULL; \ + ((el) = (tmp)), ((tmp) = DECLTYPE(el)((tmp != NULL) ? (tmp)->hh.next : NULL))) +#endif + +/* obtain a count of items in the hash */ +#define HASH_COUNT(head) HASH_CNT(hh, head) +#define HASH_CNT(hh, head) ((head != NULL) ? 
((head)->hh.tbl->num_items) : 0U) + +typedef struct UT_hash_bucket +{ + struct UT_hash_handle* hh_head; + unsigned count; + + /* expand_mult is normally set to 0. In this situation, the max chain length + * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. (If + * the bucket's chain exceeds this length, bucket expansion is triggered). + * However, setting expand_mult to a non-zero value delays bucket expansion + * (that would be triggered by additions to this particular bucket) + * until its chain length reaches a *multiple* of HASH_BKT_CAPACITY_THRESH. + * (The multiplier is simply expand_mult+1). The whole idea of this + * multiplier is to reduce bucket expansions, since they are expensive, in + * situations where we know that a particular bucket tends to be overused. + * It is better to let its chain length grow to a longer yet-still-bounded + * value, than to do an O(n) bucket expansion too often. + */ + unsigned expand_mult; + +} UT_hash_bucket; + +/* random signature used only to find hash tables in external analysis */ +#define HASH_SIGNATURE 0xa0111fe1u +#define HASH_BLOOM_SIGNATURE 0xb12220f2u + +typedef struct UT_hash_table +{ + UT_hash_bucket* buckets; + unsigned num_buckets, log2_num_buckets; + unsigned num_items; + struct UT_hash_handle* tail; /* tail hh in app order, for fast append */ + ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element) */ + + /* in an ideal situation (all buckets used equally), no bucket would have + * more than ceil(#items/#buckets) items. that's the ideal chain length. */ + unsigned ideal_chain_maxlen; + + /* nonideal_items is the number of items in the hash whose chain position + * exceeds the ideal chain maxlen.
these items pay the penalty for an uneven + * hash distribution; reaching them in a chain traversal takes >ideal steps */ + unsigned nonideal_items; + + /* ineffective expands occur when a bucket doubling was performed, but + * afterward, more than half the items in the hash had nonideal chain + * positions. If this happens on two consecutive expansions we inhibit any + * further expansion, as it's not helping; this happens when the hash + * function isn't a good fit for the key domain. When expansion is inhibited + * the hash will still work, albeit no longer in constant time. */ + unsigned ineff_expands, noexpand; + + uint32_t signature; /* used only to find hash tables in external analysis */ +#ifdef HASH_BLOOM + uint32_t bloom_sig; /* used only to test bloom exists in external analysis */ + uint8_t* bloom_bv; + uint8_t bloom_nbits; +#endif + +} UT_hash_table; + +typedef struct UT_hash_handle +{ + struct UT_hash_table* tbl; + void* prev; /* prev element in app order */ + void* next; /* next element in app order */ + struct UT_hash_handle* hh_prev; /* previous hh in bucket order */ + struct UT_hash_handle* hh_next; /* next hh in bucket order */ + const void* key; /* ptr to enclosing struct's key */ + unsigned keylen; /* enclosing struct's key len */ + unsigned hashv; /* result of hash-fcn(key) */ +} UT_hash_handle; + +#endif /* UTHASH_H */ diff --git a/storage/tidesdb/libtidesdb/external/xxhash.c b/storage/tidesdb/libtidesdb/external/xxhash.c new file mode 100644 index 0000000000000..e60cc37f13c27 --- /dev/null +++ b/storage/tidesdb/libtidesdb/external/xxhash.c @@ -0,0 +1,42 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2023 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above 
copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* + * xxhash.c instantiates functions defined in xxhash.h + */ + +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ +#define XXH_IMPLEMENTATION /* access definitions */ + +#include "xxhash.h" diff --git a/storage/tidesdb/libtidesdb/external/xxhash.h b/storage/tidesdb/libtidesdb/external/xxhash.h new file mode 100644 index 0000000000000..78fc2e8dbf6db --- /dev/null +++ b/storage/tidesdb/libtidesdb/external/xxhash.h @@ -0,0 +1,7238 @@ +/* + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2023 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/*! + * @mainpage xxHash + * + * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed + * limits. + * + * It is proposed in four flavors, in three families: + * 1. @ref XXH32_family + * - Classic 32-bit hash function. Simple, compact, and runs on almost all + * 32-bit and 64-bit systems. + * 2. @ref XXH64_family + * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most + * 64-bit systems (but _not_ 32-bit systems). + * 3. @ref XXH3_family + * - Modern 64-bit and 128-bit hash function family which features improved + * strength and performance across the board, especially on smaller data. + * It benefits greatly from SIMD and 64-bit without requiring it. + * + * Benchmarks + * --- + * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. + * The open source benchmark program is compiled with clang v10.0 using -O3 flag. 
+ * + * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity | + * | -------------------- | ------- | ----: | ---------------: | ------------------: | + * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 | + * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 | + * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 | + * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 | + * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 | + * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 | + * | RAM sequential read | | N/A | 28.0 GB/s | N/A | + * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 | + * | City64 | | 64 | 22.0 GB/s | 76.6 | + * | T1ha2 | | 64 | 22.0 GB/s | 99.0 | + * | City128 | | 128 | 21.7 GB/s | 57.7 | + * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 | + * | XXH64() | | 64 | 19.4 GB/s | 71.0 | + * | SpookyHash | | 64 | 19.3 GB/s | 53.2 | + * | Mum | | 64 | 18.0 GB/s | 67.0 | + * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 | + * | XXH32() | | 32 | 9.7 GB/s | 71.9 | + * | City32 | | 32 | 9.1 GB/s | 66.0 | + * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 | + * | Murmur3 | | 32 | 3.9 GB/s | 56.1 | + * | SipHash* | | 64 | 3.0 GB/s | 43.2 | + * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 | + * | HighwayHash | | 64 | 1.4 GB/s | 6.0 | + * | FNV64 | | 64 | 1.2 GB/s | 62.7 | + * | Blake2* | | 256 | 1.1 GB/s | 5.1 | + * | SHA1* | | 160 | 0.8 GB/s | 5.6 | + * | MD5* | | 128 | 0.6 GB/s | 7.8 | + * @note + * - Hashes which require a specific ISA extension are noted. SSE2 is also noted, + * even though it is mandatory on x64. + * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic + * by modern standards. + * - Small data velocity is a rough average of algorithm's efficiency for small + * data. For more accurate information, see the wiki. + * - More benchmarks and strength tests are found on the wiki: + * https://github.com/Cyan4973/xxHash/wiki + * + * Usage + * ------ + * All xxHash variants use a similar API. 
Changing the algorithm is a trivial + * substitution. + * + * @pre + * For functions which take an input and length parameter, the following + * requirements are assumed: + * - The range from [`input`, `input + length`) is valid, readable memory. + * - The only exception is if the `length` is `0`, `input` may be `NULL`. + * - For C++, the objects must have the *TriviallyCopyable* property, as the + * functions access bytes directly as if it was an array of `unsigned char`. + * + * @anchor single_shot_example + * **Single Shot** + * + * These functions are stateless functions which hash a contiguous block of memory, + * immediately returning the result. They are the easiest and usually the fastest + * option. + * + * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits() + * + * @code{.c} + * #include <string.h> + * #include "xxhash.h" + * + * // Example for a function which hashes a null terminated string with XXH32(). + * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed) + * { + * // NULL pointers are only valid if the length is zero + * size_t length = (string == NULL) ? 0 : strlen(string); + * return XXH32(string, length, seed); + * } + * @endcode + * + * + * @anchor streaming_example + * **Streaming** + * + * These groups of functions allow incremental hashing of unknown size, even + * more than what would fit in a size_t. + * + * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset() + * + * @code{.c} + * #include <stdio.h> + * #include <assert.h> + * #include "xxhash.h" + * // Example for a function which hashes a FILE incrementally with XXH3_64bits(). + * XXH64_hash_t hashFile(FILE* f) + * { + * // Allocate a state struct. Do not just use malloc() or new. + * XXH3_state_t* state = XXH3_createState(); + * assert(state != NULL && "Out of memory!"); + * // Reset the state to start a new hashing session.
+ * XXH3_64bits_reset(state); + * char buffer[4096]; + * size_t count; + * // Read the file in chunks + * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { + * // Run update() as many times as necessary to process the data + * XXH3_64bits_update(state, buffer, count); + * } + * // Retrieve the finalized hash. This will not change the state. + * XXH64_hash_t result = XXH3_64bits_digest(state); + * // Free the state. Do not use free(). + * XXH3_freeState(state); + * return result; + * } + * @endcode + * + * Streaming functions generate the xxHash value from an incremental input. + * This method is slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * An XXH state must first be allocated using `XXH*_createState()`. + * + * Start a new hash by initializing the state with a seed using `XXH*_reset()`. + * + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. + * + * The function returns an error code, with 0 meaning OK, and any other value + * meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a + * digest, and generate new hash values later on by invoking `XXH*_digest()`. + * + * When done, release the state using `XXH*_freeState()`. + * + * + * @anchor canonical_representation_example + * **Canonical Representation** + * + * The default return values from XXH functions are unsigned 32, 64 and 128 bit + * integers. + * This is the simplest and fastest format for further post-processing. + * + * However, this leaves open the question of what is the order on the byte level, + * since little and big endian conventions will store the same number differently.
+ * + * The canonical representation settles this issue by mandating big-endian + * convention, the same convention as human-readable numbers (large digits first). + * + * When writing hash values to storage, sending them over a network, or printing + * them, it's highly recommended to use the canonical representation to ensure + * portability across a wider range of systems, present and future. + * + * The following functions allow transformation of hash values to and from + * canonical format. + * + * XXH32_canonicalFromHash(), XXH32_hashFromCanonical(), + * XXH64_canonicalFromHash(), XXH64_hashFromCanonical(), + * XXH128_canonicalFromHash(), XXH128_hashFromCanonical(), + * + * @code{.c} + * #include <stdio.h> + * #include "xxhash.h" + * + * // Example for a function which prints XXH32_hash_t in human readable format + * void printXxh32(XXH32_hash_t hash) + * { + * XXH32_canonical_t cano; + * XXH32_canonicalFromHash(&cano, hash); + * size_t i; + * for(i = 0; i < sizeof(cano.digest); ++i) { + * printf("%02x", cano.digest[i]); + * } + * printf("\n"); + * } + * + * // Example for a function which converts XXH32_canonical_t to XXH32_hash_t + * XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano) + * { + * XXH32_hash_t hash = XXH32_hashFromCanonical(&cano); + * return hash; + * } + * @endcode + * + * + * @file xxhash.h + * xxHash prototypes and implementation + */ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Gives access to internal state declaration, required for static allocation. + * + * Incompatible with dynamic linking, due to risks of ABI changes.
+ * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #include "xxhash.h" + * @endcode + */ +# define XXH_STATIC_LINKING_ONLY +/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */ + +/*! + * @brief Gives access to internal definitions. + * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #define XXH_IMPLEMENTATION + * #include "xxhash.h" + * @endcode + */ +# define XXH_IMPLEMENTATION +/* Do not undef XXH_IMPLEMENTATION for Doxygen */ + +/*! + * @brief Exposes the implementation and marks all functions as `inline`. + * + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * @code{.c} + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * @endcode + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +# define XXH_INLINE_ALL +# undef XXH_INLINE_ALL +/*! + * @brief Exposes the implementation without marking functions as inline. + */ +# define XXH_PRIVATE_API +# undef XXH_PRIVATE_API +/*! + * @brief Emulate a namespace by transparently prefixing all symbols. + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. 
+ */ +# define XXH_NAMESPACE /* YOUR NAME HERE */ +# undef XXH_NAMESPACE +#endif + +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((__unused__)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, + * such as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ + /* Before that, we unconditionally #undef all symbols, + * in case they were already defined with XXH_NAMESPACE. 
+ * They will then be redefined for XXH_INLINE_ALL + */ +# undef XXH_versionNumber + /* XXH32 */ +# undef XXH32 +# undef XXH32_createState +# undef XXH32_freeState +# undef XXH32_reset +# undef XXH32_update +# undef XXH32_digest +# undef XXH32_copyState +# undef XXH32_canonicalFromHash +# undef XXH32_hashFromCanonical + /* XXH64 */ +# undef XXH64 +# undef XXH64_createState +# undef XXH64_freeState +# undef XXH64_reset +# undef XXH64_update +# undef XXH64_digest +# undef XXH64_copyState +# undef XXH64_canonicalFromHash +# undef XXH64_hashFromCanonical + /* XXH3_64bits */ +# undef XXH3_64bits +# undef XXH3_64bits_withSecret +# undef XXH3_64bits_withSeed +# undef XXH3_64bits_withSecretandSeed +# undef XXH3_createState +# undef XXH3_freeState +# undef XXH3_copyState +# undef XXH3_64bits_reset +# undef XXH3_64bits_reset_withSeed +# undef XXH3_64bits_reset_withSecret +# undef XXH3_64bits_update +# undef XXH3_64bits_digest +# undef XXH3_generateSecret + /* XXH3_128bits */ +# undef XXH128 +# undef XXH3_128bits +# undef XXH3_128bits_withSeed +# undef XXH3_128bits_withSecret +# undef XXH3_128bits_reset +# undef XXH3_128bits_reset_withSeed +# undef XXH3_128bits_reset_withSecret +# undef XXH3_128bits_reset_withSecretandSeed +# undef XXH3_128bits_update +# undef XXH3_128bits_digest +# undef XXH128_isEqual +# undef XXH128_cmp +# undef XXH128_canonicalFromHash +# undef XXH128_hashFromCanonical + /* Finally, free the namespace itself */ +# undef XXH_NAMESPACE + + /* employ the namespace for XXH_INLINE_ALL */ +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, + * but they must nonetheless be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and has a more dispersed impact. + * Meanwhile, renaming can be achieved in a single place. 
+ */ +# define XXH_IPREF(Id) XXH_NAMESPACE ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/*! @brief Marks a global symbol. 
*/ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +/* XXH32 */ +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +/* XXH64 */ +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +/* XXH3_64bits */ +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, 
XXH3_64bits_withSeed) +# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed) +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) +# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed) +/* XXH3_128bits */ +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) +# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed) +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define 
XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + + +/* ************************************* +* Compiler specifics +***************************************/ + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#if defined (__GNUC__) +# define XXH_CONSTF __attribute__((__const__)) +# define XXH_PUREF __attribute__((__pure__)) +# define XXH_MALLOCF __attribute__((__malloc__)) +#else +# define XXH_CONSTF /* disable */ +# define XXH_PUREF +# define XXH_MALLOCF +#endif + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 8 +#define XXH_VERSION_RELEASE 3 +/*! @brief Version number, encoded as two digits each */ +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) + +/*! + * @brief Obtains the xxHash version. + * + * This is mostly useful when xxHash is compiled as a shared library, + * since the returned value comes from the library, as opposed to header file. + * + * @return @ref XXH_VERSION_NUMBER of the invoked library. + */ +XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void); + + +/* **************************** +* Common basic types +******************************/ +#include /* size_t */ +/*! + * @brief Exit code for the streaming API. 
+ */ +typedef enum { + XXH_OK = 0, /*!< OK */ + XXH_ERROR /*!< Error */ +} XXH_errorcode; + + +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* Don't show include */ +/*! + * @brief An unsigned 32-bit integer. + * + * Not necessarily defined to `uint32_t` but functionally equivalent. + */ +typedef uint32_t XXH32_hash_t; + +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# ifdef _AIX +# include +# else +# include +# endif + typedef uint32_t XXH32_hash_t; + +#else +# include +# if UINT_MAX == 0xFFFFFFFFUL + typedef unsigned int XXH32_hash_t; +# elif ULONG_MAX == 0xFFFFFFFFUL + typedef unsigned long XXH32_hash_t; +# else +# error "unsupported platform: need a 32-bit type" +# endif +#endif + +/*! + * @} + * + * @defgroup XXH32_family XXH32 family + * @ingroup public + * Contains functions used in the classic 32-bit xxHash algorithm. + * + * @note + * XXH32 is useful for older platforms, with no or poor 64-bit performance. + * Note that the @ref XXH3_family provides competitive speed for both 32-bit + * and 64-bit systems, and offers true 64/128 bit hash results. + * + * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families + * @see @ref XXH32_impl for implementation details + * @{ + */ + +/*! + * @brief Calculates the 32-bit hash of @p input using xxHash32. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 32-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. 
+ * + * @return The calculated 32-bit xxHash32 value. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); + +#ifndef XXH_NO_STREAM +/*! + * @typedef struct XXH32_state_s XXH32_state_t + * @brief The opaque state struct for the XXH32 streaming API. + * + * @see XXH32_state_s for details. + * @see @ref streaming_example "Streaming Example" + */ +typedef struct XXH32_state_s XXH32_state_t; + +/*! + * @brief Allocates an @ref XXH32_state_t. + * + * @return An allocated pointer of @ref XXH32_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH32_freeState(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void); +/*! + * @brief Frees an @ref XXH32_state_t. + * + * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState(). + * + * @return @ref XXH_OK. + * + * @note @p statePtr must be allocated with XXH32_createState(). + * + * @see @ref streaming_example "Streaming Example" + * + */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +/*! + * @brief Copies one @ref XXH32_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + +/*! + * @brief Resets an @ref XXH32_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 32-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note This function resets and seeds a state. Call it before @ref XXH32_update(). 
+ * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH32_state_t. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); + +/*! + * @brief Returns the calculated hash value from an @ref XXH32_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated 32-bit xxHash32 value from that state. + * + * @note + * Calling XXH32_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! + * @brief Canonical (big endian) representation of @ref XXH32_hash_t. + */ +typedef struct { + unsigned char digest[4]; /*!< Hash bytes, big endian */ +} XXH32_canonical_t; + +/*! + * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t. + * + * @param dst The @ref XXH32_canonical_t pointer to be stored to. + * @param hash The @ref XXH32_hash_t to be converted. 
+ * + * @pre + * @p dst must not be `NULL`. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); + +/*! + * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t. + * + * @param src The @ref XXH32_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); + + +/*! @cond Doxygen ignores this part */ +#ifdef __has_attribute +# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x) +#else +# define XXH_HAS_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * C23 __STDC_VERSION__ number hasn't been specified yet. For now + * leave as `201711L` (C17 + 1). + * TODO: Update to correct value when it's been specified. + */ +#define XXH_C23_VN 201711L +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* C-language Attributes are added in C23. */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute) +# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x) +#else +# define XXH_HAS_C_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +#if defined(__cplusplus) && defined(__has_cpp_attribute) +# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +# define XXH_HAS_CPP_ATTRIBUTE(x) 0 +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute + * introduced in CPP17 and C23.
+ * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough + * C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough + */ +#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough) +# define XXH_FALLTHROUGH [[fallthrough]] +#elif XXH_HAS_ATTRIBUTE(__fallthrough__) +# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__)) +#else +# define XXH_FALLTHROUGH /* fallthrough */ +#endif +/*! @endcond */ + +/*! @cond Doxygen ignores this part */ +/* + * Define XXH_NOESCAPE for annotated pointers in public API. + * https://clang.llvm.org/docs/AttributeReference.html#noescape + * As of writing this, only supported by clang. + */ +#if XXH_HAS_ATTRIBUTE(noescape) +# define XXH_NOESCAPE __attribute__((__noescape__)) +#else +# define XXH_NOESCAPE +#endif +/*! @endcond */ + + +/*! + * @} + * @ingroup public + * @{ + */ + +#ifndef XXH_NO_LONG_LONG +/*-********************************************************************** +* 64-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* don't include */ +/*! + * @brief An unsigned 64-bit integer. + * + * Not necessarily defined to `uint64_t` but functionally equivalent. + */ +typedef uint64_t XXH64_hash_t; +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# ifdef _AIX +# include +# else +# include +# endif + typedef uint64_t XXH64_hash_t; +#else +# include +# if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL + /* LP64 ABI says uint64_t is unsigned long */ + typedef unsigned long XXH64_hash_t; +# else + /* the following type must have a width of 64-bit */ + typedef unsigned long long XXH64_hash_t; +# endif +#endif + +/*! + * @} + * + * @defgroup XXH64_family XXH64 family + * @ingroup public + * @{ + * Contains functions used in the classic 64-bit xxHash algorithm. 
+ * + * @note + * XXH3 provides competitive speed for both 32-bit and 64-bit systems, + * and offers true 64/128 bit hash results. + * It provides better speed for systems with vector processing capabilities. + */ + +/*! + * @brief Calculates the 64-bit hash of @p input using xxHash64. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit xxHash64 value. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/*! + * @brief The opaque state struct for the XXH64 streaming API. + * + * @see XXH64_state_s for details. + * @see @ref streaming_example "Streaming Example" + */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ + +/*! + * @brief Allocates an @ref XXH64_state_t. + * + * @return An allocated pointer of @ref XXH64_state_t on success. + * @return `NULL` on failure. + * + * @note Must be freed with XXH64_freeState(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); + +/*! + * @brief Frees an @ref XXH64_state_t. + * + * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState(). + * + * @return @ref XXH_OK. + * + * @note @p statePtr must be allocated with XXH64_createState(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); + +/*! 
+ * @brief Copies one @ref XXH64_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +/*! + * @brief Resets an @ref XXH64_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note This function resets and seeds a state. Call it before @ref XXH64_update(). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH64_state_t. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated hash value from an @ref XXH64_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. 
+ * + * @return The calculated 64-bit xxHash64 value from that state. + * + * @note + * Calling XXH64_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ +/******* Canonical representation *******/ + +/*! + * @brief Canonical (big endian) representation of @ref XXH64_hash_t. + */ +typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; + +/*! + * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t. + * + * @param dst The @ref XXH64_canonical_t pointer to be stored to. + * @param hash The @ref XXH64_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash); + +/*! + * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t. + * + * @param src The @ref XXH64_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + * + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src); + +#ifndef XXH_NO_XXH3 + +/*! 
+ * @} + * ************************************************************************ + * @defgroup XXH3_family XXH3 family + * @ingroup public + * @{ + * + * XXH3 is a more recent hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * Compared to XXH64, expect XXH3 to run approximately + * ~2x faster on large inputs and >3x faster on small ones, + * exact differences vary depending on platform. + * + * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, + * but does not require it. + * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 + * at competitive speeds, even without vector support. Further details are + * explained in the implementation. + * + * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD + * implementations for many common platforms: + * - AVX512 + * - AVX2 + * - SSE2 + * - ARM NEON + * - WebAssembly SIMD128 + * - POWER8 VSX + * - s390x ZVector + * This can be controlled via the @ref XXH_VECTOR macro, but it automatically + * selects the best version according to predefined macros. For the x86 family, an + * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c. + * + * XXH3 implementation is portable: + * it has a generic C90 formulation that can be compiled on any platform, + * all implementations generate exactly the same hash value on all platforms. + * Starting from v0.8.0, it's also labelled "stable", meaning that + * any future version will also generate the same hash value. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * + * When only 64 bits are needed, prefer invoking the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. 
+ * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ + +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Unless set explicitly, determined automatically. + */ +# define XXH_SCALAR 0 /*!< Portable scalar version */ +# define XXH_SSE2 1 /*!< SSE2 for Pentium 4, Opteron, all x86_64. */ +# define XXH_AVX2 2 /*!< AVX2 for Haswell and Bulldozer */ +# define XXH_AVX512 3 /*!< AVX512 for Skylake and Icelake */ +# define XXH_NEON 4 /*!< NEON for most ARMv7-A, all AArch64, and WASM SIMD128 */ +# define XXH_VSX 5 /*!< VSX and ZVector for POWER8/z13 (64-bit) */ +# define XXH_SVE 6 /*!< SVE for some ARMv8-A and ARMv9-A */ +# define XXH_LSX 7 /*!< LSX (128-bit SIMD) for LoongArch64 */ + + +/*-********************************************************************** +* XXH3 64-bit variant +************************************************************************/ + +/*! + * @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @note + * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see + * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length); + +/*! 
+ * @brief Calculates 64-bit seeded variant of XXH3 hash of @p input. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @note + * seed == 0 produces the same results as @ref XXH3_64bits(). + * + * This variant generates a custom secret on the fly based on default secret + * altered using the @p seed value. + * + * While this operation is decently fast, note that it's not completely free. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); + +/*! + * The bare minimum size for a custom secret. + * + * @see + * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), + * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). + */ +#define XXH3_SECRET_SIZE_MIN 136 + +/*! + * @brief Calculates 64-bit variant of XXH3 with a custom "secret". + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @return The calculated 64-bit XXH3 hash value. + * + * @pre + * The memory between @p data and @p data + @p len must be valid, + * readable, contiguous memory. However, if @p len is `0`, @p data may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing @ref XXH3_generateSecret() instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. + * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); + + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + */ + +/*! + * @brief The opaque state struct for the XXH3 streaming API. + * + * @see XXH3_state_s for details. + * @see @ref streaming_example "Streaming Example" + */ +typedef struct XXH3_state_s XXH3_state_t; +XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); + +/*! + * @brief Copies one @ref XXH3_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. 
+ * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state); + +/*! + * @brief Resets an @ref XXH3_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generates a secret with default parameters. + * - Call this function before @ref XXH3_64bits_update(). + * - Digest will be equivalent to `XXH3_64bits()`. + * + * @see @ref streaming_example "Streaming Example" + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); + +/*! + * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generates a secret from `seed`. + * - Call this function before @ref XXH3_64bits_update(). + * - Digest will be equivalent to `XXH3_64bits_withSeed()`. + * + * @see @ref streaming_example "Streaming Example" + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); + +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * `secret` is referenced, it _must outlive_ the hash streaming session.
+ * + * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); + +/*! + * @brief Consumes a block of @p input to an @ref XXH3_state_t. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note Call this to incrementally consume blocks of data. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated XXH3 64-bit hash value from that state. + * + * @note + * Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update, + * digest, and update again. 
+ * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* note : canonical representation of XXH3 is the same as XXH64 + * since they both produce XXH64_hash_t values */ + + +/*-********************************************************************** +* XXH3 128-bit variant +************************************************************************/ + +/*! + * @brief The return value from 128-bit hashes. + * + * Stored in little endian order, although the fields themselves are in native + * endianness. + */ +typedef struct { + XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */ + XXH64_hash_t high64; /*!< `value >> 64` */ +} XXH128_hash_t; + +/*! + * @brief Calculates 128-bit unseeded variant of XXH3 of @p data. + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * + * @return The calculated 128-bit variant of XXH3 value. + * + * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead + * for shorter inputs. + * + * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len); +/*! @brief Calculates 128-bit seeded variant of XXH3 hash of @p data. + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @return The calculated 128-bit variant of XXH3 value.
+ * + * @note + * seed == 0 produces the same results as @ref XXH3_128bits(). + * + * This variant generates a custom secret on the fly based on default secret + * altered using the @p seed value. + * + * While this operation is decently fast, note that it's not completely free. + * + * @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); +/*! + * @brief Calculates 128-bit variant of XXH3 with a custom "secret". + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @return The calculated 128-bit variant of XXH3 value. + * + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing @ref XXH3_generateSecret() instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. + * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed.
+ * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + * + * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits(). + * Use already declared XXH3_createState() and XXH3_freeState(). + * + * All reset and streaming functions have same meaning as their 64-bit counterpart. + */ + +/*! + * @brief Resets an @ref XXH3_state_t to begin a new hash. + * + * @param statePtr The state struct to reset. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret with default parameters. + * - Call it before @ref XXH3_128bits_update(). + * - Digest will be equivalent to `XXH3_128bits()`. + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr); + +/*! + * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * - This function resets `statePtr` and generate a secret from `seed`. + * - Call it before @ref XXH3_128bits_update(). + * - Digest will be equivalent to `XXH3_128bits_withSeed()`. 
+ * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed); +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr The state struct to reset. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * `secret` is referenced, it _must outlive_ the hash streaming session. + * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). + * + * @see @ref streaming_example "Streaming Example" + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize); + +/*! + * @brief Consumes a block of @p input to an @ref XXH3_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @note + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + */ +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! 
+ * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated XXH3 128-bit hash value from that state. + * + * @note + * Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ + +/* Following helper functions make it possible to compare XXH128_hash_t values. + * Since XXH128_hash_t is a structure, this capability is not offered by the language. + * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * @brief Check equality of two XXH128_hash_t values + * + * @param h1 The 128-bit hash value. + * @param h2 Another 128-bit hash value. + * + * @return `1` if `h1` and `h2` are equal. + * @return `0` if they are not. + */ +XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * @brief Compares two @ref XXH128_hash_t + * + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. + * + * @param h128_1 Left-hand side value + * @param h128_2 Right-hand side value + * + * @return >0 if @p h128_1 > @p h128_2 + * @return =0 if @p h128_1 == @p h128_2 + * @return <0 if @p h128_1 < @p h128_2 + */ +XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2); + + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t; + + +/*! + * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t. + * + * @param dst The @ref XXH128_canonical_t pointer to be stored to. + * @param hash The @ref XXH128_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`.
+ * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash); + +/*! + * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t. + * + * @param src The @ref XXH128_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + * @see @ref canonical_representation_example "Canonical Representation Example" + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src); + + +#endif /* !XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ + +/*! + * @} + */ +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation + * of XXH states, on stack or in a struct, for example. + * Never **ever** access their members directly. + */ + +/*! + * @internal + * @brief Structure for XXH32 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH32_state_t. + * Do not access the members of this struct directly. 
+ * @see XXH64_state_s, XXH3_state_s + */ +struct XXH32_state_s { + XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ + XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */ + XXH32_hash_t acc[4]; /*!< Accumulator lanes */ + unsigned char buffer[16]; /*!< Internal buffer for partial reads. */ + XXH32_hash_t bufferedSize; /*!< Amount of data in @ref buffer */ + XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +/*! + * @internal + * @brief Structure for XXH64 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH64_state_t. + * Do not access the members of this struct directly. + * @see XXH32_state_s, XXH3_state_s + */ +struct XXH64_state_s { + XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */ + XXH64_hash_t acc[4]; /*!< Accumulator lanes */ + unsigned char buffer[32]; /*!< Internal buffer for partial reads.. */ + XXH32_hash_t bufferedSize; /*!< Amount of data in @ref buffer */ + XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/ + XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. 
*/ +}; /* typedef'd to XXH64_state_t */ + +#ifndef XXH_NO_XXH3 + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */ +# define XXH_ALIGN(n) _Alignas(n) +#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */ +/* In C++ alignas() is a keyword */ +# define XXH_ALIGN(n) alignas(n) +#elif defined(__GNUC__) +# define XXH_ALIGN(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +# define XXH_ALIGN(n) __declspec(align(n)) +#else +# define XXH_ALIGN(n) /* disabled */ +#endif + +/* Old GCC versions only accept the attribute after the type in structures. */ +#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \ + && defined(__GNUC__) +# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) +#else +# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type +#endif + +/*! + * @brief The size of the internal XXH3 buffer. + * + * This is the optimal update size for incremental hashing. + * + * @see XXH3_64b_update(), XXH3_128b_update(). + */ +#define XXH3_INTERNALBUFFER_SIZE 256 + +/*! + * @internal + * @brief Default size of the secret buffer (and @ref XXH3_kSecret). + * + * This is the size used in @ref XXH3_kSecret and the seeded functions. + * + * Not to be confused with @ref XXH3_SECRET_SIZE_MIN. + */ +#define XXH3_SECRET_DEFAULT_SIZE 192 + +/*! + * @internal + * @brief Structure for XXH3 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. + * Otherwise it is an opaque type. + * Never use this definition in combination with dynamic library. + * This allows fields to safely be changed in the future. + * + * @note ** This structure has a strict alignment requirement of 64 bytes!! ** + * Do not allocate this with `malloc()` or `new`, + * it will not be sufficiently aligned. 
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation. + * + * Typedef'd to @ref XXH3_state_t. + * Never access the members of this struct directly. + * + * @see XXH3_INITSTATE() for stack initialization. + * @see XXH3_createState(), XXH3_freeState(). + * @see XXH32_state_s, XXH64_state_s + */ +struct XXH3_state_s { + XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]); + /*!< The 8 accumulators. See @ref XXH32_state_s::acc and @ref XXH64_state_s::acc */ + XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]); + /*!< Used to store a custom secret generated from a seed. */ + XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]); + /*!< The internal buffer. @see XXH32_state_s::buffer */ + XXH32_hash_t bufferedSize; + /*!< The amount of memory in @ref buffer, @see XXH32_state_s::bufferedSize */ + XXH32_hash_t useSeed; + /*!< Reserved field. Needed for padding on 64-bit. */ + size_t nbStripesSoFar; + /*!< Number of stripes processed. */ + XXH64_hash_t totalLen; + /*!< Total length hashed. 64-bit even on 32-bit targets. */ + size_t nbStripesPerBlock; + /*!< Number of stripes per block. */ + size_t secretLimit; + /*!< Size of @ref customSecret or @ref extSecret */ + XXH64_hash_t seed; + /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */ + XXH64_hash_t reserved64; + /*!< Reserved field. */ + const unsigned char* extSecret; + /*!< Reference to an external secret for the _withSecret variants, NULL + * for other variants. */ + /* note: there may be some padding at the end due to alignment on 64 bytes */ +}; /* typedef'd to XXH3_state_t */ + +#undef XXH_ALIGN_MEMBER + +/*! + * @brief Initializes a stack-allocated `XXH3_state_s`. + * + * When the @ref XXH3_state_t structure is merely emplaced on stack, + * it should be initialized with XXH3_INITSTATE() or a memset() + * in case its first reset uses XXH3_NNbits_reset_withSeed().
+ * This init can be omitted if the first reset uses default or _withSecret mode. + * This operation isn't necessary when the state is created with XXH3_createState(). + * Note that this doesn't prepare the state for a streaming operation, + * it's still necessary to use XXH3_NNbits_reset*() afterwards. + */ +#define XXH3_INITSTATE(XXH3_state_ptr) \ + do { \ + XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \ + tmp_xxh3_state_ptr->seed = 0; \ + tmp_xxh3_state_ptr->extSecret = NULL; \ + } while(0) + + +/*! + * @brief Calculates the 128-bit hash of @p data using XXH3. + * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p data and @p data + @p len must be valid, + * readable, contiguous memory. However, if @p len is `0`, @p data may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 128-bit XXH3 value. + * + * @see @ref single_shot_example "Single Shot Example" for an example. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed); + + +/* === Experimental API === */ +/* Symbols defined below must be considered tied to a specific library version. */ + +/*! + * @brief Derive a high-entropy secret from any user-defined content, named customSeed. + * + * @param secretBuffer A writable buffer for derived high-entropy secret data. + * @param secretSize Size of secretBuffer, in bytes. Must be >= XXH3_SECRET_SIZE_MIN. + * @param customSeed A user-defined content. + * @param customSeedSize Size of customSeed, in bytes. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * The generated secret can be used in combination with `*_withSecret()` functions. 
+ * The `_withSecret()` variants are useful to provide a higher level of protection + * than 64-bit seed, as it becomes much more difficult for an external actor to + * guess how to impact the calculation logic. + * + * The function accepts as input a custom seed of any length and any content, + * and derives from it a high-entropy secret of length @p secretSize into an + * already allocated buffer @p secretBuffer. + * + * The generated secret can then be used with any `*_withSecret()` variant. + * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(), + * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret() + * are part of this list. They all accept a `secret` parameter + * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN) + * _and_ feature very high entropy (consist of random-looking bytes). + * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can + * be employed to ensure proper quality. + * + * @p customSeed can be anything. It can have any size, even small ones, + * and its content can be anything, even "poor entropy" sources such as a bunch + * of zeroes. The resulting `secret` will nonetheless provide all required qualities. + * + * @pre + * - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN + * - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior. + * + * Example code: + * @code{.c} + * #include <stdio.h> + * #include <stdlib.h> + * #include <string.h> + * #define XXH_STATIC_LINKING_ONLY // expose unstable API + * #include "xxhash.h" + * // Hashes argv[2] using the entropy from argv[1].
+ * int main(int argc, char* argv[]) + * { + * char secret[XXH3_SECRET_SIZE_MIN]; + * if (argc != 3) { return 1; } + * XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1])); + * XXH64_hash_t h = XXH3_64bits_withSecret( + * argv[2], strlen(argv[2]), + * secret, sizeof(secret) + * ); + * printf("%016llx\n", (unsigned long long) h); + * } + * @endcode + */ +XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize); + +/*! + * @brief Generate the same secret as the _withSeed() variants. + * + * @param secretBuffer A writable buffer of @ref XXH3_SECRET_DEFAULT_SIZE bytes + * @param seed The 64-bit seed to alter the hash result predictably. + * + * The generated secret can be used in combination with + *`*_withSecret()` and `_withSecretandSeed()` variants. + * + * Example C++ `std::string` hash class: + * @code{.cpp} + * #include <string> + * #define XXH_STATIC_LINKING_ONLY // expose unstable API + * #include "xxhash.h" + * // Slow, seeds each time + * class HashSlow { + * XXH64_hash_t seed; + * public: + * HashSlow(XXH64_hash_t s) : seed{s} {} + * size_t operator()(const std::string& x) const { + * return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)}; + * } + * }; + * // Fast, caches the seeded secret for future uses. + * class HashFast { + * unsigned char secret[XXH3_SECRET_DEFAULT_SIZE]; + * public: + * HashFast(XXH64_hash_t s) { + * XXH3_generateSecret_fromSeed(secret, s); + * } + * size_t operator()(const std::string& x) const { + * return size_t{ + * XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret)) + * }; + * } + * }; + * @endcode + */ +XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed); + +/*! + * @brief Maximum size of "short" key in bytes. + */ +#define XXH3_MIDSIZE_MAX 240 + +/*! + * @brief Calculates 64/128-bit seeded variant of XXH3 hash of @p data.
+ * + * @param data The block of data to be hashed, at least @p len bytes in size. + * @param len The length of @p data, in bytes. + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * These variants generate hash values using either: + * - @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes) + * - @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX). + * + * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`. + * `_withSeed()` has to generate the secret on the fly for "large" keys. + * It's fast, but can be perceptible for "not so large" keys (< 1 KB). + * `_withSecret()` has to generate the masks on the fly for "small" keys, + * which requires more instructions than _withSeed() variants. + * Therefore, _withSecretandSeed variant combines the best of both worlds. + * + * When @p secret has been generated by XXH3_generateSecret_fromSeed(), + * this variant produces *exactly* the same results as `_withSeed()` variant, + * hence offering only a pure speed benefit on "large" input, + * by skipping the need to regenerate the secret for every large input. + * + * Another usage scenario is to hash the secret to a 64-bit hash value, + * for example with XXH3_64bits(), which then becomes the seed, + * and then employ both the seed and the secret in _withSecretandSeed(). + * On top of speed, an added benefit is that each bit in the secret + * has a 50% chance to swap each bit in the output, via its impact to the seed. + * + * This is not guaranteed when using the secret directly in "small data" scenarios, + * because only portions of the secret are employed for small data. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t +XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed); + +/*! 
+ * @brief Calculates 128-bit seeded variant of XXH3 hash of @p input. + * + * @param input The memory segment to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param secret The secret used to alter hash result predictably. + * @param secretSize The length of @p secret, in bytes (must be >= XXH3_SECRET_SIZE_MIN) + * @param seed64 The 64-bit seed to alter the hash result predictably. + * + * @return The calculated 128-bit variant of XXH3 value, + * returned as an @ref XXH128_hash_t. + * + * @see XXH3_64bits_withSecretandSeed(): contract is the same. + */ +XXH_PUBLIC_API XXH_PUREF XXH128_hash_t +XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed64); + +#ifndef XXH_NO_STREAM +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * @param seed64 The 64-bit seed to alter the hash result predictably. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @see XXH3_64bits_withSecretandSeed(). Contract is identical. + */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed64); + +/*! + * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash. + * + * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState(). + * @param secret The secret data. + * @param secretSize The length of @p secret, in bytes. + * @param seed64 The 64-bit seed to alter the hash result predictably. + * + * @return @ref XXH_OK on success. + * @return @ref XXH_ERROR on failure. + * + * @see XXH3_64bits_withSecretandSeed(). Contract is identical.
+ * + * Note: there was a bug in an earlier version of this function (<= v0.8.2) + * that would make it generate an incorrect hash value + * when @p seed == 0 and @p length < XXH3_MIDSIZE_MAX + * and @p secret is different from XXH3_generateSecret_fromSeed(). + * As stated in the contract, the correct hash result must be + * the same as XXH3_128bits_withSeed() when @p length <= XXH3_MIDSIZE_MAX. + * Results generated by this older version are wrong, hence not comparable. + */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, + XXH_NOESCAPE const void* secret, size_t secretSize, + XXH64_hash_t seed64); + +#endif /* !XXH_NO_STREAM */ + +#endif /* !XXH_NO_XXH3 */ +#endif /* XXH_NO_LONG_LONG */ +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# define XXH_IMPLEMENTATION +#endif + +#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ + + +/* ======================================================================== */ +/* ======================================================================== */ +/* ======================================================================== */ + + +/*-********************************************************************** + * xxHash implementation + *-********************************************************************** + * xxHash's implementation used to be hosted inside xxhash.c. + * + * However, inlining requires implementation to be visible to the compiler, + * hence be included alongside the header. + * Previously, implementation was hosted inside xxhash.c, + * which was then #included when inlining was activated. + * This construction created issues with a few build and install systems, + * as it required xxhash.c to be stored in /include directory. + * + * xxHash implementation is now directly integrated within xxhash.h. + * As a consequence, xxhash.c is no longer needed in /include. 
+ * + * xxhash.c is still available and is still useful. + * In a "normal" setup, when xxhash is not inlined, + * xxhash.h only exposes the prototypes and public symbols, + * while xxhash.c can be built into an object file xxhash.o + * which can then be linked into the final binary. + ************************************************************************/ + +#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ + || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) +# define XXH_IMPLEM_13a8737387 + +/* ************************************* +* Tuning parameters +***************************************/ + +/*! + * @defgroup tuning Tuning parameters + * @{ + * + * Various macros to control xxHash's behavior. + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Define this to disable 64-bit code. + * + * Useful if only using the @ref XXH32_family and you have a strict C90 compiler. + */ +# define XXH_NO_LONG_LONG +# undef XXH_NO_LONG_LONG /* don't actually */ +/*! + * @brief Controls how unaligned memory is accessed. + * + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. + * + * Unfortunately, on some target/compiler combinations, the generated assembly + * is sub-optimal. + * + * The below switch allow selection of a different access method + * in the search for improved performance. + * + * @par Possible options: + * + * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy` + * @par + * Use `memcpy()`. Safe and portable. Note that most modern compilers will + * eliminate the function call and treat it as an unaligned access. + * + * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))` + * @par + * Depends on compiler extensions and is therefore not portable. + * This method is safe _if_ your compiler supports it, + * and *generally* as fast or faster than `memcpy`. + * + * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast + * @par + * Casts directly and dereferences. 
This method doesn't depend on the + * compiler, but it violates the C standard as it directly dereferences an + * unaligned pointer. It can generate buggy code on targets which do not + * support unaligned memory accesses, but in some circumstances, it's the + * only known way to get the most performance. + * + * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift + * @par + * Also portable. This can generate the best code on old compilers which don't + * inline small `memcpy()` calls, and it might also be faster on big-endian + * systems which lack a native byteswap instruction. However, some compilers + * will emit literal byteshifts even if the target supports unaligned access. + * + * + * @warning + * Methods 1 and 2 rely on implementation-defined behavior. Use these with + * care, as what works on one compiler/platform/optimization level may cause + * another to read garbage data or even crash. + * + * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details. + * + * Prefer these methods in priority order (0 > 3 > 1 > 2) + */ +# define XXH_FORCE_MEMORY_ACCESS 0 + +/*! + * @def XXH_SIZE_OPT + * @brief Controls how much xxHash optimizes for size. + * + * xxHash, when compiled, tends to result in a rather large binary size. This + * is mostly due to heavy usage to forced inlining and constant folding of the + * @ref XXH3_family to increase performance. + * + * However, some developers prefer size over speed. This option can + * significantly reduce the size of the generated code. When using the `-Os` + * or `-Oz` options on GCC or Clang, this is defined to 1 by default, + * otherwise it is defined to 0. + * + * Most of these size optimizations can be controlled manually. + * + * This is a number from 0-2. + * - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed + * comes first. + * - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more + * conservative and disables hacks that increase code size. 
It implies the + * options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0, + * and @ref XXH3_NEON_LANES == 8 if they are not already defined. + * - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible. + * Performance may cry. For example, the single shot functions just use the + * streaming API. + */ +# define XXH_SIZE_OPT 0 + +/*! + * @def XXH_FORCE_ALIGN_CHECK + * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32() + * and XXH64() only). + * + * This is an important performance trick for architectures without decent + * unaligned memory access performance. + * + * It checks for input alignment, and when conditions are met, uses a "fast + * path" employing direct 32-bit/64-bit reads, resulting in _dramatically + * faster_ read speed. + * + * The check costs one initial branch per hash, which is generally negligible, + * but not zero. + * + * Moreover, it's not useful to generate an additional code path if memory + * access uses the same instruction for both aligned and unaligned + * addresses (e.g. x86 and aarch64). + * + * In these cases, the alignment check can be removed by setting this macro to 0. + * Then the code will always use unaligned memory access. + * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips + * which are platforms known to offer good unaligned memory accesses performance. + * + * It is also disabled by default when @ref XXH_SIZE_OPT >= 1. + * + * This option does not affect XXH3 (only XXH32 and XXH64). + */ +# define XXH_FORCE_ALIGN_CHECK 0 + +/*! + * @def XXH_NO_INLINE_HINTS + * @brief When non-zero, sets all functions to `static`. + * + * By default, xxHash tries to force the compiler to inline almost all internal + * functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary which + * might not be favorable. 
+ * + * Additionally, sometimes the forced inlining can be detrimental to performance, + * depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control on whether to inline or not. + * + * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if + * @ref XXH_SIZE_OPT >= 1, this will automatically be defined. + */ +# define XXH_NO_INLINE_HINTS 0 + +/*! + * @def XXH3_INLINE_SECRET + * @brief Determines whether to inline the XXH3 withSecret code. + * + * When the secret size is known, the compiler can improve the performance + * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret(). + * + * However, if the secret size is not known, it doesn't have any benefit. This + * happens when xxHash is compiled into a global symbol. Therefore, if + * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0. + * + * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers + * that are *sometimes* force inline on -Og, and it is impossible to automatically + * detect this optimization level. + */ +# define XXH3_INLINE_SECRET 0 + +/*! + * @def XXH32_ENDJMP + * @brief Whether to use a jump for `XXH32_finalize`. + * + * For performance, `XXH32_finalize` uses multiple branches in the finalizer. + * This is generally preferable for performance, + * but depending on exact architecture, a jmp may be preferable. + * + * This setting is only possibly making a difference for very small inputs. + */ +# define XXH32_ENDJMP 0 + +/*! + * @internal + * @brief Redefines old internal names. + * + * For compatibility with code that uses xxHash's internals before the names + * were changed to improve namespacing. There is no other reason to use this. + */ +# define XXH_OLD_NAMES +# undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ + +/*! + * @def XXH_NO_STREAM + * @brief Disables the streaming API. 
+ * + * When xxHash is not inlined and the streaming functions are not used, disabling + * the streaming functions can improve code size significantly, especially with + * the @ref XXH3_family which tends to make constant folded copies of itself. + */ +# define XXH_NO_STREAM +# undef XXH_NO_STREAM /* don't actually */ +#endif /* XXH_DOXYGEN */ +/*! + * @} + */ + +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ + /* prefer __packed__ structures (method 1) for GCC + * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy + * which for some reason does unaligned loads. */ +# if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED)) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +#ifndef XXH_SIZE_OPT + /* default to 1 for -Os or -Oz */ +# if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__) +# define XXH_SIZE_OPT 1 +# else +# define XXH_SIZE_OPT 0 +# endif +#endif + +#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */ + /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */ +# if XXH_SIZE_OPT >= 1 || \ + defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \ + || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */ +# define XXH_FORCE_ALIGN_CHECK 0 +# else +# define XXH_FORCE_ALIGN_CHECK 1 +# endif +#endif + +#ifndef XXH_NO_INLINE_HINTS +# if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__) /* -O0, -fno-inline */ +# define XXH_NO_INLINE_HINTS 1 +# else +# define XXH_NO_INLINE_HINTS 0 +# endif +#endif + +#ifndef XXH3_INLINE_SECRET +# if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \ + || !defined(XXH_INLINE_ALL) +# define XXH3_INLINE_SECRET 0 +# else +# define XXH3_INLINE_SECRET 1 +# endif +#endif + +#ifndef XXH32_ENDJMP +/* generally preferable for performance */ +# define 
XXH32_ENDJMP 0 +#endif + +/*! + * @defgroup impl Implementation + * @{ + */ + + +/* ************************************* +* Includes & Memory related functions +***************************************/ +#if defined(XXH_NO_STREAM) +/* nothing */ +#elif defined(XXH_NO_STDLIB) + +/* When requesting to disable any mention of stdlib, + * the library loses the ability to invoke malloc / free. + * In practice, it means that functions like `XXH*_createState()` + * will always fail, and return NULL. + * This flag is useful in situations where + * xxhash.h is integrated into some kernel, embedded or limited environment + * without access to dynamic allocation. + */ + +static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; } +static void XXH_free(void* p) { (void)p; } + +#else + +/* + * Modify the local functions below should you wish to use + * different memory routines for malloc() and free() + */ +#include <stdlib.h> + +/*! + * @internal + * @brief Modify this function to use a different routine than malloc(). + */ +static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); } + +/*! + * @internal + * @brief Modify this function to use a different routine than free(). + */ +static void XXH_free(void* p) { free(p); } + +#endif /* XXH_NO_STDLIB */ + +#include <string.h> + +/*! + * @internal + * @brief Modify this function to use a different routine than memcpy().
+ */ +static void* XXH_memcpy(void* dest, const void* src, size_t size) +{ + return memcpy(dest,src,size); +} + +#include <limits.h> /* ULLONG_MAX */ + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio warning fix */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + +#if XXH_NO_INLINE_HINTS /* disable inlining hints */ +# if defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __attribute__((__unused__)) +# else +# define XXH_FORCE_INLINE static +# endif +# define XXH_NO_INLINE static +/* enable inlining hints */ +#elif defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __inline__ __attribute__((__always_inline__, __unused__)) +# define XXH_NO_INLINE static __attribute__((__noinline__)) +#elif defined(_MSC_VER) /* Visual Studio */ +# define XXH_FORCE_INLINE static __forceinline +# define XXH_NO_INLINE static __declspec(noinline) +#elif defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */ +# define XXH_FORCE_INLINE static inline +# define XXH_NO_INLINE static +#else +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +#endif + +#if defined(XXH_INLINE_ALL) +# define XXH_STATIC XXH_FORCE_INLINE +#else +# define XXH_STATIC static +#endif + +#if XXH3_INLINE_SECRET +# define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE +#else +# define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE +#endif + +#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++.
Tested with GCC 5.5 */ +# define XXH_RESTRICT /* disable */ +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ + || (defined (__clang__)) \ + || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \ + || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)) +/* + * There are a LOT more compilers that recognize __restrict but this + * covers the major ones. + */ +# define XXH_RESTRICT __restrict +#else +# define XXH_RESTRICT /* disable */ +#endif + +/* ************************************* +* Debug +***************************************/ +/*! + * @ingroup tuning + * @def XXH_DEBUGLEVEL + * @brief Sets the debugging level. + * + * XXH_DEBUGLEVEL is expected to be defined externally, typically via the + * compiler's command line options. The value must be a number. + */ +#ifndef XXH_DEBUGLEVEL +# ifdef DEBUGLEVEL /* backwards compat */ +# define XXH_DEBUGLEVEL DEBUGLEVEL +# else +# define XXH_DEBUGLEVEL 0 +# endif +#endif + +#if (XXH_DEBUGLEVEL>=1) +# include <assert.h> /* note: can still be disabled with NDEBUG */ +# define XXH_ASSERT(c) assert(c) +#else +# if defined(__INTEL_COMPILER) +# define XXH_ASSERT(c) XXH_ASSUME((unsigned char) (c)) +# else +# define XXH_ASSERT(c) XXH_ASSUME(c) +# endif +#endif + +/* note: use after variable declarations */ +#ifndef XXH_STATIC_ASSERT +# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */ +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0) +# elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */ +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0) +# else +# define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0) +# endif +# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c) +#endif + +/*!
+ * @internal + * @def XXH_COMPILER_GUARD(var) + * @brief Used to prevent unwanted optimizations for @p var. + * + * It uses an empty GCC inline assembly statement with a register constraint + * which forces @p var into a general purpose register (eg eax, ebx, ecx + * on x86) and marks it as modified. + * + * This is used in a few places to avoid unwanted autovectorization (e.g. + * XXH32_round()). All vectorization we want is explicit via intrinsics, + * and _usually_ isn't wanted elsewhere. + * + * We also use it to prevent unwanted constant folding for AArch64 in + * XXH3_initCustomSecret_scalar(). + */ +#if defined(__GNUC__) || defined(__clang__) +# define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var)) +#else +# define XXH_COMPILER_GUARD(var) ((void)0) +#endif + +/* Specifically for NEON vectors which use the "w" constraint, on + * Clang. */ +#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__) +# define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var)) +#else +# define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0) +#endif + +/* ************************************* +* Basic Types +***************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# ifdef _AIX +# include <inttypes.h> +# else +# include <stdint.h> +# endif + typedef uint8_t xxh_u8; +#else + typedef unsigned char xxh_u8; +#endif +typedef XXH32_hash_t xxh_u32; + +#ifdef XXH_OLD_NAMES +# warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly" +# define BYTE xxh_u8 +# define U8 xxh_u8 +# define U32 xxh_u32 +#endif + +/* *** Memory access *** */ + +/*! + * @internal + * @fn xxh_u32 XXH_read32(const void* ptr) + * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32(const void* ptr) + * @brief Reads an unaligned 32-bit little endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readBE32(const void* ptr) + * @brief Reads an unaligned 32-bit big endian integer from @p ptr. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * + * @param ptr The pointer to read from. + * @return The 32-bit big endian integer from the bytes at @p ptr. + */ + +/*! + * @internal + * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align) + * @brief Like @ref XXH_readLE32(), but has an option for aligned reads. + * + * Affected by @ref XXH_FORCE_MEMORY_ACCESS. + * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is + * always @ref XXH_alignment::XXH_unaligned. + * + * @param ptr The pointer to read from. + * @param align Whether @p ptr is aligned. + * @pre + * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte + * aligned. + * @return The 32-bit little endian integer from the bytes at @p ptr. + */ + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE32 and XXH_readBE32. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* + * Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware. + */ +static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. 
Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; } __attribute__((__packed__)) unalign; +#endif +static xxh_u32 XXH_read32(const void* ptr) +{ + typedef __attribute__((__aligned__(1))) xxh_u32 xxh_unalign32; + return *((const xxh_unalign32*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. + * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u32 XXH_read32(const void* memPtr) +{ + xxh_u32 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_MEMORY_ACCESS */ + + +/* *** Endianness *** */ + +/*! + * @ingroup tuning + * @def XXH_CPU_LITTLE_ENDIAN + * @brief Whether the target is little endian. + * + * Defined to 1 if the target is little endian, or 0 if it is big endian. + * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, + * a runtime check (which is usually constant folded) is used instead. + * + * @note + * This is not necessarily defined to an integer constant. + * + * @see XXH_isLittleEndian() for the runtime check. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +/*! + * @internal + * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN. + * + * Most compilers will constant fold this.
+ */ +static int XXH_isLittleEndian(void) +{ + /* + * Portable and well-defined behavior. + * Don't use static: it is detrimental to performance. + */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifdef __has_builtin +# define XXH_HAS_BUILTIN(x) __has_builtin(x) +#else +# define XXH_HAS_BUILTIN(x) 0 +#endif + + + +/* + * C23 and future versions have standard "unreachable()". + * Once it has been implemented reliably we can add it as an + * additional case: + * + * ``` + * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) + * # include <stddef.h> + * # ifdef unreachable + * # define XXH_UNREACHABLE() unreachable() + * # endif + * #endif + * ``` + * + * Note C++23 also has std::unreachable() which can be detected + * as follows: + * ``` + * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L) + * # include <utility> + * # define XXH_UNREACHABLE() std::unreachable() + * #endif + * ``` + * NB: `__cpp_lib_unreachable` is defined in the `<version>` header. + * We don't use that as including `<utility>` in `extern "C"` blocks + * doesn't work on GCC12 + */ + +#if XXH_HAS_BUILTIN(__builtin_unreachable) +# define XXH_UNREACHABLE() __builtin_unreachable() + +#elif defined(_MSC_VER) +# define XXH_UNREACHABLE() __assume(0) + +#else +# define XXH_UNREACHABLE() +#endif + +#if XXH_HAS_BUILTIN(__builtin_assume) +# define XXH_ASSUME(c) __builtin_assume(c) +#else +# define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); } +#endif + +/*! + * @internal + * @def XXH_rotl32(x,r) + * @brief 32-bit rotate left. + * + * @param x The 32-bit integer to be rotated. + * @param r The number of bits to rotate.
+ * @pre + * @p r > 0 && @p r < 32 + * @note + * @p x and @p r may be evaluated multiple times. + * @return The rotated result. + */ +#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ + && XXH_HAS_BUILTIN(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +#elif XXH_HAS_BUILTIN(__builtin_stdc_rotate_left) +# define XXH_rotl32 __builtin_stdc_rotate_left +# define XXH_rotl64 __builtin_stdc_rotate_left +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +/*! + * @internal + * @fn xxh_u32 XXH_swap32(xxh_u32 x) + * @brief A 32-bit byteswap. + * + * @param x The 32-bit integer to byteswap. + * @return @p x, byteswapped. + */ +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ + +/*! + * @internal + * @brief Enum to indicate whether a pointer is aligned. + */ +typedef enum { + XXH_aligned, /*!< Aligned */ + XXH_unaligned /*!< Possibly unaligned */ +} XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. 
+ */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +/*! @ingroup public */ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +/*! + * @} + * @defgroup XXH32_impl XXH32 implementation + * @ingroup impl + * + * Details on the XXH32 implementation. 
+ * @{ + */ + /* #define instead of static const, to be used as initializers */ +#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ +#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ +#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ +#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ +#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ + +#ifdef XXH_OLD_NAMES +# define PRIME32_1 XXH_PRIME32_1 +# define PRIME32_2 XXH_PRIME32_2 +# define PRIME32_3 XXH_PRIME32_3 +# define PRIME32_4 XXH_PRIME32_4 +# define PRIME32_5 XXH_PRIME32_5 +#endif + +/*! + * @internal + * @brief Normal stripe processing routine. + * + * This shuffles the bits so that any bit from @p input impacts several bits in + * @p acc. + * + * @param acc The accumulator lane. + * @param input The stripe of input to mix. + * @return The mixed accumulator lane. + */ +static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) +{ + acc += input * XXH_PRIME32_2; + acc = XXH_rotl32(acc, 13); + acc *= XXH_PRIME32_1; +#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * UGLY HACK: + * A compiler fence is used to prevent GCC and Clang from + * autovectorizing the XXH32 loop (pragmas and attributes don't work for some + * reason) without globally disabling SSE4.1. + * + * The reason we want to avoid vectorization is because despite working on + * 4 integers at a time, there are multiple factors slowing XXH32 down on + * SSE4: + * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on + * newer chips!) making it slightly slower to multiply four integers at + * once compared to four integers independently. Even when pmulld was + * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE + * just to multiply unless doing a long operation. 
+ * + * - Four instructions are required to rotate, + * movdqa tmp, v // not required with VEX encoding + * pslld tmp, 13 // tmp <<= 13 + * psrld v, 19 // v >>= 19 + * por v, tmp // v |= tmp + * compared to one for scalar: + * roll v, 13 // reliably fast across the board + * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason + * + * - Instruction level parallelism is actually more beneficial here because + * the SIMD actually serializes this operation: While v1 is rotating, v2 + * can load data, while v3 can multiply. SSE forces them to operate + * together. + * + * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing + * the loop. NEON is only faster on the A53, and with the newer cores, it is less + * than half the speed. + * + * Additionally, this is used on WASM SIMD128 because it JITs to the same + * SIMD instructions and has the same issue. + */ + XXH_COMPILER_GUARD(acc); +#endif + return acc; +} + +/*! + * @internal + * @brief Mixes all bits to finalize the hash. + * + * The final mix ensures that all input bits have a chance to impact any bit in + * the output digest, resulting in an unbiased distribution. + * + * @param hash The hash to avalanche. + * @return The avalanched hash. + */ +static xxh_u32 XXH32_avalanche(xxh_u32 hash) +{ + hash ^= hash >> 15; + hash *= XXH_PRIME32_2; + hash ^= hash >> 13; + hash *= XXH_PRIME32_3; + hash ^= hash >> 16; + return hash; +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +/*! + * @internal + * @brief Sets up the initial accumulator state for XXH32(). + * + * @param acc The four accumulator lanes to initialize; acc[0] through acc[3] are written. + * @param seed The seed value each lane is derived from. + */ +XXH_FORCE_INLINE void +XXH32_initAccs(xxh_u32 *acc, xxh_u32 seed) +{ + XXH_ASSERT(acc != NULL); + acc[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + acc[1] = seed + XXH_PRIME32_2; + acc[2] = seed + 0; + acc[3] = seed - XXH_PRIME32_1; +} + +/*! + * @internal + * @brief Consumes a block of data for XXH32(). + * + * @return the end input pointer.
+ */ +XXH_FORCE_INLINE const xxh_u8 * +XXH32_consumeLong( + xxh_u32 *XXH_RESTRICT acc, + xxh_u8 const *XXH_RESTRICT input, + size_t len, + XXH_alignment align +) +{ + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 15; + XXH_ASSERT(acc != NULL); + XXH_ASSERT(input != NULL); + XXH_ASSERT(len >= 16); + do { + acc[0] = XXH32_round(acc[0], XXH_get32bits(input)); input += 4; + acc[1] = XXH32_round(acc[1], XXH_get32bits(input)); input += 4; + acc[2] = XXH32_round(acc[2], XXH_get32bits(input)); input += 4; + acc[3] = XXH32_round(acc[3], XXH_get32bits(input)); input += 4; + } while (input < limit); + + return input; +} + +/*! + * @internal + * @brief Merges the accumulator lanes together for XXH32() + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u32 +XXH32_mergeAccs(const xxh_u32 *acc) +{ + XXH_ASSERT(acc != NULL); + return XXH_rotl32(acc[0], 1) + XXH_rotl32(acc[1], 7) + + XXH_rotl32(acc[2], 12) + XXH_rotl32(acc[3], 18); +} + +/*! + * @internal + * @brief Processes the last 0-15 bytes of @p ptr. + * + * There may be up to 15 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 16. + * @param align Whether @p ptr is aligned. + * @return The finalized hash. + * @see XXH64_finalize(). 
+ */ +static XXH_PUREF xxh_u32 +XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define XXH_PROCESS1 do { \ + hash += (*ptr++) * XXH_PRIME32_5; \ + hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \ +} while (0) + +#define XXH_PROCESS4 do { \ + hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \ + ptr += 4; \ + hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ +} while (0) + + if (ptr==NULL) XXH_ASSERT(len == 0); + + /* Compact rerolled version; generally faster */ + if (!XXH32_ENDJMP) { + len &= 15; + while (len >= 4) { + XXH_PROCESS4; + len -= 4; + } + while (len > 0) { + XXH_PROCESS1; + --len; + } + return XXH32_avalanche(hash); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 8: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 4: XXH_PROCESS4; + return XXH32_avalanche(hash); + + case 13: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 9: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 5: XXH_PROCESS4; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 14: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 10: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 6: XXH_PROCESS4; + XXH_PROCESS1; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 15: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 11: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 7: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 3: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 2: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 1: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 0: return XXH32_avalanche(hash); + } + XXH_ASSERT(0); + return hash; /* reaching this point is deemed impossible */ + } +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1 XXH_PROCESS1 +# define PROCESS4 XXH_PROCESS4 +#else +# undef XXH_PROCESS1 +# undef XXH_PROCESS4 +#endif + +/*! 
+ * @internal + * @brief The implementation for @ref XXH32(). + * + * @param input , len , seed Directly passed from @ref XXH32(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + xxh_u32 h32; + + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=16) { + xxh_u32 acc[4]; + XXH32_initAccs(acc, seed); + + input = XXH32_consumeLong(acc, input, len, align); + + h32 = XXH32_mergeAccs(acc); + } else { + h32 = seed + XXH_PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! 
@ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + XXH32_initAccs(statePtr->acc, seed); + return XXH_OK; +} + + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + XXH_ASSERT(state->bufferedSize < sizeof(state->buffer)); + if (len < sizeof(state->buffer) - state->bufferedSize) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + + { const xxh_u8* xinput = (const xxh_u8*)input; + const xxh_u8* const bEnd = xinput + len; + + if (state->bufferedSize) { /* non-empty buffer: complete first */ + XXH_memcpy(state->buffer + state->bufferedSize, xinput, sizeof(state->buffer) - state->bufferedSize); + xinput += sizeof(state->buffer) - state->bufferedSize; + /* then process one round */ + (void)XXH32_consumeLong(state->acc, state->buffer, sizeof(state->buffer), XXH_aligned); + state->bufferedSize = 0; + } + + XXH_ASSERT(xinput <= bEnd); + if ((size_t)(bEnd - xinput) >= sizeof(state->buffer)) { + /* Process the remaining data */ + xinput = XXH32_consumeLong(state->acc, xinput, (size_t)(bEnd - xinput), XXH_unaligned); + } + + if (xinput < bEnd) { + /* Copy the leftover to the tmp buffer */ + XXH_memcpy(state->buffer, xinput, (size_t)(bEnd-xinput)); + state->bufferedSize = (unsigned)(bEnd-xinput); + } + } + + return XXH_OK; +} + + +/*! 
@ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH32_mergeAccs(state->acc); + } else { + h32 = state->acc[2] /* == seed */ + XXH_PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, state->buffer, state->bufferedSize, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ +/*! + * @} + * @ingroup impl + * @{ + */ +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + +#ifdef XXH_OLD_NAMES +# define U64 xxh_u64 +#endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + return *(const xxh_u64*) memPtr; +} + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __attribute__((aligned(1))) is supported by gcc and clang. 
Originally the + * documentation claimed that it only increased the alignment, but actually it + * can decrease it on gcc, clang, and icc: + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502, + * https://gcc.godbolt.org/z/xYez1j67Y. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((__packed__)) unalign64; +#endif +static xxh_u64 XXH_read64(const void* ptr) +{ + typedef __attribute__((__aligned__(1))) xxh_u64 xxh_unalign64; + return *((const xxh_unalign64*)ptr); +} + +#else + +/* + * Portable and safe solution. Generally efficient. + * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + XXH_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64(xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. 
*/ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ +/*! + * @} + * @defgroup XXH64_impl XXH64 implementation + * @ingroup impl + * + * Details on the XXH64 implementation. 
+ * @{ + */ +/* #define rather that static const, to be used as initializers */ +#define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */ +#define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */ +#define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */ +#define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */ +#define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +#ifdef XXH_OLD_NAMES +# define PRIME64_1 XXH_PRIME64_1 +# define PRIME64_2 XXH_PRIME64_2 +# define PRIME64_3 XXH_PRIME64_3 +# define PRIME64_4 XXH_PRIME64_4 +# define PRIME64_5 XXH_PRIME64_5 +#endif + +/*! @copydoc XXH32_round */ +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * XXH_PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= XXH_PRIME64_1; +#if (defined(__AVX512F__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * DISABLE AUTOVECTORIZATION: + * A compiler fence is used to prevent GCC and Clang from + * autovectorizing the XXH64 loop (pragmas and attributes don't work for some + * reason) without globally disabling AVX512. + * + * Autovectorization of XXH64 tends to be detrimental, + * though the exact outcome may change depending on exact cpu and compiler version. + * For information, it has been reported as detrimental for Skylake-X, + * but possibly beneficial for Zen4. + * + * The default is to disable auto-vectorization, + * but you can select to enable it instead using `XXH_ENABLE_AUTOVECTORIZE` build variable. + */ + XXH_COMPILER_GUARD(acc); +#endif + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; + return acc; +} + +/*! 
@copydoc XXH32_avalanche */ +static xxh_u64 XXH64_avalanche(xxh_u64 hash) +{ + hash ^= hash >> 33; + hash *= XXH_PRIME64_2; + hash ^= hash >> 29; + hash *= XXH_PRIME64_3; + hash ^= hash >> 32; + return hash; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +/*! + * @internal + * @brief Sets up the initial accumulator state for XXH64(). + */ +XXH_FORCE_INLINE void +XXH64_initAccs(xxh_u64 *acc, xxh_u64 seed) +{ + XXH_ASSERT(acc != NULL); + acc[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + acc[1] = seed + XXH_PRIME64_2; + acc[2] = seed + 0; + acc[3] = seed - XXH_PRIME64_1; +} + +/*! + * @internal + * @brief Consumes a block of data for XXH64(). + * + * @return the end input pointer. + */ +XXH_FORCE_INLINE const xxh_u8 * +XXH64_consumeLong( + xxh_u64 *XXH_RESTRICT acc, + xxh_u8 const *XXH_RESTRICT input, + size_t len, + XXH_alignment align +) +{ + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 31; + XXH_ASSERT(acc != NULL); + XXH_ASSERT(input != NULL); + XXH_ASSERT(len >= 32); + do { + /* reroll on 32-bit */ + if (sizeof(void *) < sizeof(xxh_u64)) { + size_t i; + for (i = 0; i < 4; i++) { + acc[i] = XXH64_round(acc[i], XXH_get64bits(input)); + input += 8; + } + } else { + acc[0] = XXH64_round(acc[0], XXH_get64bits(input)); input += 8; + acc[1] = XXH64_round(acc[1], XXH_get64bits(input)); input += 8; + acc[2] = XXH64_round(acc[2], XXH_get64bits(input)); input += 8; + acc[3] = XXH64_round(acc[3], XXH_get64bits(input)); input += 8; + } + } while (input < limit); + + return input; +} + +/*! 
+ * @internal + * @brief Merges the accumulator lanes together for XXH64() + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u64 +XXH64_mergeAccs(const xxh_u64 *acc) +{ + XXH_ASSERT(acc != NULL); + { + xxh_u64 h64 = XXH_rotl64(acc[0], 1) + XXH_rotl64(acc[1], 7) + + XXH_rotl64(acc[2], 12) + XXH_rotl64(acc[3], 18); + /* reroll on 32-bit */ + if (sizeof(void *) < sizeof(xxh_u64)) { + size_t i; + for (i = 0; i < 4; i++) { + h64 = XXH64_mergeRound(h64, acc[i]); + } + } else { + h64 = XXH64_mergeRound(h64, acc[0]); + h64 = XXH64_mergeRound(h64, acc[1]); + h64 = XXH64_mergeRound(h64, acc[2]); + h64 = XXH64_mergeRound(h64, acc[3]); + } + return h64; + } +} + +/*! + * @internal + * @brief Processes the last 0-31 bytes of @p ptr. + * + * There may be up to 31 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 32. + * @param align Whether @p ptr is aligned. + * @return The finalized hash + * @see XXH32_finalize(). 
+ */ +XXH_STATIC XXH_PUREF xxh_u64 +XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ + if (ptr==NULL) XXH_ASSERT(len == 0); + len &= 31; + while (len >= 8) { + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); + ptr += 8; + hash ^= k1; + hash = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4; + len -= 8; + } + if (len >= 4) { + hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; + ptr += 4; + hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; + len -= 4; + } + while (len > 0) { + hash ^= (*ptr++) * XXH_PRIME64_5; + hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1; + --len; + } + return XXH64_avalanche(hash); +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1_64 XXH_PROCESS1_64 +# define PROCESS4_64 XXH_PROCESS4_64 +# define PROCESS8_64 XXH_PROCESS8_64 +#else +# undef XXH_PROCESS1_64 +# undef XXH_PROCESS4_64 +# undef XXH_PROCESS8_64 +#endif + +/*! + * @internal + * @brief The implementation for @ref XXH64(). + * + * @param input , len , seed Directly passed from @ref XXH64(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + xxh_u64 h64; + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=32) { /* Process a large block of data */ + xxh_u64 acc[4]; + XXH64_initAccs(acc, seed); + + input = XXH64_consumeLong(acc, input, len, align); + + h64 = XXH64_mergeAccs(acc); + } else { + h64 = seed + XXH_PRIME64_5; + } + + h64 += (xxh_u64) len; + + return XXH64_finalize(h64, input, len, align); +} + + +/*! 
@ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH64_family*/ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + XXH64_initAccs(statePtr->acc, seed); + return XXH_OK; +} + +/*! 
@ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + state->total_len += len; + + XXH_ASSERT(state->bufferedSize <= sizeof(state->buffer)); + if (len < sizeof(state->buffer) - state->bufferedSize) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + + { const xxh_u8* xinput = (const xxh_u8*)input; + const xxh_u8* const bEnd = xinput + len; + + if (state->bufferedSize) { /* non-empty buffer => complete first */ + XXH_memcpy(state->buffer + state->bufferedSize, xinput, sizeof(state->buffer) - state->bufferedSize); + xinput += sizeof(state->buffer) - state->bufferedSize; + /* and process one round */ + (void)XXH64_consumeLong(state->acc, state->buffer, sizeof(state->buffer), XXH_aligned); + state->bufferedSize = 0; + } + + XXH_ASSERT(xinput <= bEnd); + if ((size_t)(bEnd - xinput) >= sizeof(state->buffer)) { + /* Process the remaining data */ + xinput = XXH64_consumeLong(state->acc, xinput, (size_t)(bEnd - xinput), XXH_unaligned); + } + + if (xinput < bEnd) { + /* Copy the leftover to the tmp buffer */ + XXH_memcpy(state->buffer, xinput, (size_t)(bEnd-xinput)); + state->bufferedSize = (unsigned)(bEnd-xinput); + } + } + + return XXH_OK; +} + + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + h64 = XXH64_mergeAccs(state->acc); + } else { + h64 = state->acc[2] /*seed*/ + XXH_PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, state->buffer, (size_t)state->total_len, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! 
@ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + +#ifndef XXH_NO_XXH3 + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ +/*! + * @} + * @defgroup XXH3_impl XXH3 implementation + * @ingroup impl + * @{ + */ + +/* === Compiler specifics === */ + + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#ifndef XXH_HAS_INCLUDE +# ifdef __has_include +/* + * Not defined as XXH_HAS_INCLUDE(x) (function-like) because + * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion) + */ +# define XXH_HAS_INCLUDE __has_include +# else +# define XXH_HAS_INCLUDE(x) 0 +# endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(__ARM_FEATURE_SVE) +# include +# endif +# if defined(__ARM_NEON__) || defined(__ARM_NEON) \ + || (defined(_M_ARM) && _M_ARM >= 7) \ + || defined(_M_ARM64) || defined(_M_ARM64EC) \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE()) /* WASM SIMD128 via SIMDe */ +# define inline __inline__ /* circumvent a clang bug */ +# include +# undef inline +# elif defined(__AVX2__) +# include +# elif defined(__SSE2__) +# include +# elif defined(__loongarch_sx) +# include +# endif +#endif + +#if defined(_MSC_VER) 
+# include +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. + * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. 
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + * + * Credit: large sections of the vectorial and asm source code paths + * have been contributed by @easyaspi314 + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ + +#ifdef XXH_DOXYGEN +/*! + * @ingroup tuning + * @brief Overrides the vectorization implementation chosen for XXH3. + * + * Can be defined to 0 to disable SIMD or any of the values mentioned in + * @ref XXH_VECTOR_TYPE. + * + * If this is not defined, it uses predefined macros to determine the best + * implementation. + */ +# define XXH_VECTOR XXH_SCALAR +/*! + * @ingroup tuning + * @brief Selects the minimum alignment for XXH3's accumulators. + * + * When using SIMD, this should match the alignment required for said vector + * type, so, for example, 32 for AVX2. + * + * Default: Auto detected. 
+ */ +# define XXH_ACC_ALIGN 8 +#endif + +/* Actual definition */ +#ifndef XXH_DOXYGEN +#endif + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__ARM_FEATURE_SVE) +# define XXH_VECTOR XXH_SVE +# elif ( \ + defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ + || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE()) /* wasm simd128 via SIMDe */ \ + ) && ( \ + defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + ) +# define XXH_VECTOR XXH_NEON +# elif defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# elif defined(__loongarch_sx) +# define XXH_VECTOR XXH_LSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */ +#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE) +# ifdef _MSC_VER +# pragma warning(once : 4606) +# else +# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead." +# endif +# undef XXH_VECTOR +# define XXH_VECTOR XXH_SCALAR +#endif + +/* + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. 
+ */ +#ifndef XXH_ACC_ALIGN +# if defined(XXH_X86DISPATCH) +# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ +# elif XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_SVE /* sve */ +# define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_LSX /* lsx */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ + || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#elif XXH_VECTOR == XXH_SVE +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#else +# define XXH_SEC_ALIGN 8 +#endif + +#if defined(__GNUC__) || defined(__clang__) +# define XXH_ALIASING __attribute__((__may_alias__)) +#else +# define XXH_ALIASING /* nothing */ +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. 
+ */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + +#if XXH_VECTOR == XXH_NEON + +/* + * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3 + * optimizes out the entire hashLong loop because of the aliasing violation. + * + * However, GCC is also inefficient at load-store optimization with vld1q/vst1q, + * so the only option is to mark it as aliasing. + */ +typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING; + +/*! + * @internal + * @brief `vld1q_u64` but faster and alignment-safe. + * + * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only + * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86). + * + * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it + * prohibits load-store optimizations. Therefore, a direct dereference is used. + * + * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe + * unaligned load. + */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */ +{ + return *(xxh_aliasing_uint64x2_t const *)ptr; +} +#else +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) +{ + return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); +} +#endif + +/*! + * @internal + * @brief `vmlal_u32` on low and high halves of a vector. + * + * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with + * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32` + * with `vmlal_u32`. 
+ */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11 +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + /* Inline assembly is the only way */ + __asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs)); + return acc; +} +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + /* This intrinsic works as expected */ + return vmlal_high_u32(acc, lhs, rhs); +} +#else +/* Portable intrinsic versions */ +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs)); +} +/*! @copydoc XXH_vmlal_low_u32 + * Assume the compiler converts this to vmlal_high_u32 on aarch64 */ +XXH_FORCE_INLINE uint64x2_t +XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs) +{ + return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs)); +} +#endif + +/*! + * @ingroup tuning + * @brief Controls the NEON to scalar ratio for XXH3 + * + * This can be set to 2, 4, 6, or 8. + * + * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used. + * + * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those + * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU + * bandwidth. + * + * This is even more noticeable on the more advanced cores like the Cortex-A76 which + * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once. + * + * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes + * and 2 scalar lanes, which is chosen by default. + * + * This does not apply to Apple processors or 32-bit processors, which run better with + * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes. 
+ * + * This change benefits CPUs with large micro-op buffers without negatively affecting + * most other CPUs: + * + * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. | + * |:----------------------|:--------------------|----------:|-----------:|------:| + * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% | + * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% | + * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% | + * | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% | + * + * It also seems to fix some bad codegen on GCC, making it almost as fast as clang. + * + * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes meaning + * it effectively becomes worse 4. + * + * @see XXH3_accumulate_512_neon() + */ +# ifndef XXH3_NEON_LANES +# if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \ + && !defined(__APPLE__) && XXH_SIZE_OPT <= 0 +# define XXH3_NEON_LANES 6 +# else +# define XXH3_NEON_LANES XXH_ACC_NB +# endif +# endif +#endif /* XXH_VECTOR == XXH_NEON */ + +/* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. + * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ +#if XXH_VECTOR == XXH_VSX +/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`, + * and `pixel`. This is a problem for obvious reasons. + * + * These keywords are unnecessary; the spec literally says they are + * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd + * after including the header. + * + * We use pragma push_macro/pop_macro to keep the namespace clean. 
*/ +# pragma push_macro("bool") +# pragma push_macro("vector") +# pragma push_macro("pixel") +/* silence potential macro redefined warnings */ +# undef bool +# undef vector +# undef pixel + +# if defined(__s390x__) +# include +# else +# include +# endif + +/* Restore the original macro values, if applicable. */ +# pragma pop_macro("pixel") +# pragma pop_macro("vector") +# pragma pop_macro("bool") + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + +/* + * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue. + */ +typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING; + +# ifndef XXH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." +# define XXH_VSX_BE 1 +# else +# define XXH_VSX_BE 0 +# endif +# endif /* !defined(XXH_VSX_BE) */ + +# if XXH_VSX_BE +# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) +# define XXH_vec_revb vec_revb +# else +/*! + * A polyfill for POWER9's vec_revb(). + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) +{ + xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +# endif +# endif /* XXH_VSX_BE */ + +/*! + * Performs an unaligned vector load and byte swaps it on big endian. 
+ */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) +{ + xxh_u64x2 ret; + XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2)); +# if XXH_VSX_BE + ret = XXH_vec_revb(ret); +# endif + return ret; +} + +/* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a while, + * and they are endian dependent. Also, their meaning swap depending on version. + * */ +# if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ +# define XXH_vec_mulo vec_mulo +# define XXH_vec_mule vec_mule +# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__) +/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ + /* The IBM XL Compiler (which defined __clang__) only implements the vec_* operations */ +# define XXH_vec_mulo __builtin_altivec_vmulouw +# define XXH_vec_mule __builtin_altivec_vmuleuw +# else +/* gcc needs inline assembly */ +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. 
*/ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + +#if XXH_VECTOR == XXH_SVE +#define ACCRND(acc, offset) \ +do { \ + svuint64_t input_vec = svld1_u64(mask, xinput + offset); \ + svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \ + svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \ + svuint64_t swapped = svtbl_u64(input_vec, kSwap); \ + svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \ + svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \ + svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \ + acc = svadd_u64_x(mask, acc, mul); \ +} while (0) +#endif /* XXH_VECTOR == XXH_SVE */ + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if XXH_SIZE_OPT >= 1 +# define XXH_PREFETCH(ptr) (void)(ptr) +# elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ +# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum
XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/*! Pseudorandom secret taken directly from FARSH. */ +XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + +static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL; /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */ +static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL; /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */ + +#ifdef XXH_OLD_NAMES +# define kSecret XXH3_kSecret +#endif + +#ifdef XXH_DOXYGEN +/*! + * @brief Calculates a 32-bit to 64-bit long multiply. + * + * Implemented as a macro. 
+ * + * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we + * use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. + * + * @param x, y Numbers to be multiplied + * @return 64-bit product of the low 32 bits of @p x and @p y. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64(xxh_u64 x, xxh_u64 y) +{ + return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); +} +#elif defined(_MSC_VER) && defined(_M_IX86) +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. + */ +# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) +#endif + +/*! + * @brief Calculates a 64->128-bit long multiply. + * + * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar + * version. + * + * @param lhs , rhs The 64-bit integers to be multiplied + * @return The 128-bit result represented in an @ref XXH128_hash_t. + */ +static XXH128_hash_t +XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this type + * despite not having the arithmetic for it. 
This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; + XXH128_hash_t r128; + r128.low64 = (xxh_u64)(product); + r128.high64 = (xxh_u64)(product >> 64); + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to single operand MUL on x64. + */ +#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXH128_hash_t r128; + r128.low64 = product_low; + r128.high64 = product_high; + return r128; + + /* + * MSVC for ARM64's __umulh method. + * + * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method. + */ +#elif defined(_M_ARM64) || defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(__umulh) +#endif + XXH128_hash_t r128; + r128.low64 = lhs * rhs; + r128.high64 = __umulh(lhs, rhs); + return r128; + +#else + /* + * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. 
+ * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. 
*/ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/*! + * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + * + * @param lhs , rhs The 64-bit integers to multiply + * @return The low 64 bits of the product XOR'd by the high 64 bits. + * @see XXH_mult64to128() + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/*! Seems to produce slightly better code on GCC for some reason. */ +XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * This is a fast avalanche stage, + * suitable when input bits are already partially mixed + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= PRIME_MX1; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + +/* + * This is a stronger avalanche, + * inspired by Pelle Evensen's rrmxmx + * preferable when input has not been previously mixed + */ +static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) +{ + /* this mix is inspired by Pelle Evensen's rrmxmx */ + h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); + h64 *= PRIME_MX2; + h64 ^= (h64 >> 35) + len ; + h64 *= PRIME_MX2; + return XXH_xorshift64(h64, 28); +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. 
It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. + * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. 
+ */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + return XXH64_avalanche(keyed); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 const keyed = input64 ^ bitflip; + return XXH3_rrmxmx(keyed, len); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + 
xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. + * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. 
+ */ +XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) +{ +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + /* + * UGLY HACK: + * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in + * slower code. + * + * By forcing seed64 into a register, we disrupt the cost model and + * cause it to scalarize. See `XXH32_round()` + * + * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, + * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on + * GCC 9.2, despite both emitting scalar code. + * + * GCC generates much better scalar code than Clang for the rest of XXH3, + * which is why finding a more optimal codepath is an interest. + */ + XXH_COMPILER_GUARD(seed64); +#endif + { xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 const input_hi = XXH_readLE64(input+8); + return XXH3_mul128_fold64( + input_lo ^ (XXH_readLE64(secret) + seed64), + input_hi ^ (XXH_readLE64(secret+8) - seed64) + ); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * XXH_PRIME64_1; +#if XXH_SIZE_OPT >= 1 + /* Smaller and cleaner, but slightly slower. 
*/ + unsigned int i = (unsigned int)(len - 1) / 32; + do { + acc += XXH3_mix16B(input+16 * i, secret+32*i, seed); + acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed); + } while (i-- != 0); +#else + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc += XXH3_mix16B(input+48, secret+96, seed); + acc += XXH3_mix16B(input+len-64, secret+112, seed); + } + acc += XXH3_mix16B(input+32, secret+64, seed); + acc += XXH3_mix16B(input+len-48, secret+80, seed); + } + acc += XXH3_mix16B(input+16, secret+32, seed); + acc += XXH3_mix16B(input+len-32, secret+48, seed); + } + acc += XXH3_mix16B(input+0, secret+0, seed); + acc += XXH3_mix16B(input+len-16, secret+16, seed); +#endif + return XXH3_avalanche(acc); + } +} + +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + #define XXH3_MIDSIZE_STARTOFFSET 3 + #define XXH3_MIDSIZE_LASTOFFSET 17 + + { xxh_u64 acc = len * XXH_PRIME64_1; + xxh_u64 acc_end; + unsigned int const nbRounds = (unsigned int)len / 16; + unsigned int i; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + for (i=0; i<8; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); + } + /* last bytes */ + acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); + XXH_ASSERT(nbRounds >= 8); + acc = XXH3_avalanche(acc); +#if defined(__clang__) /* Clang */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. + * In everywhere else, it uses scalar code. 
+ * + * For 64->128-bit multiplies, even if the NEON was 100% optimal, it + * would still be slower than UMAAL (see XXH_mult64to128). + * + * Unfortunately, Clang doesn't handle the long multiplies properly and + * converts them to the nonexistent "vmulq_u64" intrinsic, which is then + * scalarized into an ugly mess of VMOV.32 instructions. + * + * This mess is difficult to avoid without turning autovectorization + * off completely, but they are usually relatively minor and/or not + * worth it to fix. + * + * This loop is the easiest to fix, as unlike XXH32, this pragma + * _actually works_ because it is a loop vectorization instead of an + * SLP vectorization. + */ + #pragma clang loop vectorize(disable) +#endif + for (i=8 ; i < nbRounds; i++) { + /* + * Prevents clang from unrolling the acc loop and interleaving with this one. + */ + XXH_COMPILER_GUARD(acc); + acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + } + return XXH3_avalanche(acc + acc_end); + } +} + + +/* ======= Long Keys ======= */ + +#define XXH_STRIPE_LEN 64 +#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ +#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) + +#ifdef XXH_OLD_NAMES +# define STRIPE_LEN XXH_STRIPE_LEN +# define ACC_NB XXH_ACC_NB +#endif + +#ifndef XXH_PREFETCH_DIST +# ifdef __clang__ +# define XXH_PREFETCH_DIST 320 +# else +# if (XXH_VECTOR == XXH_AVX512) +# define XXH_PREFETCH_DIST 512 +# else +# define XXH_PREFETCH_DIST 384 +# endif +# endif /* __clang__ */ +#endif /* XXH_PREFETCH_DIST */ + +/* + * This macro generates an XXH3_accumulate_<name>() function. + * Its argument selects the name suffix; the target attribute precedes the invocation. + * + * The name of this symbol is XXH3_accumulate_<name>() and it calls + * XXH3_accumulate_512_<name>(). + * + * It may be useful to hand implement this function if the compiler fails to + * optimize the inline function.
+ */ +#define XXH3_ACCUMULATE_TEMPLATE(name) \ +void \ +XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \ + const xxh_u8* XXH_RESTRICT input, \ + const xxh_u8* XXH_RESTRICT secret, \ + size_t nbStripes) \ +{ \ + size_t n; \ + for (n = 0; n < nbStripes; n++ ) { \ + const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \ + XXH_PREFETCH(in + XXH_PREFETCH_DIST); \ + XXH3_accumulate_512_##name( \ + acc, \ + in, \ + secret + n*XXH_SECRET_CONSUME_RATE); \ + } \ +} + + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + XXH_memcpy(dst, &v64, sizeof(v64)); +} + +/* Several intrinsic functions below are supposed to accept __int64 as argument, + * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . + * However, several environments do not define __int64 type, + * requiring a workaround. + */ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) + typedef int64_t xxh_i64; +#else + /* the following type must have a width of 64-bit */ + typedef long long xxh_i64; +#endif + + +/* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as the product. + * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. 
+ * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ + +#if (XXH_VECTOR == XXH_AVX512) \ + || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) + +#ifndef XXH_TARGET_AVX512 +# define XXH_TARGET_AVX512 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + __m512i* const xacc = (__m512i *) acc; + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + + { + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } +} +XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512) + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. 
+ * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + { __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */); + + /* xacc[0] *= XXH_PRIME32_1; */ + __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); + XXH_ASSERT(((size_t)customSecret & 63) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); + __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64); + __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos); + + const __m512i* const src = (const __m512i*) ((const void*) 
XXH3_kSecret); + __m512i* const dest = ( __m512i*) customSecret; + int i; + XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 63) == 0); + for (i=0; i < nbRounds; ++i) { + dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_AVX2) \ + || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) + +#ifndef XXH_TARGET_AVX2 +# define XXH_TARGET_AVX2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2) + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); + (void)(&XXH_writeLE64); + XXH_PREFETCH(customSecret); + { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64); + + const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret); + __m256i* dest = ( __m256i*) customSecret; + +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + XXH_COMPILER_GUARD(dest); +# endif + XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 31) == 0); + + /* GCC -O2 need unroll loop manually */ + dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed); + dest[1] = 
_mm256_add_epi64(_mm256_load_si256(src+1), seed); + dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed); + dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed); + dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed); + dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed); + } +} + +#endif + +/* x86dispatch always generates SSE2 */ +#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) + +#ifndef XXH_TARGET_SSE2 +# define XXH_TARGET_SSE2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); + +# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ + XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; + __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); +# else + __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); +# endif + int i; + + const void* const src16 = XXH3_kSecret; + __m128i* dst16 = (__m128i*) customSecret; +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + XXH_COMPILER_GUARD(dst16); +# endif + XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ + 
XXH_ASSERT(((size_t)dst16 & 15) == 0); + + for (i=0; i < nbRounds; ++i) { + dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_NEON) + +/* forward declarations for the scalar routines */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, size_t lane); + +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, size_t lane); + +/*! + * @internal + * @brief The bulk processing loop for NEON and WASM SIMD128. + * + * The NEON code path is actually partially scalar when running on AArch64. This + * is to optimize the pipelining and can have up to 15% speedup depending on the + * CPU, and it also mitigates some GCC codegen issues. + * + * @see XXH3_NEON_LANES for configuring this and details about this optimization. + * + * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit + * integers instead of the other platforms which mask full 64-bit vectors, + * so the setup is more complicated than just shifting right. + * + * Additionally, there is an optimization for 4 lanes at once noted below. + * + * Since, as stated, the most optimal amount of lanes for Cortexes is 6, + * there needs to be *three* versions of the accumulate operation used + * for the remaining 2 lanes. + * + * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap + * nearly perfectly. 
+ */ + +XXH_FORCE_INLINE void +XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0); + { /* GCC for darwin arm64 does not like aliasing here */ + xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ + uint8_t const* xinput = (const uint8_t *) input; + uint8_t const* xsecret = (const uint8_t *) secret; + + size_t i; +#ifdef __wasm_simd128__ + /* + * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret + * is constant propagated, which results in it converting it to this + * inside the loop: + * + * a = v128.load(XXH3_kSecret + 0 + $secret_offset, offset = 0) + * b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0) + * ... + * + * This requires a full 32-bit address immediate (and therefore a 6 byte + * instruction) as well as an add for each offset. + * + * Putting an asm guard prevents it from folding (at the cost of losing + * the alignment hint), and uses the free offset in `v128.load` instead + * of adding secret_offset each time which overall reduces code size by + * about a kilobyte and improves performance. + */ + XXH_COMPILER_GUARD(xsecret); +#endif + /* Scalar lanes use the normal scalarRound routine */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } + i = 0; + /* 4 NEON lanes at a time. 
*/ + for (; i+1 < XXH3_NEON_LANES / 2; i+=2) { + /* data_vec = xinput[i]; */ + uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput + (i * 16)); + uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput + ((i+1) * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec_1 = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t key_vec_2 = XXH_vld1q_u64(xsecret + ((i+1) * 16)); + /* data_swap = swap(data_vec) */ + uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1); + uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1); + uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2); + + /* + * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a + * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to + * get one vector with the low 32 bits of each lane, and one vector + * with the high 32 bits of each lane. + * + * The intrinsic returns a double vector because the original ARMv7-a + * instruction modified both arguments in place. AArch64 and SIMD128 emit + * two instructions from this intrinsic. + * + * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ] + * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ] + */ + uint32x4x2_t unzipped = vuzpq_u32( + vreinterpretq_u32_u64(data_key_1), + vreinterpretq_u32_u64(data_key_2) + ); + /* data_key_lo = data_key & 0xFFFFFFFF */ + uint32x4_t data_key_lo = unzipped.val[0]; + /* data_key_hi = data_key >> 32 */ + uint32x4_t data_key_hi = unzipped.val[1]; + /* + * Then, we can split the vectors horizontally and multiply which, as for most + * widening intrinsics, have a variant that works on both high half vectors + * for free on AArch64. A similar instruction is available on SIMD128. 
+ * + * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi + */ + uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi); + uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi); + /* + * Clang reorders + * a += b * c; // umlal swap.2d, dkl.2s, dkh.2s + * c += a; // add acc.2d, acc.2d, swap.2d + * to + * c += a; // add acc.2d, acc.2d, swap.2d + * c += b * c; // umlal acc.2d, dkl.2s, dkh.2s + * + * While it would make sense in theory since the addition is faster, + * for reasons likely related to umlal being limited to certain NEON + * pipelines, this is worse. A compiler guard fixes this. + */ + XXH_COMPILER_GUARD_CLANG_NEON(sum_1); + XXH_COMPILER_GUARD_CLANG_NEON(sum_2); + /* xacc[i] = acc_vec + sum; */ + xacc[i] = vaddq_u64(xacc[i], sum_1); + xacc[i+1] = vaddq_u64(xacc[i+1], sum_2); + } + /* Operate on the remaining NEON lanes 2 at a time. */ + for (; i < XXH3_NEON_LANES / 2; i++) { + /* data_vec = xinput[i]; */ + uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key = veorq_u64(data_vec, key_vec); + /* For two lanes, just use VMOVN and VSHRN. 
*/ + /* data_key_lo = data_key & 0xFFFFFFFF; */ + uint32x2_t data_key_lo = vmovn_u64(data_key); + /* data_key_hi = data_key >> 32; */ + uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32); + /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */ + uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi); + /* Same Clang workaround as before */ + XXH_COMPILER_GUARD_CLANG_NEON(sum); + /* xacc[i] = acc_vec + sum; */ + xacc[i] = vaddq_u64 (xacc[i], sum); + } + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + + size_t i; + /* WASM uses operator overloads and doesn't need these. */ +#ifndef __wasm_simd128__ + /* { prime32_1, prime32_1 } */ + uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1); + /* { 0, prime32_1, 0, prime32_1 } */ + uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32)); +#endif + + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } + for (i=0; i < XXH3_NEON_LANES / 2; i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64(acc_vec, 47); + uint64x2_t data_vec = veorq_u64(acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, key_vec); + /* xacc[i] *= XXH_PRIME32_1 */ +#ifdef __wasm_simd128__ + /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */ + xacc[i] = data_key * XXH_PRIME32_1; +#else + /* + * Expanded version with portable NEON intrinsics + * + * lo(x) * lo(y) + (hi(x) * lo(y) << 32) + * + * prod_hi = hi(data_key) * lo(prime) << 32 + * + 
* Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector + * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits + * and avoid the shift. + */ + uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi); + /* Extract low bits for vmlal_u32 */ + uint32x2_t data_key_lo = vmovn_u64(data_key); + /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */ + xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo); +#endif + } + } +} +#endif + +#if (XXH_VECTOR == XXH_VSX) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* presumed aligned */ + xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; + xxh_u8 const* const xinput = (xxh_u8 const*) input; /* no alignment restriction */ + xxh_u8 const* const xsecret = (xxh_u8 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + /* acc_vec = xacc[i]; */ + xxh_u64x2 acc_vec = xacc[i]; + acc_vec += product; + + /* swap high and low halves */ +#ifdef __s390x__ + acc_vec += vec_permi(data_vec, data_vec, 2); +#else + acc_vec += vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] = acc_vec; + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const 
void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; + const xxh_u8* const xsecret = (const xxh_u8*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= XXH_PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_SVE) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_sve( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc); + ACCRND(vacc, 0); + svst1_u64(mask, xacc, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = 
svld1_u64(mask, xacc + 6); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } +} + +XXH_FORCE_INLINE void +XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes) +{ + if (nbStripes != 0) { + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc + 0); + do { + /* svprfd(svbool_t, void *, enum svfprop); */ + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(vacc, 0); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = 
svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } + } +} + +#endif + +#if (XXH_VECTOR == XXH_LSX) +#define _LSX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_lsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + __m128i* const xacc = (__m128i *) acc; + const __m128i* const xinput = (const __m128i *) input; + const __m128i* const xsecret = (const __m128i *) secret; + + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = __lsx_vld(xinput + i, 0); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = __lsx_vld(xsecret + i, 0); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = __lsx_vxor_v(data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32); + // __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = __lsx_vmulwev_d_wu(data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = __lsx_vshuf4i_w(data_vec, _LSX_SHUFFLE(1, 0, 3, 2)); + __m128i const sum = __lsx_vadd_d(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = __lsx_vadd_d(product, sum); + } + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lsx) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_lsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + __m128i* const xacc = (__m128i*) acc; + const __m128i* const xsecret = (const 
__m128i *) secret; + const __m128i prime32 = __lsx_vreplgr2vr_w((int)XXH_PRIME32_1); + + for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = __lsx_vsrli_d(acc_vec, 47); + __m128i const data_vec = __lsx_vxor_v(acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = __lsx_vld(xsecret + i, 0); + __m128i const data_key = __lsx_vxor_v(data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m128i const data_key_hi = __lsx_vsrli_d(data_key, 32); + __m128i const prod_lo = __lsx_vmulwev_d_wu(data_key, prime32); + __m128i const prod_hi = __lsx_vmulwev_d_wu(data_key_hi, prime32); + xacc[i] = __lsx_vadd_d(prod_lo, __lsx_vslli_d(prod_hi, 32)); + } + } +} + +#endif + +/* scalar variants - universal */ + +#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) +/* + * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they + * emit an excess mask and a full 64-bit multiply-add (MADD X-form). + * + * While this might not seem like much, as AArch64 is a 64-bit architecture, only + * big Cortex designs have a full 64-bit multiplier. + * + * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit + * multiplies expand to 2-3 multiplies in microcode. This has a major penalty + * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline. + * + * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does + * not have this penalty and does the mask automatically. 
+ */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) +{ + xxh_u64 ret; + /* note: %x = 64-bit register, %w = 32-bit register */ + __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc)); + return ret; +} +#else +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) +{ + return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc; +} +#endif + +/*! + * @internal + * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. + */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* xacc = (xxh_u64*) acc; + xxh_u8 const* xinput = (xxh_u8 const*) input; + xxh_u8 const* xsecret = (xxh_u8 const*) secret; + XXH_ASSERT(lane < XXH_ACC_NB); + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + { + xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); + xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ + xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]); + } +} + +/*! + * @internal + * @brief Processes a 64 byte block of data using the scalar path. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + size_t i; + /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. 
*/ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__arm__) || defined(__thumb2__)) \ + && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \ + && XXH_SIZE_OPT <= 0 +# pragma GCC unroll 8 +#endif + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) + +/*! + * @internal + * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. + */ +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + XXH_ASSERT(lane < XXH_ACC_NB); + { + xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8); + xxh_u64 acc64 = xacc[lane]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= XXH_PRIME32_1; + xacc[lane] = acc64; + } +} + +/*! + * @internal + * @brief Scrambles the accumulators after a large chunk has been read + */ +XXH_FORCE_INLINE void +XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + size_t i; + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } +} + +XXH_FORCE_INLINE void +XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + /* + * We need a separate pointer for the hack below, + * which requires a non-const pointer. + * Any decent compiler will optimize this out otherwise. 
+ */ + const xxh_u8* kSecretPtr = XXH3_kSecret; + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__GNUC__) && defined(__aarch64__) + /* + * UGLY HACK: + * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), it fights for bandwidth with + * the arithmetic instructions. + * + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes the compiler to assume + * that XXH3_kSecretPtr has been changed), the pipelines are used more + * efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * + * See XXH3_NEON_LANES for details on the pipsline. + * + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + XXH_COMPILER_GUARD(kSecretPtr); +#endif + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes the compiler to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. 
+ */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); + XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); + } } +} + + +typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t); +typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); +typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); + + +#if (XXH_VECTOR == XXH_AVX512) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx512 +#define XXH3_accumulate XXH3_accumulate_avx512 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 + +#elif (XXH_VECTOR == XXH_AVX2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx2 +#define XXH3_accumulate XXH3_accumulate_avx2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 + +#elif (XXH_VECTOR == XXH_SSE2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_sse2 +#define XXH3_accumulate XXH3_accumulate_sse2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 + +#elif (XXH_VECTOR == XXH_NEON) + +#define XXH3_accumulate_512 XXH3_accumulate_512_neon +#define XXH3_accumulate XXH3_accumulate_neon +#define XXH3_scrambleAcc XXH3_scrambleAcc_neon +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_VSX) + +#define XXH3_accumulate_512 XXH3_accumulate_512_vsx +#define XXH3_accumulate XXH3_accumulate_vsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_SVE) +#define XXH3_accumulate_512 XXH3_accumulate_512_sve +#define XXH3_accumulate XXH3_accumulate_sve +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_LSX) 
+#define XXH3_accumulate_512 XXH3_accumulate_512_lsx +#define XXH3_accumulate XXH3_accumulate_lsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_lsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#else /* scalar */ + +#define XXH3_accumulate_512 XXH3_accumulate_512_scalar +#define XXH3_accumulate XXH3_accumulate_scalar +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#endif + +#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */ +# undef XXH3_initCustomSecret +# define XXH3_initCustomSecret XXH3_initCustomSecret_scalar +#endif + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; + size_t const nb_blocks = (len - 1) / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + f_acc(acc, input + n*block_len, secret, nbStripesPerBlock); + f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > XXH_STRIPE_LEN); + { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + f_acc(acc, input + nb_blocks*block_len, secret, nbStripes); + + /* last stripe */ + { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; +#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ + XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return 
XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH_PUREF XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. + * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + XXH_COMPILER_GUARD(result64); +#endif + } + + return XXH3_avalanche(result64); +} + +/* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + +static XXH_PUREF XXH64_hash_t +XXH3_finalizeLong_64b(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 len) +{ + return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, len * XXH_PRIME64_1); +} + +#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ + XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, + const void* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return 
XXH3_finalizeLong_64b(acc, (const xxh_u8*)secret, (xxh_u64)len); +} + +/* + * It's important for performance to transmit secret's size (when it's static) + * so that the compiler can properly optimize the vectorized loop. + * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set. + * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE + * breaks -Og, this is XXH_NO_INLINE. + */ +XXH3_WITH_SECRET_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * It's preferable for performance that XXH3_hashLong is not inlined, + * as it results in a smaller function for small data, easier to the instruction cache. + * Note that inside this no_inline function, we do inline the internal loop, + * and provide a statically defined secret size to allow optimization of vector loop. + */ +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default XXH3_kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. 
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
+                                    XXH64_hash_t seed,
+                                    XXH3_f_accumulate f_acc,
+                                    XXH3_f_scrambleAcc f_scramble,
+                                    XXH3_f_initCustomSecret f_initSec)
+{
+#if XXH_SIZE_OPT <= 0
+    /* seed == 0: skip building a custom secret and hash with XXH3_kSecret directly */
+    if (seed == 0)
+        return XXH3_hashLong_64b_internal(input, len,
+                                          XXH3_kSecret, sizeof(XXH3_kSecret),
+                                          f_acc, f_scramble);
+#endif
+    /* otherwise build a seed-specific secret on the stack and hash with it */
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+        f_initSec(secret, seed);
+        return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
+                                          f_acc, f_scramble);
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ * The secret/secretLen arguments are ignored; the secret is derived from the seed.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)secret; (void)secretLen;
+    return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+
+/* Common signature of the XXH3_hashLong_64b_* variants, used to select the long-input path. */
+typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,
+                                          XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);
+
+/*
+ * Length dispatcher for the 64-bit one-shot API:
+ *   len <= 16                -> XXH3_len_0to16_64b
+ *   len <= 128               -> XXH3_len_17to128_64b
+ *   len <= XXH3_MIDSIZE_MAX  -> XXH3_len_129to240_64b
+ *   otherwise                -> f_hashLong
+ * f_hashLong is only called on the last path.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
+                     XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                     XXH3_hashLong64_f f_hashLong)
+{
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secretLen` condition is not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     * Also, note that function signature doesn't offer room to return an error.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
+    if (len <= 128)
+        return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);
+}
+
+
+/* ===   Public entry point   === */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length)
+{
+    return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
+{
+    return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
+}
+
+/*
+ * Inputs up to XXH3_MIDSIZE_MAX bytes are hashed with `seed` and the default
+ * XXH3_kSecret (the NULL long-hash callback is never reached on that path);
+ * longer inputs are hashed with the caller's secret.
+ */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (length <= XXH3_MIDSIZE_MAX)
+        return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize);
+}
+
+
+/* ===   XXH3 streaming   === */
+#ifndef XXH_NO_STREAM
+/*
+ * Malloc's a pointer that is always aligned to @align.
+ *
+ * This must be freed with `XXH_alignedFree()`.
+ *
+ * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
+ * alignment on 32-bit. 
This isn't enough for the 32 byte aligned loads in AVX2
+ * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.
+ *
+ * That under-alignment once caused an obvious crash which stayed unnoticed
+ * because XXH3_createState() was not actually being tested.
+ * Credit to RedSpah for noticing this bug.
+ *
+ * Alignment is done by hand rather than through posix_memalign or
+ * _mm_malloc: a portable fallback like this one would be needed anyway, and
+ * library functions cannot be probed for without external build tools.
+ *
+ * Method: over-allocate by `align` bytes, round the pointer up to the next
+ * multiple of `align`, and record the distance back to the raw allocation in
+ * the byte just before the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ * Returns NULL when the underlying XXH_malloc() fails.
+ */
+static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    xxh_u8* raw;
+    xxh_u8* aligned;
+    size_t shift;
+
+    XXH_ASSERT(align >= 8 && align <= 128);  /* range check */
+    XXH_ASSERT((align & (align-1)) == 0);    /* power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));   /* empty/overflow */
+
+    /* Extra `align` bytes leave room for realignment plus the offset byte */
+    raw = (xxh_u8*)XXH_malloc(s + align);
+    if (raw == NULL)
+        return NULL;
+
+    /*
+     * shift is in [1, align]: an already-aligned raw pointer is still moved
+     * by a full `align`, so there is always a byte available at aligned[-1].
+     */
+    shift = align - ((size_t)raw & (align - 1));
+    aligned = raw + shift;
+    XXH_ASSERT((size_t)aligned % align == 0);
+
+    /* Remember how far back the raw allocation starts. */
+    aligned[-1] = (xxh_u8)shift;
+    return aligned;
+}
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. 
+ */
+static void XXH_alignedFree(void* p)
+{
+    xxh_u8* const aligned = (xxh_u8*)p;
+    if (aligned == NULL)
+        return;
+    /*
+     * The byte just before the aligned pointer holds the distance back to
+     * the raw allocation (written by XXH_alignedMalloc()); step back by it
+     * and release the raw pointer.
+     */
+    XXH_free(aligned - aligned[-1]);
+}
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Allocates a new @ref XXH3_state_t with 64-byte alignment.
+ *
+ * @return The newly allocated state on success.
+ * @return `NULL` if the allocation failed.
+ *
+ * @note Release it with XXH3_freeState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    XXH3_state_t* const newState = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+    if (newState != NULL) {
+        XXH3_INITSTATE(newState);
+    }
+    return newState;
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Releases an @ref XXH3_state_t.
+ *
+ * @param statePtr A state obtained from @ref XXH3_createState(); `NULL` is accepted.
+ *
+ * @return @ref XXH_OK, always.
+ *
+ * @note The state must have been allocated with XXH3_createState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    XXH_alignedFree(statePtr);
+    return XXH_OK;
+}
+
+/*! 
@ingroup XXH3_family */
+/*! @brief Copies the complete streaming state @p src_state into @p dst_state. */
+XXH_PUBLIC_API void
+XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
+{
+    XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
+}
+
+/*
+ * Shared reset logic for all XXH3 streaming reset functions.
+ *
+ * Zeroes the members from bufferedSize up to (excluding) nbStripesPerBlock,
+ * reloads the 8 accumulator lanes with their initial constants, records the
+ * seed and the external secret pointer, and derives secretLimit and
+ * nbStripesPerBlock from secretSize.
+ *
+ * Preconditions (asserted, not checked in release builds):
+ * statePtr != NULL and secretSize >= XXH3_SECRET_SIZE_MIN.
+ * Callers must validate user-supplied values before calling.
+ */
+static void
+XXH3_reset_internal(XXH3_state_t* statePtr,
+                    XXH64_hash_t seed,
+                    const void* secret, size_t secretSize)
+{
+    size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
+    size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
+    XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
+    XXH_ASSERT(statePtr != NULL);
+    /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
+    memset((char*)statePtr + initStart, 0, initLength);
+    statePtr->acc[0] = XXH_PRIME32_3;
+    statePtr->acc[1] = XXH_PRIME64_1;
+    statePtr->acc[2] = XXH_PRIME64_2;
+    statePtr->acc[3] = XXH_PRIME64_3;
+    statePtr->acc[4] = XXH_PRIME64_4;
+    statePtr->acc[5] = XXH_PRIME32_2;
+    statePtr->acc[6] = XXH_PRIME64_5;
+    statePtr->acc[7] = XXH_PRIME32_1;
+    statePtr->seed = seed;
+    statePtr->useSeed = (seed != 0);
+    statePtr->extSecret = (const unsigned char*)secret;
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Resets @p statePtr for unseeded hashing with the default secret.
+ * @return @ref XXH_ERROR if @p statePtr is NULL, @ref XXH_OK otherwise.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Resets @p statePtr to hash with a caller-provided secret (seed 0).
+ *
+ * @return @ref XXH_ERROR if @p statePtr or @p secret is NULL, or if
+ *         @p secretSize is below XXH3_SECRET_SIZE_MIN; the state is left
+ *         untouched in that case.
+ * @return @ref XXH_OK otherwise.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    /*
+     * Validate before touching the state: XXH3_reset_internal() asserts
+     * secretSize >= XXH3_SECRET_SIZE_MIN and computes
+     * secretSize - XXH_STRIPE_LEN, so calling it first would abort in
+     * assert-enabled builds (instead of returning XXH_ERROR) and store an
+     * underflowed secretLimit otherwise.
+     */
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, secret, secretSize);
+    return XXH_OK;
+}
+
+/*! 
@ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (seed==0) return XXH3_64bits_reset(statePtr);
+    /*
+     * Regenerate the custom secret only when it may not match `seed`:
+     * the seed changed, or the state was last set up with an external secret.
+     */
+    if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))
+        XXH3_initCustomSecret(statePtr->customSecret, seed);
+    XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, seed64, secret, secretSize);
+    statePtr->useSeed = 1; /* always, even if seed64==0 */
+    return XXH_OK;
+}
+
+/*!
+ * @internal
+ * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
+ *
+ * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block.
+ *
+ * @param acc                Pointer to the 8 accumulator lanes
+ * @param nbStripesSoFarPtr  In/out pointer to the number of stripes already consumed in the current block
+ * @param nbStripesPerBlock  Number of stripes in a block
+ * @param input              Input pointer
+ * @param nbStripes          Number of stripes to process
+ * @param secret             Secret pointer
+ * @param secretLimit        Offset of the last block in @p secret
+ * @param f_acc              Pointer to an XXH3_accumulate implementation
+ * @param f_scramble         Pointer to an XXH3_scrambleAcc implementation
+ * @return                   Pointer past the end of @p input after processing
+ */
+XXH_FORCE_INLINE const xxh_u8 *
+XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
+                    size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
+                    const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
+                    const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
+                    XXH3_f_accumulate f_acc,
+                    XXH3_f_scrambleAcc f_scramble)
+{
+    /* resume in the secret where the current block left off */
+    const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;
+    /* Process full blocks */
+    if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
+        /* Process the initial partial block... */
+        size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;
+
+        do {
+            /* Accumulate and scramble */
+            f_acc(acc, input, initialSecret, nbStripesThisIter);
+            f_scramble(acc, secret + secretLimit);
+            input += nbStripesThisIter * XXH_STRIPE_LEN;
+            nbStripes -= nbStripesThisIter;
+            /* Then continue the loop with the full block size */
+            nbStripesThisIter = nbStripesPerBlock;
+            initialSecret = secret;
+        } while (nbStripes >= nbStripesPerBlock);
+        *nbStripesSoFarPtr = 0;
+    }
+    /* Process a partial block */
+    if (nbStripes > 0) {
+        f_acc(acc, input, initialSecret, nbStripes);
+        input += nbStripes * XXH_STRIPE_LEN;
+        *nbStripesSoFarPtr += nbStripes;
+    }
+    /* Return end pointer */
+    return input;
+}
+
+#ifndef XXH3_STREAM_USE_STACK
+# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */
+#   define XXH3_STREAM_USE_STACK 1
+# endif
+#endif
+/*
+ * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
+ *
+ * Input that fits in the remaining internal buffer is only buffered.
+ * Otherwise the buffer is completed and consumed, further whole stripes are
+ * consumed straight from `input`, and the tail (always at least one byte)
+ * is buffered for the next update or digest.
+ * A NULL `input` is accepted (asserted to come with len == 0) and is a no-op.
+ * The visible code only ever returns XXH_OK.
+ */
+XXH_FORCE_INLINE XXH_errorcode
+XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
+            const xxh_u8* XXH_RESTRICT input, size_t len,
+            XXH3_f_accumulate f_acc,
+            XXH3_f_scrambleAcc f_scramble)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    XXH_ASSERT(state != NULL);
+    {   const xxh_u8* const bEnd = input + len;
+        const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* For some reason, gcc and MSVC seem to suffer greatly
+         * when operating accumulators directly into state.
+         * Operating into stack space seems to enable proper optimization.
+         * clang, on the other hand, doesn't seem to need this trick */
+        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
+        XXH_memcpy(acc, state->acc, sizeof(acc));
+#else
+        xxh_u64* XXH_RESTRICT const acc = state->acc;
+#endif
+        state->totalLen += len;
+        XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
+
+        /* small input : just fill in tmp buffer */
+        if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) {
+            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+            state->bufferedSize += (XXH32_hash_t)len;
+            return XXH_OK;
+        }
+
+        /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+        #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0);   /* clean multiple */
+
+        /*
+         * Internal buffer is partially filled (always, except at beginning)
+         * Complete it, then consume it.
+         */
+        if (state->bufferedSize) {
+            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+            input += loadSize;
+            XXH3_consumeStripes(acc,
+                               &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+                                secret, state->secretLimit,
+                                f_acc, f_scramble);
+            state->bufferedSize = 0;
+        }
+        XXH_ASSERT(input < bEnd);
+        if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
+            /* (bEnd - 1 - input): always leave at least one byte unconsumed for the buffer */
+            size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
+            input = XXH3_consumeStripes(acc,
+                                       &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                       input, nbStripes,
+                                       secret, state->secretLimit,
+                                       f_acc, f_scramble);
+            /* keep the last consumed stripe at the end of the buffer; XXH3_digest_long() reads it back */
+            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+
+        }
+        /* Some remaining input (always) : buffer it */
+        XXH_ASSERT(input < bEnd);
+        XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
+        XXH_ASSERT(state->bufferedSize == 0);
+        XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+        state->bufferedSize = (XXH32_hash_t)(bEnd-input);
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* save stack accumulators into state */
+        XXH_memcpy(state->acc, acc, sizeof(acc));
+#endif
+    }
+
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len,
+                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+
+/*
+ * Computes the final accumulator for a long streamed input into `acc`
+ * without modifying `state`: consumes the stripes still held in the
+ * internal buffer, then accumulates the last XXH_STRIPE_LEN bytes of input.
+ */
+XXH_FORCE_INLINE void
+XXH3_digest_long (XXH64_hash_t* acc,
+                  const XXH3_state_t* state,
+                  const unsigned char* secret)
+{
+    xxh_u8 lastStripe[XXH_STRIPE_LEN];
+    const xxh_u8* lastStripePtr;
+
+    /*
+     * Digest on a local copy. This way, the state remains unaltered, and it can
+     * continue ingesting more input afterwards.
+     */
+    XXH_memcpy(acc, state->acc, sizeof(state->acc));
+    if (state->bufferedSize >= XXH_STRIPE_LEN) {
+        /* Consume remaining stripes then point to remaining data in buffer */
+        size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
+        size_t nbStripesSoFar = state->nbStripesSoFar;
+        XXH3_consumeStripes(acc,
+                           &nbStripesSoFar, state->nbStripesPerBlock,
+                            state->buffer, nbStripes,
+                            secret, state->secretLimit,
+                            XXH3_accumulate, XXH3_scrambleAcc);
+        lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN;
+    } else {  /* bufferedSize < XXH_STRIPE_LEN */
+        /* Copy to temp buffer */
+        /* the missing head of the last stripe comes from the end of the buffer */
+        size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
+        XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
+        XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
+        XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+        lastStripePtr = lastStripe;
+    }
+    /* Last stripe */
+    XXH3_accumulate_512(acc,
+                        lastStripePtr,
+                        secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+}
+
+/*! 
@ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
+{
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        return XXH3_finalizeLong_64b(acc, secret, (xxh_u64)state->totalLen);
+    }
+    /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
+    /* the whole input is still in state->buffer, so use the one-shot functions */
+    if (state->useSeed)
+        return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                  secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+#endif /* !XXH_NO_STREAM */
+
+
+/* ==========================================
+ * XXH3 128 bits (a.k.a XXH128)
+ * ==========================================
+ * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
+ * even without counting the significantly larger output size.
+ *
+ * For example, extra steps are taken to avoid the seed-dependent collisions
+ * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
+ *
+ * This strength naturally comes at the cost of some speed, especially on short
+ * lengths. Note that longer hashes are about as fast as the 64-bit version
+ * due to it using only a slight modification of the 64-bit loop.
+ *
+ * XXH128 is also more oriented towards 64-bit machines. It is still extremely
+ * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
+ */
+
+/* 128-bit hash of a 1 to 3 byte input. */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    /* A doubled version of 1to3_64b with different constants. */
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * Bytes listed from least to most significant:
+     * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
+     * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
+     * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u8 const c1 = input[0];
+        xxh_u8 const c2 = input[len >> 1];
+        xxh_u8 const c3 = input[len - 1];
+        xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)
+                                | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
+        xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
+        xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+        xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
+        xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
+        xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
+        XXH128_hash_t h128;
+        h128.low64  = XXH64_avalanche(keyed_lo);
+        h128.high64 = XXH64_avalanche(keyed_hi);
+        return h128;
+    }
+}
+
+/* 128-bit hash of a 4 to 8 byte input; the two 32-bit reads overlap when len < 8. */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+    {   xxh_u32 const input_lo = XXH_readLE32(input);
+        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
+        xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
+        xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
+        xxh_u64 const keyed = input_64 ^ bitflip;
+
+        /* Shift len to the left to ensure it is even, this avoids even multiplies. */
+        XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
+
+        m128.high64 += (m128.low64 << 1);
+        m128.low64  ^= (m128.high64 >> 3);
+
+        m128.low64   = XXH_xorshift64(m128.low64, 35);
+        m128.low64  *= PRIME_MX2;
+        m128.low64   = XXH_xorshift64(m128.low64, 28);
+        m128.high64  = XXH3_avalanche(m128.high64);
+        return m128;
+    }
+}
+
+/* 128-bit hash of a 9 to 16 byte input; the two 64-bit reads overlap when len < 16. */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
+        xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
+        xxh_u64 const input_lo = XXH_readLE64(input);
+        xxh_u64       input_hi = XXH_readLE64(input + len - 8);
+        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
+        /*
+         * Put len in the middle of m128 to ensure that the length gets mixed to
+         * both the low and high bits in the 128x64 multiply below.
+         */
+        m128.low64 += (xxh_u64)(len - 1) << 54;
+        input_hi   ^= bitfliph;
+        /*
+         * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+         * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
+         * the high 64 bits of m128.
+         *
+         * The best approach to this operation is different on 32-bit and 64-bit.
+         */
+        if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
+            /*
+             * 32-bit optimized version, which is more readable.
+             *
+             * On 32-bit, it removes an ADC and delays a dependency between the two
+             * halves of m128.high64, but it generates an extra mask on 64-bit.
+             */
+            m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
+        } else {
+            /*
+             * 64-bit optimized (albeit more confusing) version.
+             *
+             * Uses some properties of addition and multiplication to remove the mask:
+             *
+             * Let:
+             *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
+             *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
+             *    c = XXH_PRIME32_2
+             *
+             * The 32-bit branch above computes:
+             *    b + (a * c)
+             * Inverse Property: x + y - x == y
+             *    b + (a * (1 + c - 1))
+             * Distributive Property: x * (y + z) == (x * y) + (x * z)
+             *    b + (a * 1) + (a * (c - 1))
+             * Identity Property: x * 1 == x
+             *    b + a + (a * (c - 1))
+             *
+             * Substitute a, b, and c:
+             *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             *
+             * Since input_hi.hi + input_hi.lo == input_hi, we get this:
+             *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             */
+            m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
+        }
+        /* m128 ^= XXH_swap64(m128 >> 64); */
+        m128.low64  ^= XXH_swap64(m128.high64);
+
+        {   /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
+            XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
+            h128.high64 += m128.high64 * XXH_PRIME64_2;
+
+            h128.low64   = XXH3_avalanche(h128.low64);
+            h128.high64  = XXH3_avalanche(h128.high64);
+            return h128;
+    }   }
+}
+
+/*
+ * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
+ *
+ * Dispatches inputs of 0 to 16 bytes to the matching short routine;
+ * the empty input is hashed from the seed and secret bytes 64..95 only.
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    {   if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
+        if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
+        if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
+        {   XXH128_hash_t h128;
+            xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
+            xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
+            h128.low64 = XXH64_avalanche(seed ^ bitflipl);
+            h128.high64 = XXH64_avalanche( seed ^ bitfliph);
+            return h128;
+    }   }
+}
+
+/*
+ * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
+              const xxh_u8* secret, XXH64_hash_t seed)
+{
+    /* each half mixes one 16-byte input, then is XORed with the raw words of the other */
+    acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
+    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
+    acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
+    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
+    return acc;
+}
+
+
+/*
+ * 128-bit hash of a 17 to 128 byte input.
+ * Mixes 16-byte chunks taken symmetrically from the start and the end of the
+ * input, outermost pair last; chunks overlap when len is not a multiple of 32.
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   XXH128_hash_t acc;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+
+#if XXH_SIZE_OPT >= 1
+        {
+            /* Smaller, but slightly slower. */
+            unsigned int i = (unsigned int)(len - 1) / 32;
+            do {
+                acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
+            } while (i-- != 0);
+        }
+#else
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
+                }
+                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
+            }
+            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
+        }
+        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
+#endif
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+            return h128;
+        }
+    }
+}
+
+/*
+ * 128-bit hash of a 129 to XXH3_MIDSIZE_MAX byte input.
+ * Mixes the first 128 bytes in 32-byte steps, avalanches, mixes the remaining
+ * full 32-byte steps with the secret at XXH3_MIDSIZE_STARTOFFSET, and ends
+ * with the last 32 bytes of input mixed with the negated seed.
+ */
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                       XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    {   XXH128_hash_t acc;
+        unsigned i;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+        /*
+         *  We set `i` as offset + 32. We do this so that unchanged
+         * `len` can be used as upper bound. This reaches a sweet spot
+         * where both x86 and aarch64 get simple agen and good codegen
+         * for the loop.
+         */
+        for (i = 32; i < 160; i += 32) {
+            acc = XXH128_mix32B(acc,
+                                input  + i - 32,
+                                input  + i - 16,
+                                secret + i - 32,
+                                seed);
+        }
+        acc.low64 = XXH3_avalanche(acc.low64);
+        acc.high64 = XXH3_avalanche(acc.high64);
+        /*
+         * NB: `i <= len` will duplicate the last 32-bytes if
+         * len % 32 was zero. This is an unfortunate necessity to keep
+         * the hash result stable.
+         */
+        for (i=160; i <= len; i += 32) {
+            acc = XXH128_mix32B(acc,
+                                input + i - 32,
+                                input + i - 16,
+                                secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
+                                seed);
+        }
+        /* last bytes */
+        acc = XXH128_mix32B(acc,
+                            input + len - 16,
+                            input + len - 32,
+                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
+                            (XXH64_hash_t)0 - seed);
+
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+            return h128;
+        }
+    }
+}
+
+/*
+ * 128-bit finalization for long inputs.
+ * The low half is exactly the 64-bit finalization; the high half merges the
+ * same accumulator with a region near the end of the secret and a different
+ * starting value.
+ */
+static XXH_PUREF XXH128_hash_t
+XXH3_finalizeLong_128b(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, xxh_u64 len)
+{
+    XXH128_hash_t h128;
+    h128.low64 = XXH3_finalizeLong_64b(acc, secret, len);
+    h128.high64 = XXH3_mergeAccs(acc, secret + secretSize
+                                             - XXH_STRIPE_LEN - XXH_SECRET_MERGEACCS_START,
+                                             ~(len * XXH_PRIME64_2));
+    return h128;
+}
+
+/*
+ * One-shot long-input path (128-bit result): same accumulation loop as the
+ * 64-bit variant, finalized with XXH3_finalizeLong_128b().
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
+                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3_finalizeLong_128b(acc, secret, secretSize, (xxh_u64)len);
+}
+
+/*
+ * It's important for performance that XXH3_hashLong() is not inlined.
+ * seed64, secret and secretLen are ignored: XXH3_kSecret is always used.
+ */
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed64,
+                           const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64; (void)secret; (void)secretLen;
+    return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
+                                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * It's important for performance to pass @p secretLen (when it's static)
+ * to the compiler, so that it can properly optimize the vectorized loop.
+ *
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE. 
+ */
+XXH3_WITH_SECRET_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                              XXH64_hash_t seed64,
+                              const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64;  /* only the caller-provided secret is used */
+    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
+                                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * Long-input 128-bit hashing with a seed: seed64 == 0 hashes with
+ * XXH3_kSecret directly; otherwise a seed-specific secret is generated on
+ * the stack with f_initSec and used instead.
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
+                                XXH64_hash_t seed64,
+                                XXH3_f_accumulate f_acc,
+                                XXH3_f_scrambleAcc f_scramble,
+                                XXH3_f_initCustomSecret f_initSec)
+{
+    if (seed64 == 0)
+        return XXH3_hashLong_128b_internal(input, len,
+                                           XXH3_kSecret, sizeof(XXH3_kSecret),
+                                           f_acc, f_scramble);
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+        f_initSec(secret, seed64);
+        return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
+                                           f_acc, f_scramble);
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ * The secret/secretLen arguments are ignored; the secret is derived from the seed.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const void* input, size_t len,
+                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)secret; (void)secretLen;
+    return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+/* Common signature of the XXH3_hashLong_128b_* variants, used to select the long-input path. */
+typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
+                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);
+
+/*
+ * Length dispatcher for the 128-bit one-shot API:
+ *   len <= 16                -> XXH3_len_0to16_128b
+ *   len <= 128               -> XXH3_len_17to128_128b
+ *   len <= XXH3_MIDSIZE_MAX  -> XXH3_len_129to240_128b
+ *   otherwise                -> f_hl128
+ * f_hl128 is only called on the last path.
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_128bits_internal(const void* input, size_t len,
+                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                      XXH3_hashLong128_f f_hl128)
+{
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secret` conditions are not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    return f_hl128(input, len, seed64, secret, secretLen);
+}
+
+
+/* ===   Public XXH128 API   === */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
+{
+    return XXH3_128bits_internal(input, len, 0,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_default);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    return XXH3_128bits_internal(input, len, 0,
+                                 (const xxh_u8*)secret, secretSize,
+                                 XXH3_hashLong_128b_withSecret);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_internal(input, len, seed,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_withSeed);
+}
+
+/*! @ingroup XXH3_family */
+/*
+ * Inputs up to XXH3_MIDSIZE_MAX bytes are hashed with `seed` and the default
+ * XXH3_kSecret (the NULL long-hash callback is never reached on that path);
+ * longer inputs are hashed with the caller's secret.
+ */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
+}
+
+/*! 
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_withSeed(input, len, seed); +} + + +/* === XXH3 128-bit streaming === */ +#ifndef XXH_NO_STREAM +/* + * All initialization and update functions are identical to 64-bit streaming variant. + * The only difference is the finalization routine. + */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr) +{ + return XXH3_64bits_reset(statePtr); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSeed(statePtr, seed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed) +{ + return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_64bits_update(state, input, len); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? 
state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_finalizeLong_128b(acc, secret, state->secretLimit + XXH_STRIPE_LEN, (xxh_u64)state->totalLen); + } + /* len <= XXH3_MIDSIZE_MAX : short code */ + if (state->useSeed) + return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ +/* 128-bit utility functions */ + +#include /* memcmp, memcpy */ + +/* return : 1 is equal, 0 if different */ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) +{ + /* note : XXH128_hash_t is compact, it has no padding byte */ + return !(memcmp(&h1, &h2, sizeof(h1))); +} + +/* This prototype is compatible with stdlib's qsort(). + * @return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +/*! 
@ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) { /* swap so the bytes copied below are big-endian */
+        hash.high64 = XXH_swap64(hash.high64);
+        hash.low64  = XXH_swap64(hash.low64);
+    }
+    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64)); /* high64 goes first */
+    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); /* low64 follows immediately */
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
+{
+    XXH128_hash_t h;
+    h.high64 = XXH_readBE64(src); /* mirrors canonicalFromHash: first 8 bytes hold high64 */
+    h.low64  = XXH_readBE64(src->digest + 8); /* next 8 bytes hold low64 */
+    return h;
+}
+
+
+
+/* ==========================================
+ * Secret generators
+ * ==========================================
+ */
+#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x)) /* note: evaluates one of its arguments twice */
+
+XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
+{
+    XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 ); /* XOR low64 into bytes 0..7 of dst */
+    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 ); /* XOR high64 into bytes 8..15 */
+}
+
+/*!
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize) +{ +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(secretBuffer != NULL); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); +#else + /* production mode, assert() are disabled */ + if (secretBuffer == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; +#endif + + if (customSeedSize == 0) { + customSeed = XXH3_kSecret; + customSeedSize = XXH_SECRET_DEFAULT_SIZE; + } +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(customSeed != NULL); +#else + if (customSeed == NULL) return XXH_ERROR; +#endif + + /* Fill secretBuffer with a copy of customSeed - repeat as needed */ + { size_t pos = 0; + while (pos < secretSize) { + size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize); + memcpy((char*)secretBuffer + pos, customSeed, toCopy); + pos += toCopy; + } } + + { size_t const nbSeg16 = secretSize / 16; + size_t n; + XXH128_canonical_t scrambler; + XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); + for (n=0; n +#include + +/* we use thin wrappers instead of taking addresses of stdlib functions directly + * on MSVC, malloc/calloc/realloc/free are __declspec(dllimport) and their + * address is not guaranteed to be static (warning C4232) */ +static void *real_malloc(size_t size) +{ + return malloc(size); +} +static void *real_calloc(size_t count, size_t size) +{ + return calloc(count, size); +} +static void *real_realloc(void *ptr, size_t size) +{ + return realloc(ptr, size); +} +static void real_free(void *ptr) +{ + free(ptr); +} + +#include "alloc.h" + +/* global allocator instance initialized with system defaults + * we initialize with real stdlib functions so malloc/free work before tidesdb_init is called */ +tidesdb_allocator_t tidesdb_allocator = { + .malloc_fn = real_malloc, + .calloc_fn = real_calloc, + .realloc_fn = real_realloc, + 
.free_fn = real_free, +}; + +_Atomic(int) tidesdb_initialized = 0; + +int tidesdb_init(tidesdb_malloc_fn malloc_fn, tidesdb_calloc_fn calloc_fn, + tidesdb_realloc_fn realloc_fn, tidesdb_free_fn free_fn) +{ + if (atomic_load_explicit(&tidesdb_initialized, memory_order_acquire)) + { + return -1; + } + + tidesdb_allocator.malloc_fn = malloc_fn ? malloc_fn : real_malloc; + tidesdb_allocator.calloc_fn = calloc_fn ? calloc_fn : real_calloc; + tidesdb_allocator.realloc_fn = realloc_fn ? realloc_fn : real_realloc; + tidesdb_allocator.free_fn = free_fn ? free_fn : real_free; + + /* we release fence ensures all function pointer writes are visible before + * any thread sees initialized=1 and starts calling through them */ + atomic_store_explicit(&tidesdb_initialized, 1, memory_order_release); + + return 0; +} + +void tidesdb_finalize(void) +{ + /** we set initialized to 0 first with release semantics so concurrent readers + * see the flag change before we overwrite the function pointers */ + atomic_store_explicit(&tidesdb_initialized, 0, memory_order_release); + atomic_thread_fence(memory_order_seq_cst); + + tidesdb_allocator.malloc_fn = real_malloc; + tidesdb_allocator.calloc_fn = real_calloc; + tidesdb_allocator.realloc_fn = real_realloc; + tidesdb_allocator.free_fn = real_free; +} + +void tidesdb_ensure_initialized(void) +{ + if (!atomic_load_explicit(&tidesdb_initialized, memory_order_acquire)) + { + tidesdb_init(NULL, NULL, NULL, NULL); + } +} diff --git a/storage/tidesdb/libtidesdb/src/alloc.h b/storage/tidesdb/libtidesdb/src/alloc.h new file mode 100644 index 0000000000000..14c230c47cc6b --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/alloc.h @@ -0,0 +1,145 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ALLOC_H__ +#define __ALLOC_H__ + +#include +#include +#include +#include + +/** + * tidesdb_malloc_fn + * function pointer type for malloc-like allocation + * @param size number of bytes to allocate + * @return pointer to allocated memory or NULL on failure + */ +typedef void *(*tidesdb_malloc_fn)(size_t size); + +/** + * tidesdb_calloc_fn + * function pointer type for calloc-like allocation + * @param count number of elements to allocate + * @param size size of each element in bytes + * @return pointer to zero-initialized memory or NULL on failure + */ +typedef void *(*tidesdb_calloc_fn)(size_t count, size_t size); + +/** + * tidesdb_realloc_fn + * function pointer type for realloc-like reallocation + * @param ptr pointer to previously allocated memory (or NULL) + * @param size new size in bytes + * @return pointer to reallocated memory or NULL on failure + */ +typedef void *(*tidesdb_realloc_fn)(void *ptr, size_t size); + +/** + * tidesdb_free_fn + * function pointer type for free-like deallocation + * @param ptr pointer to memory to free (may be NULL) + */ +typedef void (*tidesdb_free_fn)(void *ptr); + +/** + * tidesdb_allocator_t + * holds the allocator function pointers + * @param malloc_fn malloc function pointer + * @param calloc_fn calloc function pointer + * @param realloc_fn realloc function pointer + * @param free_fn free function pointer + */ +typedef struct tidesdb_allocator_t +{ + tidesdb_malloc_fn malloc_fn; + tidesdb_calloc_fn calloc_fn; + tidesdb_realloc_fn realloc_fn; + tidesdb_free_fn free_fn; +} tidesdb_allocator_t; 
+
+extern tidesdb_allocator_t tidesdb_allocator; /* the single allocator table every tdb_* macro below dispatches through */
+extern _Atomic(int) tidesdb_initialized; /* 0 = not initialized, 1 = initialized */
+
+/**
+ * tidesdb_init
+ * initializes TidesDB with optional custom memory allocation functions
+ * must be called exactly once before any other TidesDB function
+ * pass NULL for any function to use the default system allocator
+ *
+ * NOTE(review): tidesdb_ensure_initialized() below also performs this call
+ * with all-NULL arguments when nothing has initialized the library yet, so a
+ * custom allocator must be installed before any TidesDB function runs
+ *
+ * @param malloc_fn custom malloc function (or NULL for system malloc)
+ * @param calloc_fn custom calloc function (or NULL for system calloc)
+ * @param realloc_fn custom realloc function (or NULL for system realloc)
+ * @param free_fn custom free function (or NULL for system free)
+ * @return 0 on success, -1 if already initialized
+ */
+int tidesdb_init(tidesdb_malloc_fn malloc_fn, tidesdb_calloc_fn calloc_fn,
+                 tidesdb_realloc_fn realloc_fn, tidesdb_free_fn free_fn);
+
+/**
+ * tidesdb_finalize
+ * finalizes TidesDB and resets the allocator
+ * should be called after all TidesDB operations are complete
+ * after calling this, tidesdb_init() can be called again
+ */
+void tidesdb_finalize(void);
+
+/**
+ * tidesdb_ensure_initialized
+ * internal function to auto-initialize with system allocator if not initialized
+ * called automatically by TidesDB methods
+ */
+void tidesdb_ensure_initialized(void);
+
+/* allocation macros that use the configured allocator */
+#define tdb_malloc(size)        (tidesdb_allocator.malloc_fn(size))
+#define tdb_calloc(count, size) (tidesdb_allocator.calloc_fn((count), (size)))
+#define tdb_realloc(ptr, size)  (tidesdb_allocator.realloc_fn((ptr), (size)))
+#define tdb_free(ptr)           (tidesdb_allocator.free_fn(ptr))
+
+/**
+ * override standard allocation functions.
+ * this allows existing code using malloc/calloc/realloc/free to automatically + */ +#undef malloc +#undef calloc +#undef realloc +#undef free +#define malloc(size) tdb_malloc(size) +#define calloc(count, size) tdb_calloc((count), (size)) +#define realloc(ptr, size) tdb_realloc((ptr), (size)) +#define free(ptr) tdb_free(ptr) + +/** + * tdb_strdup + * custom allocator-aware string duplication + * uses malloc (which is redirected to tdb_malloc above) so that the + * returned pointer can safely be freed via the custom allocator's free + * @param s the string to duplicate + * @return newly allocated copy of s, or NULL on failure + */ +static inline char *tdb_strdup(const char *s) +{ + if (!s) return NULL; + const size_t len = strlen(s) + 1; + char *dup = (char *)malloc(len); + if (dup) memcpy(dup, s, len); + return dup; +} + +#endif /* __ALLOC_H__ */ diff --git a/storage/tidesdb/libtidesdb/src/block_manager.c b/storage/tidesdb/libtidesdb/src/block_manager.c new file mode 100644 index 0000000000000..0c1cad696aa7b --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/block_manager.c @@ -0,0 +1,1816 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "block_manager.h" + +#include "xxhash.h" + +#define BM_UNLIKELY(x) TDB_UNLIKELY(x) +#define BM_LIKELY(x) TDB_LIKELY(x) + +/* thread-local reusable pread buffer to avoid page faults on every block read. 
*/ +#define BM_READ_BUF_INITIAL_SIZE (128 * 1024) + +/* payload bytes fetched together with the 8-byte block header in the first pread. + * a block whose payload fits within this hint is read in a single syscall; larger + * blocks pay one extra pread for the remainder. sized to cover the common data / + * index / footer block without over-reading huge bloom blocks. */ +#define BM_READ_HINT_BYTES (4u * 1024u) + +/* a block at or below this size is read without consulting the memory budget -- + * covers every data block and the common small footer block, so the hot read + * path is just integer compares. blocks larger than this (e.g. a multi-hundred- + * MB bloom filter on a huge bottom-level sstable) are rare and only there do we + * test the budget. the test itself is a relaxed atomic load, never a syscall. */ +#define BM_LARGE_BLOCK_BUDGET_CHECK_THRESHOLD (256u * 1024u * 1024u) + +/* memory-safety budget for a single block read, in bytes, pushed down from the + * tidesdb layer (resolved_memory_limit-derived) via + * block_manager_set_max_safe_block_bytes and refreshed by the reaper. 0 means + * "no budget configured" -- the size-vs-EOF check still applies, but no + * memory-based refusal happens (e.g. block_manager unit tests with no db). 
*/ +static _Atomic(uint64_t) bm_max_safe_block_bytes = 0; + +void block_manager_set_max_safe_block_bytes(uint64_t bytes) +{ + atomic_store_explicit(&bm_max_safe_block_bytes, bytes, memory_order_relaxed); +} + +static pthread_key_t bm_tls_key; +static pthread_once_t bm_tls_once = PTHREAD_ONCE_INIT; + +/** + * + * * * * * * * * * * * + * FILE FORMAT * + * * * * * * * * * * * + * + * * * * * * * * * * * + * HEADER * + * * * * * * * * * * * + * magic (3 bytes) 0x544442 "TDB" -- see BLOCK_MANAGER_MAGIC + * version (1 byte) -- see BLOCK_MANAGER_VERSION + * padding (4 bytes) reserved + * + * * * * * * * * * * * + * BLOCKS * + * * * * * * * * * * * + * block_size (4 bytes) -- size of data (uint32_t, supports up to 4GB) + * checksum (4 bytes) -- xxHash32 of data + * data (variable size) -- actual block data + * footer_size (4 bytes) -- duplicate of block_size for validation + * footer_magic (4 bytes) -- 0x42445442 "BTDB" for fast validation + * + * * * * * * * * * * * + * CONCURRENCY MODEL * + * * * * * * * * * * * + * single file descriptor shared by all operations + * pread/pwrite for lock-free reads (readers don't block readers or writers) + * atomic offset allocation for lock-free writes + * writers don't block writers, concurrent writes to different offsets + * readers never block, they can read while writes happen + * + * * * * * * * * * * * + * REFERENCE COUNTING * + * * * * * * * * * * * + * blocks use atomic reference counting for safe concurrent access + * blocks start with ref_count=1 when created + * callers must call block_manager_block_release when done + * blocks are freed when ref_count reaches 0 + * block_manager_block_acquire/release provide thread-safe ref management + * global block cache in tidesdb.c uses these functions for safe sharing + */ + +typedef struct +{ + uint8_t *buf; + size_t capacity; +} bm_tls_read_buf_t; + +static void bm_tls_destructor(void *ptr) +{ + if (ptr) + { + bm_tls_read_buf_t *tls = (bm_tls_read_buf_t *)ptr; + free(tls->buf); + 
free(tls); + } +} + +static void bm_tls_init_key(void) +{ + pthread_key_create(&bm_tls_key, bm_tls_destructor); +} + +static uint8_t *bm_get_read_buf(const size_t needed) +{ + pthread_once(&bm_tls_once, bm_tls_init_key); + + bm_tls_read_buf_t *tls = (bm_tls_read_buf_t *)pthread_getspecific(bm_tls_key); + if (!tls) + { + tls = (bm_tls_read_buf_t *)calloc(1, sizeof(bm_tls_read_buf_t)); + if (!tls) return NULL; + pthread_setspecific(bm_tls_key, tls); + } + + if (BM_LIKELY(needed <= tls->capacity)) return tls->buf; + + size_t new_size = tls->capacity ? tls->capacity : BM_READ_BUF_INITIAL_SIZE; + while (new_size < needed) new_size *= 2; + + uint8_t *new_buf = (uint8_t *)realloc(tls->buf, new_size); + if (!new_buf) return NULL; + + tls->buf = new_buf; + tls->capacity = new_size; + return new_buf; +} + +/** + * odsync_available + * check if O_DSYNC is available on the specific platform + * @return 1 if O_DSYNC is available, 0 otherwise + */ +static inline int odsync_available(void) +{ + return O_DSYNC != 0; +} + +/** + * is_sync_full + * is a block manager in sync full mode? + * @param bm + * @return 1 if sync mode is full, 0 otherwise + */ +static inline int is_sync_full(const block_manager_t *bm) +{ + return bm->sync_full_cached; +} + +/** + * compute_checksum + * compute xxHash32 checksum + * @param data the data to compute the checksum for + * @param size the size of the data + * @return the 32-bit checksum + */ +static inline uint32_t compute_checksum(const void *data, const size_t size) +{ + return XXH32(data, size, 0); +} + +/** + * verify_checksum + * verify xxHash32 checksum + * @param data the data to verify the checksum for + * @param size the size of the data + * @param expected_checksum the expected checksum + * @return 0 if the checksum matches, -1 otherwise + */ +static inline int verify_checksum(const void *data, const size_t size, + const uint32_t expected_checksum) +{ + return (compute_checksum(data, size) == expected_checksum) ? 
0 : -1; +} + +/** + * is_trailing_zero + * check whether the file region [start, end) consists entirely of zero bytes. + * used to distinguish preallocation tail (legitimate trailing zeros that should + * be tolerated by validation) from mid-write corruption (non-zero garbage). + * reads in chunks and stops early on the first non-zero byte. + * @param fd the file descriptor + * @param start start offset (inclusive) + * @param end end offset (exclusive) + * @return 1 if all bytes in [start, end) are zero, 0 if any non-zero byte found, -1 on I/O error + */ +static int is_trailing_zero(const int fd, const uint64_t start, const uint64_t end) +{ + if (start >= end) return 1; + + /* small on-stack chunk -- the loop re-reads, so a big buffer buys nothing and + * 64 KB on the stack is risky on platforms with small thread stacks */ + enum + { + SCAN_CHUNK = 8 * 1024 + }; + unsigned char buf[SCAN_CHUNK]; + + uint64_t pos = start; + while (pos < end) + { + size_t want = SCAN_CHUNK; + if ((uint64_t)want > end - pos) want = (size_t)(end - pos); + + const ssize_t got = pread(fd, buf, want, (off_t)pos); + if (got <= 0) return -1; + + for (ssize_t i = 0; i < got; i++) + { + if (buf[i] != 0) return 0; + } + pos += (uint64_t)got; + } + return 1; +} + +/** + * maybe_extend_allocation + * extends the on-disk preallocation when a new reservation gets within LOWWATER of + * the current preallocated extent. multiple writers may race here; the loop is + * lock-free and at worst causes a redundant fallocate (idempotent on overlapping + * ranges). on platforms without preallocation support, the first failure stamps + * preallocated_size with UINT64_MAX so the slow path is never retaken. 
+ * @param bm the block manager
+ * @param reservation_end one past the last byte just reserved by the caller
+ */
+static inline void maybe_extend_allocation(block_manager_t *bm, const uint64_t reservation_end)
+{
+    for (;;)
+    {
+        const uint64_t prealloc =
+            atomic_load_explicit(&bm->preallocated_size, memory_order_acquire);
+        if (BM_LIKELY(reservation_end + BLOCK_MANAGER_PREALLOC_LOWWATER <= prealloc)) return; /* still more than LOWWATER of headroom */
+
+        /* we round up to the next CHUNK boundary so successive extends stay aligned */
+        const uint64_t target =
+            ((reservation_end + BLOCK_MANAGER_PREALLOC_CHUNK - 1) / BLOCK_MANAGER_PREALLOC_CHUNK) *
+            BLOCK_MANAGER_PREALLOC_CHUNK;
+        if (target <= prealloc) return; /* another writer already extended past us */
+
+        if (tdb_preallocate_extent(bm->fd, (off_t)prealloc, (off_t)(target - prealloc)) != 0)
+        {
+            /** unsupported on this fs/platform, disable further attempts.
+             * subsequent pwrites simply take the (slower) extending-write path. */
+            atomic_store_explicit(&bm->preallocated_size, UINT64_MAX, memory_order_release);
+            return;
+        }
+
+        uint64_t expected = prealloc; /* publish the new extent only if nobody changed it meanwhile */
+        if (atomic_compare_exchange_strong_explicit(&bm->preallocated_size, &expected, target,
+                                                    memory_order_release, memory_order_acquire))
+        {
+            return;
+        }
+        /* lost the CAS race; another writer also extended -- reload and re-check */
+    }
+}
+
+/**
+ * write_header
+ * write file header using pwrite
+ * the header is written at file offset 0; a short write counts as failure
+ * @param fd the file descriptor to write to
+ * @return 0 if successful, -1 otherwise
+ */
+static int write_header(const int fd)
+{
+    unsigned char header[BLOCK_MANAGER_HEADER_SIZE];
+    const uint32_t padding = 0;
+
+    /* header format
+     * [3-byte magic][1-byte version][4-byte padding] = 8 bytes */
+    encode_uint32_le_compat(header, BLOCK_MANAGER_MAGIC);
+    header[BLOCK_MANAGER_MAGIC_SIZE] = BLOCK_MANAGER_VERSION; /* version byte sits right after the magic */
+    encode_uint32_le_compat(header + BLOCK_MANAGER_MAGIC_SIZE + BLOCK_MANAGER_VERSION_SIZE,
+                            padding);
+
+    const ssize_t written = pwrite(fd, header, BLOCK_MANAGER_HEADER_SIZE, 0);
+    return (written == BLOCK_MANAGER_HEADER_SIZE) ? 0 : -1;
+}
+
+/**
+ * read_header
+ * read and validate file header using pread
+ * fails on a short read, a magic mismatch or a version mismatch
+ * @param fd the file descriptor to read from
+ * @return 0 if successful, -1 otherwise
+ */
+static int read_header(const int fd)
+{
+    unsigned char header[BLOCK_MANAGER_HEADER_SIZE];
+
+    const ssize_t nread = pread(fd, header, BLOCK_MANAGER_HEADER_SIZE, 0);
+    if (nread != BLOCK_MANAGER_HEADER_SIZE) return -1;
+
+    /* we decode magic using little-endian conversion for cross-platform compatibility */
+    uint32_t magic = decode_uint32_le_compat(header);
+    magic &= BLOCK_MANAGER_MAGIC_MASK; /* the decoded word also covers the version byte; keep the magic bits only */
+
+    if (magic != BLOCK_MANAGER_MAGIC) return -1;
+
+    uint8_t version;
+    memcpy(&version, header + BLOCK_MANAGER_MAGIC_SIZE, BLOCK_MANAGER_VERSION_SIZE);
+    if (version != BLOCK_MANAGER_VERSION) return -1;
+
+    return 0;
+}
+
+/**
+ * get_file_size
+ * get file size using fstat
+ * @param fd the file descriptor to get the size of
+ * @param size the size to store the result in (left untouched on failure)
+ * @return 0 if successful, -1 otherwise
+ */
+static int get_file_size(const int fd, uint64_t *size)
+{
+    struct STAT_STRUCT st;
+    if (FSTAT_FUNC(fd, &st) != 0) return -1;
+    *size = (uint64_t)st.st_size;
+    return 0;
+}
+
+/**
+ * reopen_fd
+ * closes and reopens the block manager file descriptor with the same flags.
+ * NOT safe against concurrent readers: a reader that already captured bm->fd will
+ * pread on a closed (possibly recycled) descriptor. callers (truncate, permissive
+ * validation) must hold the bm exclusively / quiesce readers first.
+ * @param bm the block manager + * @return 0 if successful, -1 if not + */ +static int reopen_fd(block_manager_t *bm) +{ + close(bm->fd); + + int flags = O_RDWR | O_CREAT; + if (is_sync_full(bm) && odsync_available()) + { + flags |= O_DSYNC; + } + + bm->fd = open(bm->file_path, flags, BLOCK_MANAGER_FILE_MODE); + if (bm->fd == -1) return -1; + + return 0; +} + +/** + * truncate_to_header + * truncates a block manager file to just the header and syncs + * @param bm the block manager + * @return 0 if successful, -1 if not + */ +static int truncate_to_header(block_manager_t *bm) +{ + if (ftruncate(bm->fd, (off_t)BLOCK_MANAGER_HEADER_SIZE) == -1) return -1; + + /* ftruncate is not covered by O_DSYNC, we always sync truncation */ + if (is_sync_full(bm)) + { + fdatasync(bm->fd); + } + + atomic_store(&bm->current_file_size, BLOCK_MANAGER_HEADER_SIZE); + /** preallocation is invalidated by ftruncate; we reset to current size so the next + * write triggers a fresh extend */ + atomic_store(&bm->preallocated_size, BLOCK_MANAGER_HEADER_SIZE); + return 0; +} + +/** + * block_manager_open_internal + * opens a block manager (no cache) + * @param bm the block manager to open + * @param file_path the path of the file + * @param sync_mode the sync mode (TDB_SYNC_NONE, TDB_SYNC_FULL) + * @return 0 if successful, -1 if not + */ +static int block_manager_open_internal(block_manager_t **bm, const char *file_path, + const block_manager_sync_mode_t sync_mode) +{ + block_manager_t *new_bm = malloc(sizeof(block_manager_t)); + if (!new_bm) + { + *bm = NULL; + return -1; + } + + /* we initialize atomic variable to prevent reading uninitialized memory */ + atomic_init(&new_bm->current_file_size, 0); + atomic_init(&new_bm->preallocated_size, 0); + atomic_init(&new_bm->group_durable_size, 0); + atomic_init(&new_bm->group_sync_active, 0); + + new_bm->sync_mode = sync_mode; + new_bm->sync_full_cached = (sync_mode == BLOCK_MANAGER_SYNC_FULL); + + const int file_exists = access(file_path, F_OK) == 
0; + + int flags = O_RDWR | O_CREAT; + + /* we use O_DSYNC for synchronous data writes in SYNC_FULL mode + * this ensures each pwrite is durable before returning, eliminating + * the need for per-write fdatasync() calls on platforms that support it. + * this is also faster, less syscalls, for example + */ + if (is_sync_full(new_bm) && odsync_available()) + { + flags |= O_DSYNC; + } + + const mode_t mode = BLOCK_MANAGER_FILE_MODE; + + new_bm->fd = open(file_path, flags, mode); + if (new_bm->fd == -1) + { + /* preserve the open() errno across free() so the caller can report the real cause + * (EMFILE/ENFILE = fd exhaustion, ENOSPC = disk full, EACCES, ...) */ + const int open_errno = errno; + free(new_bm); + *bm = NULL; + errno = open_errno; + return -1; + } + + strncpy(new_bm->file_path, file_path, MAX_FILE_PATH_LENGTH - 1); + new_bm->file_path[MAX_FILE_PATH_LENGTH - 1] = '\0'; + + if (file_exists) + { + if (read_header(new_bm->fd) != 0) + { + const int hdr_errno = errno; + close(new_bm->fd); + free(new_bm); + *bm = NULL; + errno = hdr_errno; + return -1; + } + } + else + { + if (write_header(new_bm->fd) != 0) + { + const int hdr_errno = errno; + close(new_bm->fd); + free(new_bm); + *bm = NULL; + errno = hdr_errno; + return -1; + } + /* if O_DSYNC is available, pwrite already synced the header + * otherwise fall back to explicit fdatasync */ + if (is_sync_full(new_bm) && !odsync_available()) + { + if (fdatasync(new_bm->fd) != 0) + { + const int sync_errno = errno; + close(new_bm->fd); + free(new_bm); + *bm = NULL; + errno = sync_errno; + return -1; + } + } + } + + /* we set current_file_size if not already set by validation */ + if (atomic_load(&new_bm->current_file_size) == 0) + { + uint64_t file_size = 0; + if (get_file_size(new_bm->fd, &file_size) == 0) + { + atomic_store(&new_bm->current_file_size, file_size); + } + else + { + /* if we can't get size, use lseek to get current position (end of file) */ + const off_t pos = lseek(new_bm->fd, 0, SEEK_END); + 
atomic_store(&new_bm->current_file_size, (pos >= 0) ? (uint64_t)pos : 0); + } + } + + /* preallocated extent starts at the current file size; first write will extend it */ + atomic_store(&new_bm->preallocated_size, atomic_load(&new_bm->current_file_size)); + + *bm = new_bm; + return 0; +} + +int block_manager_close(block_manager_t *bm) +{ + if (!bm) return -1; + + /* preallocation advances logical EOF past actual data; trim back so next-open + * validation sees the real tail block instead of trailing zeros. crash recovery + * still has to tolerate trailing zeros (size_field == 0 marks the boundary). */ + const uint64_t valid_size = atomic_load(&bm->current_file_size); + const uint64_t prealloc = atomic_load(&bm->preallocated_size); + if (prealloc != UINT64_MAX && prealloc > valid_size && bm->fd >= 0) + { + /* best-effort -- if it fails, next-open validate_last_block tolerates the + * trailing-zero preallocation tail. (void) cast doesn't suppress glibc's + * warn_unused_result, hence the explicit if. 
*/ + if (ftruncate(bm->fd, (off_t)valid_size) != 0) + { + /* swallow */ + } + } + + /* final sync on close -- we really only needed if O_DSYNC wasnt used + * with O_DSYNC, all writes are already durable */ + if (is_sync_full(bm) && !odsync_available()) + { + (void)fdatasync(bm->fd); + } + + int close_result = 0; + if (bm->fd >= 0 && close(bm->fd) != 0) + { + close_result = -1; + } + + free(bm); + + return close_result; +} + +block_manager_block_t *block_manager_block_create(const uint64_t size, const void *data) +{ + if (size > UINT32_MAX) + { + return NULL; + } + + block_manager_block_t *block = malloc(sizeof(block_manager_block_t)); + if (!block) return NULL; + + block->size = size; + atomic_init(&block->ref_count, 1); + block->inline_data = 0; + + block->data = malloc(size); + if (!block->data) + { + free(block); + return NULL; + } + + /* we only copy if size > 0 and data is not NULL */ + if (size > 0 && data != NULL) + { + memcpy(block->data, data, size); + } + return block; +} + +block_manager_block_t *block_manager_block_create_from_buffer(const uint64_t size, void *data) +{ + if (size > UINT32_MAX) + { + return NULL; + } + + block_manager_block_t *block = malloc(sizeof(block_manager_block_t)); + if (!block) return NULL; + + block->size = size; + block->data = data; + atomic_init(&block->ref_count, 1); + block->inline_data = 0; + return block; +} + +/** + * bm_append_block + * append one framed block [size][checksum][data][size][magic] at the atomically + * reserved tail offset via a single pwritev. shared by block_write and write_raw + * so the on-disk encoding lives in one place. data must be non-NULL and size + * non-zero -- the caller validates (a zero size_field reads back as EOF). 
+ * @return the offset written at, or -1 on failure + */ +static int64_t bm_append_block(block_manager_t *bm, const void *data, const uint32_t size) +{ + const size_t total_size = + BLOCK_MANAGER_BLOCK_HEADER_SIZE + (size_t)size + BLOCK_MANAGER_FOOTER_SIZE; + const uint32_t checksum = compute_checksum(data, size); + + /* atomically reserve space, then extend preallocation so the pwrite stays in-place */ + const int64_t offset = (int64_t)atomic_fetch_add(&bm->current_file_size, total_size); + (void)maybe_extend_allocation(bm, (uint64_t)offset + total_size); + + unsigned char header[BLOCK_MANAGER_BLOCK_HEADER_SIZE]; + encode_uint32_le_compat(header, size); + encode_uint32_le_compat(header + BLOCK_MANAGER_SIZE_FIELD_SIZE, checksum); + + unsigned char footer[BLOCK_MANAGER_FOOTER_SIZE]; + encode_uint32_le_compat(footer, size); + encode_uint32_le_compat(footer + BLOCK_MANAGER_CHECKSUM_LENGTH, BLOCK_MANAGER_FOOTER_MAGIC); + + /* header + data + footer in a single pwritev -- zero copy from data */ + struct iovec iov[BLOCK_MANAGER_IOVECS_PER_BLOCK]; + iov[0].iov_base = header; + iov[0].iov_len = BLOCK_MANAGER_BLOCK_HEADER_SIZE; + iov[1].iov_base = (void *)data; + iov[1].iov_len = size; + iov[2].iov_base = footer; + iov[2].iov_len = BLOCK_MANAGER_FOOTER_SIZE; + + if (BM_UNLIKELY(tdb_pwritev_safe(bm->fd, iov, BLOCK_MANAGER_IOVECS_PER_BLOCK, (off_t)offset) != + (ssize_t)total_size)) + return -1; + + /* with O_DSYNC the pwrite already synced; otherwise fall back to fdatasync */ + if (is_sync_full(bm) && !odsync_available()) + { + if (fdatasync(bm->fd) != 0) return -1; + } + + return offset; +} + +int64_t block_manager_block_write(block_manager_t *bm, block_manager_block_t *block) +{ + if (BM_UNLIKELY(!bm || !block)) return -1; + + /* block size is stored as uint32_t, thus enforced 4GB limit */ + if (BM_UNLIKELY(block->size > UINT32_MAX)) return -1; + + /* a zero-size block encodes size_field == 0, which every reader treats as EOF; + * reject it so it can never truncate 
iteration (matches write_raw) */ + if (BM_UNLIKELY(block->size == 0)) return -1; + + /* guard size_t overflow of the framed total on 32-bit platforms */ + if (block->size > SIZE_MAX - BLOCK_MANAGER_BLOCK_HEADER_SIZE - BLOCK_MANAGER_FOOTER_SIZE) + return -1; + + return bm_append_block(bm, block->data, (uint32_t)block->size); +} + +int64_t block_manager_write_raw(block_manager_t *bm, const void *data, const uint32_t size) +{ + if (BM_UNLIKELY(!bm || !data || size == 0)) return -1; + return bm_append_block(bm, data, size); +} + +/* maximum iovecs per pwritev call, POSIX minimum is 16, Linux uses 1024 */ +#ifndef BM_IOV_MAX +#define BM_IOV_MAX 1024 +#endif + +int block_manager_block_write_batch(block_manager_t *bm, block_manager_block_t **blocks, + const size_t count, int64_t *offsets) +{ + if (BM_UNLIKELY(!bm || !blocks || count == 0 || !offsets)) return -1; + + /* we calculate total size needed and count valid blocks; NULL entries are skipped with offset -1 */ + size_t total_batch_size = 0; + size_t valid_count = 0; + for (size_t i = 0; i < count; i++) + { + if (!blocks[i]) + { + offsets[i] = -1; + continue; + } + if (blocks[i]->size == 0 || blocks[i]->size > UINT32_MAX) return -1; /* size 0 would encode size_field == 0, which reads back as EOF (matches block_write and write_raw) */ + + total_batch_size += + BLOCK_MANAGER_BLOCK_HEADER_SIZE + blocks[i]->size + BLOCK_MANAGER_FOOTER_SIZE; + valid_count++; + } + + if (total_batch_size == 0) return 0; + + /* we atomically allocate space for all blocks at once */ + const int64_t base_offset = (int64_t)atomic_fetch_add(&bm->current_file_size, total_batch_size); + + (void)maybe_extend_allocation(bm, (uint64_t)base_offset + total_batch_size); + + const size_t meta_size = + valid_count * (BLOCK_MANAGER_BLOCK_HEADER_SIZE + BLOCK_MANAGER_FOOTER_SIZE); + const size_t iov_count = valid_count * BLOCK_MANAGER_IOVECS_PER_BLOCK; + unsigned char *alloc = malloc(meta_size + iov_count * sizeof(struct iovec)); + if (!alloc) return -1; + + unsigned char *meta_buf = alloc; + struct iovec *iov = (struct iovec *)(alloc + meta_size); + + /* we build iovecs, header and footer go into meta_buf, data points 
directly to block->data */ + int64_t current_offset = base_offset; + size_t iov_idx = 0; + size_t meta_idx = 0; + + for (size_t i = 0; i < count; i++) + { + if (!blocks[i]) continue; + + block_manager_block_t *block = blocks[i]; + const size_t block_total = + BLOCK_MANAGER_BLOCK_HEADER_SIZE + block->size + BLOCK_MANAGER_FOOTER_SIZE; + + offsets[i] = current_offset; + + /* we encode header and footer into contiguous metadata buffer */ + unsigned char *hdr = + meta_buf + meta_idx * (BLOCK_MANAGER_BLOCK_HEADER_SIZE + BLOCK_MANAGER_FOOTER_SIZE); + unsigned char *ftr = hdr + BLOCK_MANAGER_BLOCK_HEADER_SIZE; + + const uint32_t checksum = compute_checksum(block->data, block->size); + encode_uint32_le_compat(hdr, (uint32_t)block->size); + encode_uint32_le_compat(hdr + BLOCK_MANAGER_SIZE_FIELD_SIZE, checksum); + encode_uint32_le_compat(ftr, (uint32_t)block->size); + encode_uint32_le_compat(ftr + BLOCK_MANAGER_CHECKSUM_LENGTH, BLOCK_MANAGER_FOOTER_MAGIC); + + iov[iov_idx].iov_base = hdr; + iov[iov_idx].iov_len = BLOCK_MANAGER_BLOCK_HEADER_SIZE; + iov[iov_idx + 1].iov_base = block->data; + iov[iov_idx + 1].iov_len = block->size; + iov[iov_idx + 2].iov_base = ftr; + iov[iov_idx + 2].iov_len = BLOCK_MANAGER_FOOTER_SIZE; + + iov_idx += BLOCK_MANAGER_IOVECS_PER_BLOCK; + meta_idx++; + current_offset += (int64_t)block_total; + } + + /* we write in BM_IOV_MAX-sized chunks for batches that exceed the iovec limit */ + size_t iov_done = 0; + off_t write_offset = (off_t)base_offset; + + while (iov_done < iov_idx) + { + int chunk = (int)(iov_idx - iov_done); + if (chunk > BM_IOV_MAX) chunk = BM_IOV_MAX; + + ssize_t expected = 0; + for (int j = 0; j < chunk; j++) expected += (ssize_t)iov[iov_done + j].iov_len; + + const ssize_t written = tdb_pwritev_safe(bm->fd, iov + iov_done, chunk, write_offset); + if (written != expected) + { + free(alloc); + for (size_t i = 0; i < count; i++) offsets[i] = -1; + return -1; + } + + write_offset += written; + iov_done += (size_t)chunk; + } + + 
free(alloc); + + /* we sync if needed */ + if (is_sync_full(bm) && !odsync_available()) + { + if (fdatasync(bm->fd) != 0) + { + return -1; + } + } + + return (int)valid_count; +} + +int block_manager_write_at(block_manager_t *bm, const int64_t offset, const uint8_t *data, + const size_t size) +{ + if (!bm || !data || size == 0 || offset < 0) return -1; + + /* this only patches existing data -- a write past the tracked extent would + * grow the file without advancing current_file_size, desyncing the two */ + if ((uint64_t)offset + size > atomic_load(&bm->current_file_size)) return -1; + + const ssize_t written = pwrite(bm->fd, data, size, offset); + if (written != (ssize_t)size) + { + return -1; + } + + if (is_sync_full(bm) && !odsync_available()) + { + if (fdatasync(bm->fd) != 0) + { + return -1; + } + } + + return 0; +} + +int block_manager_update_checksum(block_manager_t *bm, const int64_t block_offset) +{ + if (!bm || block_offset < 0) return -1; + + /* we read block size from header */ + unsigned char size_buf[BLOCK_MANAGER_SIZE_FIELD_SIZE]; + if (pread(bm->fd, size_buf, BLOCK_MANAGER_SIZE_FIELD_SIZE, block_offset) != + BLOCK_MANAGER_SIZE_FIELD_SIZE) + { + return -1; + } + + const uint32_t block_size = decode_uint32_le_compat(size_buf); + if (block_size == 0) return -1; + + /* we use thread-local buffer to avoid page faults from fresh malloc pages */ + uint8_t *data = bm_get_read_buf(block_size); + if (!data) return -1; + + const off_t data_offset = block_offset + BLOCK_MANAGER_BLOCK_HEADER_SIZE; + if (pread(bm->fd, data, block_size, data_offset) != (ssize_t)block_size) + { + return -1; + } + + const uint32_t new_checksum = compute_checksum(data, block_size); + + unsigned char checksum_buf[BLOCK_MANAGER_CHECKSUM_LENGTH]; + encode_uint32_le_compat(checksum_buf, new_checksum); + + const off_t checksum_offset = block_offset + BLOCK_MANAGER_SIZE_FIELD_SIZE; + if (pwrite(bm->fd, checksum_buf, BLOCK_MANAGER_CHECKSUM_LENGTH, checksum_offset) != + 
BLOCK_MANAGER_CHECKSUM_LENGTH) + { + return -1; + } + + if (is_sync_full(bm) && !odsync_available()) + { + if (fdatasync(bm->fd) != 0) + { + return -1; + } + } + + return 0; +} + +void block_manager_block_free(block_manager_block_t *block) +{ + if (!block) return; + + if (!block->inline_data && block->data) free(block->data); + free(block); +} + +int block_manager_block_acquire(block_manager_block_t *block) +{ + if (!block) return 0; + + uint32_t old_ref = atomic_load_explicit(&block->ref_count, memory_order_relaxed); + do + { + if (old_ref == 0) return 0; /* block is being freed */ + } while (!atomic_compare_exchange_weak_explicit(&block->ref_count, &old_ref, old_ref + 1, + memory_order_acquire, memory_order_relaxed)); + return 1; +} + +void block_manager_block_release(block_manager_block_t *block) +{ + if (!block) return; + + const uint32_t old_ref = atomic_fetch_sub_explicit(&block->ref_count, 1, memory_order_release); + if (old_ref == 1) + { + /* we were the last reference, free the block */ + atomic_thread_fence(memory_order_acquire); + block_manager_block_free(block); + } +} + +int block_manager_cursor_init_stack(block_manager_cursor_t *cursor, block_manager_t *bm) +{ + if (!cursor || !bm) return -1; + + cursor->bm = bm; + + /* we initialize to position before first block */ + cursor->current_pos = BLOCK_MANAGER_HEADER_SIZE; + cursor->current_block_size = 0; + cursor->block_index = -1; /* -1 means before first block */ + cursor->block_size_valid = 0; + + /* we position at first block so cursor_read works immediately */ + block_manager_cursor_goto_first(cursor); + + return 0; +} + +int block_manager_cursor_init(block_manager_cursor_t **cursor, block_manager_t *bm) +{ + if (!cursor || !bm) return -1; /* cursor is an out-parameter that is dereferenced just below */ + + (*cursor) = malloc(sizeof(block_manager_cursor_t)); + if (!(*cursor)) return -1; + + const int rc = block_manager_cursor_init_stack(*cursor, bm); + if (rc == 0) + { + /* heap-allocated cursors are used for sequential iteration + * we hint to OS for read-ahead optimization 
*/ + set_file_sequential_hint(bm->fd); + } + return rc; +} + +int block_manager_cursor_next(block_manager_cursor_t *cursor) +{ + if (!cursor) return -1; + + uint32_t block_size; + + /* we use cached block size if valid, otherwise read from disk */ + if (cursor->block_size_valid && cursor->current_block_size > 0) + { + block_size = (uint32_t)cursor->current_block_size; + } + else + { + unsigned char size_buf[BLOCK_MANAGER_SIZE_FIELD_SIZE]; + const ssize_t nread = pread(cursor->bm->fd, size_buf, BLOCK_MANAGER_SIZE_FIELD_SIZE, + (off_t)cursor->current_pos); + if (nread != BLOCK_MANAGER_SIZE_FIELD_SIZE) + { + if (nread == 0) return 1; /* EOF */ + return -1; + } + block_size = decode_uint32_le_compat(size_buf); + if (block_size == 0) return -1; /* invalid block */ + } + + /* next block starts after, [size][checksum][data][footer_size][footer_magic] */ + cursor->current_pos += + BLOCK_MANAGER_BLOCK_HEADER_SIZE + (uint64_t)block_size + BLOCK_MANAGER_FOOTER_SIZE; + cursor->current_block_size = 0; + cursor->block_size_valid = 0; /* we invalidate cache after moving */ + cursor->block_index++; + + return 0; +} + +int block_manager_cursor_has_next(block_manager_cursor_t *cursor) +{ + if (!cursor) return -1; + + const uint64_t file_size = atomic_load(&cursor->bm->current_file_size); + if (cursor->current_pos >= file_size) return 0; /* at or past EOF */ + + /** we use cached block size if valid */ + if (cursor->block_size_valid && cursor->current_block_size > 0) + { + return 1; + } + + /* we read current block size to check if current block is valid */ + unsigned char size_buf[BLOCK_MANAGER_SIZE_FIELD_SIZE]; + const ssize_t nread = + pread(cursor->bm->fd, size_buf, BLOCK_MANAGER_SIZE_FIELD_SIZE, (off_t)cursor->current_pos); + if (nread != BLOCK_MANAGER_SIZE_FIELD_SIZE) + { + if (nread == 0) return 0; /* EOF */ + return -1; + } + + const uint32_t block_size = decode_uint32_le_compat(size_buf); + if (block_size == 0) return -1; /* invalid block */ + + /* we cache the block size 
for subsequent cursor_next call */ + cursor->current_block_size = block_size; + cursor->block_size_valid = 1; + + /* has_next returns 1 if cursor_next would succeed (can read current block and move forward) */ + return 1; +} + +int block_manager_cursor_has_prev(block_manager_cursor_t *cursor) +{ + if (!cursor) return -1; + + return (cursor->current_pos > BLOCK_MANAGER_HEADER_SIZE) ? 1 : 0; +} + +int block_manager_cursor_skip_corrupt(block_manager_cursor_t *cursor) +{ + if (!cursor) return -1; + + /* we read the size field from the current position */ + unsigned char size_buf[BLOCK_MANAGER_SIZE_FIELD_SIZE]; + if (pread(cursor->bm->fd, size_buf, BLOCK_MANAGER_SIZE_FIELD_SIZE, + (off_t)cursor->current_pos) != BLOCK_MANAGER_SIZE_FIELD_SIZE) + { + return -1; + } + + const uint32_t block_size = decode_uint32_le_compat(size_buf); + if (block_size == 0) return -1; /* zero-filled hole extent unknown, cannot advance */ + + /* read footer magic to distinguish partial write from genuine corruption. 
+ * footer layout [footer_size(4)][footer_magic(4)]; footer_magic sits at + * (current_pos + BLOCK_HEADER_SIZE + block_size + SIZE_FIELD_SIZE) */ + const off_t footer_magic_offset = (off_t)cursor->current_pos + BLOCK_MANAGER_BLOCK_HEADER_SIZE + + (off_t)block_size + BLOCK_MANAGER_SIZE_FIELD_SIZE; + unsigned char magic_buf[BLOCK_MANAGER_CHECKSUM_LENGTH]; + const ssize_t nread = + pread(cursor->bm->fd, magic_buf, BLOCK_MANAGER_CHECKSUM_LENGTH, footer_magic_offset); + if (nread != BLOCK_MANAGER_CHECKSUM_LENGTH) + { + /* footer not present so file truncated mid-block; treat as partial write */ + cursor->current_pos += + BLOCK_MANAGER_BLOCK_HEADER_SIZE + (uint64_t)block_size + BLOCK_MANAGER_FOOTER_SIZE; + cursor->current_block_size = 0; + cursor->block_size_valid = 0; + cursor->block_index++; + return 0; + } + + const uint32_t footer_magic = decode_uint32_le_compat(magic_buf); + if (footer_magic == BLOCK_MANAGER_FOOTER_MAGIC) + { + return -1; + } + + cursor->current_pos += + BLOCK_MANAGER_BLOCK_HEADER_SIZE + (uint64_t)block_size + BLOCK_MANAGER_FOOTER_SIZE; + cursor->current_block_size = 0; + cursor->block_size_valid = 0; + cursor->block_index++; + return 0; +} + +/** + * bm_read_block_tls + * reads a full block (header + payload) at `offset` into the thread-local buffer. + * the first pread grabs the header plus BM_READ_HINT_BYTES of payload, so a block + * within the hint costs a single syscall; a larger block pays one more pread for + * the remainder. the checksum is verified before returning. 
+ * @param fd the file descriptor + * @param offset the file offset of the block (start of header) + * @param extent_limit if non-zero, reject a block whose frame extends past this + * byte offset (guards against garbage sizes); 0 skips the check + * @param check_budget if non-zero, refuse a payload larger than the memory budget + * @param out_size set to the payload size on success + * @return pointer to the verified payload inside the TLS buffer, or NULL on failure + */ +static uint8_t *bm_read_block_tls(const int fd, const uint64_t offset, const uint64_t extent_limit, + const int check_budget, uint32_t *out_size) +{ + /* first pread -- header + a hint of payload in one syscall */ + uint8_t *buf = bm_get_read_buf(BLOCK_MANAGER_BLOCK_HEADER_SIZE + BM_READ_HINT_BYTES); + if (BM_UNLIKELY(!buf)) return NULL; + + const ssize_t got = + pread(fd, buf, BLOCK_MANAGER_BLOCK_HEADER_SIZE + BM_READ_HINT_BYTES, (off_t)offset); + if (BM_UNLIKELY(got < (ssize_t)BLOCK_MANAGER_BLOCK_HEADER_SIZE)) return NULL; + + const uint32_t size = decode_uint32_le_compat(buf); + if (BM_UNLIKELY(size == 0)) return NULL; + const uint32_t checksum = decode_uint32_le_compat(buf + BLOCK_MANAGER_SIZE_FIELD_SIZE); + + /* a block claiming to extend past the data extent is garbage (off-boundary + * read, torn write, corruption) -- reject before reading/allocating trash */ + if (extent_limit) + { + const uint64_t frame_end = + offset + BLOCK_MANAGER_BLOCK_HEADER_SIZE + (uint64_t)size + BLOCK_MANAGER_FOOTER_SIZE; + if (BM_UNLIKELY(frame_end > extent_limit)) return NULL; + } + + /* only large blocks consult the budget (relaxed atomic load, no syscall); a + * block over budget is skipped so the caller degrades instead of OOMing */ + if (check_budget && BM_UNLIKELY(size > BM_LARGE_BLOCK_BUDGET_CHECK_THRESHOLD)) + { + const uint64_t budget = + atomic_load_explicit(&bm_max_safe_block_bytes, memory_order_relaxed); + if (budget > 0 && (uint64_t)size > budget) return NULL; + } + + /* payload bytes already in buf 
(the first read may also have pulled the footer + * and into the next block -- clamp to the real payload length) */ + uint32_t have = (uint32_t)got - BLOCK_MANAGER_BLOCK_HEADER_SIZE; + if (have > size) have = size; + + if (size > have) + { + /* grow the TLS buffer if needed -- realloc preserves the bytes already read */ + buf = bm_get_read_buf(BLOCK_MANAGER_BLOCK_HEADER_SIZE + size); + if (BM_UNLIKELY(!buf)) return NULL; + + const off_t rem_offset = (off_t)offset + BLOCK_MANAGER_BLOCK_HEADER_SIZE + have; + if (BM_UNLIKELY(pread(fd, buf + BLOCK_MANAGER_BLOCK_HEADER_SIZE + have, size - have, + rem_offset) != (ssize_t)(size - have))) + return NULL; + } + + uint8_t *payload = buf + BLOCK_MANAGER_BLOCK_HEADER_SIZE; + if (BM_UNLIKELY(verify_checksum(payload, size, checksum) != 0)) return NULL; + + *out_size = size; + return payload; +} + +/** + * block_manager_read_block_at_offset + * reads a block at a specific offset + * @param bm the block manager + * @param offset the offset to read from + * @return the block if successful, NULL otherwise + */ +static block_manager_block_t *block_manager_read_block_at_offset(block_manager_t *bm, + const uint64_t offset) +{ + if (BM_UNLIKELY(!bm)) return NULL; + + /* enforce the data extent so a garbage size can't drive a read/alloc past EOF; + * file_size 0 means "size not yet known" -- skip the check as before */ + const uint64_t file_size = atomic_load_explicit(&bm->current_file_size, memory_order_acquire); + + uint32_t block_size = 0; + uint8_t *payload = bm_read_block_tls(bm->fd, offset, file_size, 1, &block_size); + if (BM_UNLIKELY(!payload)) return NULL; + + block_manager_block_t *block = malloc(sizeof(block_manager_block_t) + block_size); + if (!block) return NULL; + + block->size = block_size; + block->data = (uint8_t *)(block + 1); + block->inline_data = 1; + atomic_init(&block->ref_count, 1); + + memcpy(block->data, payload, block_size); + return block; +} + +block_manager_block_t 
*block_manager_cursor_read(block_manager_cursor_t *cursor) +{ + if (!cursor) return NULL; + + block_manager_block_t *block = + block_manager_read_block_at_offset(cursor->bm, cursor->current_pos); + if (block) + { + /* we cache block size so cursor_next skips the pread for size header */ + cursor->current_block_size = block->size; + cursor->block_size_valid = 1; + } + return block; +} + +block_manager_block_t *block_manager_cursor_read_partial(block_manager_cursor_t *cursor, + const size_t max_bytes) +{ + if (!cursor) return NULL; + if (max_bytes == 0) return block_manager_cursor_read(cursor); + + block_manager_t *bm = cursor->bm; + const uint64_t offset = cursor->current_pos; + + /* we use cached block size to avoid redundant pread syscall */ + uint32_t block_size; + if (cursor->block_size_valid && cursor->current_block_size > 0) + { + block_size = (uint32_t)cursor->current_block_size; + } + else + { + unsigned char size_buf[BLOCK_MANAGER_SIZE_FIELD_SIZE]; + if (pread(bm->fd, size_buf, BLOCK_MANAGER_SIZE_FIELD_SIZE, (off_t)offset) != + BLOCK_MANAGER_SIZE_FIELD_SIZE) + return NULL; + block_size = decode_uint32_le_compat(size_buf); + if (block_size == 0) return NULL; + } + + /* if block is smaller than max_bytes, we read full block */ + if (block_size <= max_bytes) + { + return block_manager_read_block_at_offset(bm, offset); + } + + block_manager_block_t *block = malloc(sizeof(block_manager_block_t)); + if (!block) return NULL; + + block->size = max_bytes; + atomic_init(&block->ref_count, 1); + block->inline_data = 0; + block->data = malloc(max_bytes); + if (!block->data) + { + free(block); + return NULL; + } + + /* we read only first max_bytes of data */ + const off_t data_pos = (off_t)offset + (off_t)BLOCK_MANAGER_BLOCK_HEADER_SIZE; + if (pread(bm->fd, block->data, max_bytes, data_pos) != (ssize_t)max_bytes) + { + free(block->data); + free(block); + return NULL; + } + + /* we don't verify checksum for partial reads since we don't have full data */ + return block; 
+} + +block_manager_block_t *block_manager_cursor_read_and_advance(block_manager_cursor_t *cursor) +{ + if (!cursor) return NULL; + + block_manager_block_t *block = + block_manager_read_block_at_offset(cursor->bm, cursor->current_pos); + if (!block) return NULL; + + /* we advance cursor using the block size we just read, avoiding redundant pread */ + cursor->current_pos += + BLOCK_MANAGER_BLOCK_HEADER_SIZE + block->size + BLOCK_MANAGER_FOOTER_SIZE; + cursor->current_block_size = 0; + cursor->block_size_valid = 0; /* invalidate cache -- we moved to a new position */ + cursor->block_index++; + + return block; +} + +void block_manager_cursor_free(block_manager_cursor_t *cursor) +{ + if (cursor) + { + free(cursor); + } +} + +int block_manager_cursor_prev(block_manager_cursor_t *cursor) +{ + if (!cursor) return -1; + + /* we are already at first block position, we can't go back */ + if (cursor->current_pos <= BLOCK_MANAGER_HEADER_SIZE) return -1; + + const uint64_t prev_footer_end = cursor->current_pos; + if (prev_footer_end < + BLOCK_MANAGER_HEADER_SIZE + BLOCK_MANAGER_BLOCK_HEADER_SIZE + BLOCK_MANAGER_FOOTER_SIZE) + { + return -1; /* not enough space for a valid previous block */ + } + + unsigned char footer_buf[BLOCK_MANAGER_FOOTER_SIZE]; + const off_t footer_offset = (off_t)(prev_footer_end - BLOCK_MANAGER_FOOTER_SIZE); + if (pread(cursor->bm->fd, footer_buf, BLOCK_MANAGER_FOOTER_SIZE, footer_offset) != + BLOCK_MANAGER_FOOTER_SIZE) + { + return -1; + } + + const uint32_t prev_block_size = decode_uint32_le_compat(footer_buf); + const uint32_t footer_magic = + decode_uint32_le_compat(footer_buf + BLOCK_MANAGER_CHECKSUM_LENGTH); + + /* we validate footer magic */ + if (footer_magic != BLOCK_MANAGER_FOOTER_MAGIC || prev_block_size == 0) + { + return -1; + } + + /* we calculate start of previous block */ + const uint64_t prev_total_size = + BLOCK_MANAGER_BLOCK_HEADER_SIZE + prev_block_size + BLOCK_MANAGER_FOOTER_SIZE; + if (cursor->current_pos < prev_total_size) + { + 
return -1; /* invalid -- would underflow */ + } + + const uint64_t prev_block_start = cursor->current_pos - prev_total_size; + if (prev_block_start < BLOCK_MANAGER_HEADER_SIZE) + { + return -1; /* invalid -- before file header */ + } + + cursor->current_pos = prev_block_start; + cursor->current_block_size = prev_block_size; + cursor->block_size_valid = 1; /* we know the size from footer */ + cursor->block_index--; + + return 0; +} + +int block_manager_cursor_goto_first(block_manager_cursor_t *cursor) +{ + if (!cursor) return -1; + + cursor->current_pos = BLOCK_MANAGER_HEADER_SIZE; + cursor->current_block_size = 0; + cursor->block_index = -1; + cursor->block_size_valid = 0; + + return 0; +} + +int block_manager_cursor_goto_last_before(block_manager_cursor_t *cursor, const uint64_t end_offset) +{ + if (!cursor) return -1; + + if (end_offset <= BLOCK_MANAGER_HEADER_SIZE) + { + return -1; + } + + /* we read footer of last block to get its size */ + unsigned char footer_buf[BLOCK_MANAGER_FOOTER_SIZE]; + const off_t footer_offset = (off_t)(end_offset - BLOCK_MANAGER_FOOTER_SIZE); + const ssize_t n = pread(cursor->bm->fd, footer_buf, BLOCK_MANAGER_FOOTER_SIZE, footer_offset); + + if (n != BLOCK_MANAGER_FOOTER_SIZE) + { + return -1; + } + + const uint32_t block_size = decode_uint32_le_compat(footer_buf); + const uint32_t footer_magic = + decode_uint32_le_compat(footer_buf + BLOCK_MANAGER_CHECKSUM_LENGTH); + + /* we verify footer magic */ + if (footer_magic != BLOCK_MANAGER_FOOTER_MAGIC || block_size == 0) + { + return -1; + } + + /* we calculate start position of last block */ + const uint64_t total_block_size = + BLOCK_MANAGER_BLOCK_HEADER_SIZE + block_size + BLOCK_MANAGER_FOOTER_SIZE; + if (end_offset < total_block_size) + { + return -1; + } + + cursor->current_pos = end_offset - total_block_size; + cursor->current_block_size = block_size; + cursor->block_size_valid = 1; /* we know the size from footer */ + cursor->block_index = -1; /* unknown index */ + + return 0; +} + 
+int block_manager_cursor_goto_last(block_manager_cursor_t *cursor) +{ + if (!cursor) return -1; + + /* O(1) seek to end and work backwards using footer */ + const uint64_t file_size = atomic_load(&cursor->bm->current_file_size); + return block_manager_cursor_goto_last_before(cursor, file_size); +} + +int block_manager_truncate(block_manager_t *bm) +{ + if (!bm) return -1; + + /* we truncate to header-only (preserves valid header, single sync) */ + if (truncate_to_header(bm) != 0) return -1; + + /* reopen the fd so any stale O_APPEND/seek state is reset and the descriptor + * reflects the freshly truncated file (caller must have quiesced readers) */ + if (reopen_fd(bm) != 0) return -1; + + return 0; +} + +int block_manager_cursor_at_first(block_manager_cursor_t *cursor) +{ + if (!cursor) return -1; + return (cursor->current_pos == BLOCK_MANAGER_HEADER_SIZE) ? 1 : 0; +} + +int block_manager_cursor_at_second(block_manager_cursor_t *cursor) +{ + if (!cursor) return -1; + + /* if at first block, not at second */ + if (cursor->current_pos == BLOCK_MANAGER_HEADER_SIZE) return 0; + + /* we read first block size */ + unsigned char first_size_buf[BLOCK_MANAGER_SIZE_FIELD_SIZE]; + if (pread(cursor->bm->fd, first_size_buf, BLOCK_MANAGER_SIZE_FIELD_SIZE, + (off_t)BLOCK_MANAGER_HEADER_SIZE) != BLOCK_MANAGER_SIZE_FIELD_SIZE) + return -1; + const uint32_t first_block_size = decode_uint32_le_compat(first_size_buf); + if (first_block_size == 0) return -1; + + /* we calculate second block position, first_block_pos + [size][checksum][data][footer] */ + const uint64_t first_total_size = + BLOCK_MANAGER_BLOCK_HEADER_SIZE + (uint64_t)first_block_size + BLOCK_MANAGER_FOOTER_SIZE; + const uint64_t second_block_pos = BLOCK_MANAGER_HEADER_SIZE + first_total_size; + + return (cursor->current_pos == second_block_pos) ? 
1 : 0; +} + +int block_manager_cursor_at_last(block_manager_cursor_t *cursor) +{ + if (!cursor) return -1; + + /* we use cached block size to avoid pread syscall when possible */ + uint32_t block_size; + if (cursor->block_size_valid && cursor->current_block_size > 0) + { + block_size = (uint32_t)cursor->current_block_size; + } + else + { + unsigned char size_buf[BLOCK_MANAGER_SIZE_FIELD_SIZE]; + if (pread(cursor->bm->fd, size_buf, BLOCK_MANAGER_SIZE_FIELD_SIZE, + (off_t)cursor->current_pos) != BLOCK_MANAGER_SIZE_FIELD_SIZE) + return -1; + block_size = decode_uint32_le_compat(size_buf); + if (block_size == 0) return -1; + } + + /* we calculate position after current block, [size][checksum][data][footer] */ + const uint64_t total_block_size = + BLOCK_MANAGER_BLOCK_HEADER_SIZE + block_size + BLOCK_MANAGER_FOOTER_SIZE; + const uint64_t next_block_pos = cursor->current_pos + total_block_size; + + /* we check against cached file size, if there's no room after this block, we're at last */ + const uint64_t file_size = atomic_load(&cursor->bm->current_file_size); + return (next_block_pos >= file_size) ? 
1 : 0; +} + +int block_manager_get_size(block_manager_t *bm, uint64_t *size) +{ + if (!bm || !size) return -1; + *size = atomic_load(&bm->current_file_size); + return 0; +} + +int block_manager_cursor_goto(block_manager_cursor_t *cursor, const uint64_t pos) +{ + if (!cursor) return -1; + + cursor->current_pos = pos; + cursor->block_size_valid = 0; /* we invalidate cache when jumping to arbitrary position */ + cursor->block_index = -1; /* index is unknown after an arbitrary jump */ + return 0; +} + +int block_manager_escalate_fsync(block_manager_t *bm) +{ + if (!bm) return -1; + return fdatasync(bm->fd); +} + +time_t block_manager_last_modified(block_manager_t *bm) +{ + if (!bm) return -1; + + struct STAT_STRUCT st; + if (STAT_FUNC(bm->file_path, &st) != 0) return -1; + return st.st_mtime; +} + +int block_manager_count_blocks(block_manager_t *bm) +{ + if (!bm) return -1; + + const uint64_t file_size = atomic_load(&bm->current_file_size); + if (file_size <= BLOCK_MANAGER_HEADER_SIZE) return 0; + + set_file_sequential_hint(bm->fd); + + /** buffered scan where we read 64KB chunks so thousands of block headers are parsed per + * syscall. we only need the first 4 bytes of each block (size field) to compute the skip + * distance. 
*/ + enum + { + COUNT_BUF = 64 * 1024 + }; + uint8_t *buf = bm_get_read_buf(COUNT_BUF); + if (!buf) + { + /* fallback to per-block pread via cursor */ + block_manager_cursor_t c; + int n = 0; + (void)block_manager_cursor_init_stack(&c, bm); + while (block_manager_cursor_next(&c) == 0) n++; + return n; + } + + int count = 0; + uint64_t pos = BLOCK_MANAGER_HEADER_SIZE; + + while (pos < file_size) + { + size_t want = COUNT_BUF; + if (pos + want > file_size) want = (size_t)(file_size - pos); + + const ssize_t got = pread(bm->fd, buf, want, (off_t)pos); + if (got < (ssize_t)BLOCK_MANAGER_SIZE_FIELD_SIZE) break; + + size_t off = 0; + while (off + BLOCK_MANAGER_SIZE_FIELD_SIZE <= (size_t)got) + { + const uint32_t bsz = decode_uint32_le_compat(buf + off); + if (bsz == 0) return count; + + const size_t total = + BLOCK_MANAGER_BLOCK_HEADER_SIZE + (size_t)bsz + BLOCK_MANAGER_FOOTER_SIZE; + + if (off + total > (size_t)got) + { + /* block straddles buffer edge, we break to re-read from this block's start */ + break; + } + + off += total; + count++; + } + + /** we advance file position by bytes consumed. + * if off == 0, one block is larger than the buffer, we read its size and skip. 
*/ + if (off == 0) + { + const uint32_t bsz = decode_uint32_le_compat(buf); + pos += BLOCK_MANAGER_BLOCK_HEADER_SIZE + (uint64_t)bsz + BLOCK_MANAGER_FOOTER_SIZE; + count++; + } + else + { + pos += off; + } + } + + return count; +} + +int block_manager_validate_last_block(block_manager_t *bm, + const tidesdb_block_validation_mode_t validation) +{ + if (!bm) return -1; + + uint64_t file_size; + if (get_file_size(bm->fd, &file_size) != 0) return -1; + + atomic_store(&bm->current_file_size, file_size); + atomic_store(&bm->preallocated_size, file_size); + + /* if file is empty, we write header */ + if (file_size == 0) + { + if (write_header(bm->fd) != 0) + { + return -1; + } + if (is_sync_full(bm) && !odsync_available()) + { + fdatasync(bm->fd); + } + return 0; + } + + if (file_size == BLOCK_MANAGER_HEADER_SIZE) + { + return 0; /* valid empty file with header */ + } + + /* we must ensure we have at least header + minimum block */ + const uint64_t min_block_size = BLOCK_MANAGER_BLOCK_HEADER_SIZE + BLOCK_MANAGER_FOOTER_SIZE; + if (file_size < BLOCK_MANAGER_HEADER_SIZE + min_block_size) + { + if (validation == BLOCK_MANAGER_STRICT_BLOCK_VALIDATION) + { + return -1; + } + return truncate_to_header(bm); + } + + /* O(1) validation, we read footer of last block */ + unsigned char footer_buf[BLOCK_MANAGER_FOOTER_SIZE]; + const off_t footer_offset = (off_t)(file_size - BLOCK_MANAGER_FOOTER_SIZE); + const ssize_t n = pread(bm->fd, footer_buf, BLOCK_MANAGER_FOOTER_SIZE, footer_offset); + + if (n != BLOCK_MANAGER_FOOTER_SIZE) + { + if (validation == BLOCK_MANAGER_STRICT_BLOCK_VALIDATION) + { + /* strict mode -- can't read footer = corruption */ + return -1; + } + /* permissive mode -- truncate to header */ + return truncate_to_header(bm); + } + + const uint32_t footer_size = decode_uint32_le_compat(footer_buf); + const uint32_t footer_magic = + decode_uint32_le_compat(footer_buf + BLOCK_MANAGER_CHECKSUM_LENGTH); + + /* we check if footer is valid */ + if (footer_magic != 
BLOCK_MANAGER_FOOTER_MAGIC) + { + /*** the trailing region might be preallocation tail (zeros from fallocate after + ** the last valid block) rather than corruption. forward-scan to find the actual + * data extent, then check whether the trailing region is all zeros to decide. */ + uint64_t scan_pos = BLOCK_MANAGER_HEADER_SIZE; + uint64_t valid_size = BLOCK_MANAGER_HEADER_SIZE; + int hit_corruption = 0; /* 1 = found non-zero garbage or partial block */ + + while (scan_pos + min_block_size <= file_size) + { + unsigned char size_buf[BLOCK_MANAGER_SIZE_FIELD_SIZE]; + if (pread(bm->fd, size_buf, BLOCK_MANAGER_SIZE_FIELD_SIZE, (off_t)scan_pos) != + BLOCK_MANAGER_SIZE_FIELD_SIZE) + { + hit_corruption = 1; + break; + } + + const uint32_t block_size = decode_uint32_le_compat(size_buf); + if (block_size == 0) break; /* end of data; tail is either prealloc or hole */ + + const uint64_t total_block_size = + BLOCK_MANAGER_BLOCK_HEADER_SIZE + block_size + BLOCK_MANAGER_FOOTER_SIZE; + if (scan_pos + total_block_size > file_size) + { + hit_corruption = 1; /* declared size overruns file */ + break; + } + + /* we verify footer of this block */ + const off_t block_footer_offset = + (off_t)(scan_pos + total_block_size - BLOCK_MANAGER_FOOTER_SIZE); + if (pread(bm->fd, footer_buf, BLOCK_MANAGER_FOOTER_SIZE, block_footer_offset) != + BLOCK_MANAGER_FOOTER_SIZE) + { + hit_corruption = 1; + break; + } + + const uint32_t block_footer_size = decode_uint32_le_compat(footer_buf); + const uint32_t block_footer_magic = + decode_uint32_le_compat(footer_buf + BLOCK_MANAGER_CHECKSUM_LENGTH); + + if (block_footer_magic != BLOCK_MANAGER_FOOTER_MAGIC || block_footer_size != block_size) + { + hit_corruption = 1; + break; + } + + valid_size = scan_pos + total_block_size; + scan_pos += total_block_size; + } + + /* if we stopped without explicit corruption, verify the trailing region is + * all zeros -- that confirms it's preallocation tail, not a partial write. 
*/ + const int trailing_zero = + hit_corruption ? 0 : is_trailing_zero(bm->fd, valid_size, file_size); + + if (validation == BLOCK_MANAGER_STRICT_BLOCK_VALIDATION) + { + if (hit_corruption || trailing_zero != 1) return -1; + /* preallocation tail is legitimate; don't truncate, just record true extent */ + atomic_store(&bm->current_file_size, valid_size); + return 0; + } + + /* permissive mode -- truncate trailing garbage OR preallocation tail so + * the file is always self-describing on next open */ + if (valid_size != file_size) + { + if (ftruncate(bm->fd, (off_t)valid_size) != 0) return -1; + + if (is_sync_full(bm)) + { + fdatasync(bm->fd); + } + + if (reopen_fd(bm) != 0) return -1; + atomic_store(&bm->current_file_size, valid_size); + atomic_store(&bm->preallocated_size, valid_size); + } + + return 0; + } + + /* the footer magic is valid, we verify size matches header */ + const uint64_t min_required = + (uint64_t)BLOCK_MANAGER_FOOTER_SIZE + footer_size + BLOCK_MANAGER_BLOCK_HEADER_SIZE; + if (file_size < min_required + BLOCK_MANAGER_HEADER_SIZE) + { + if (validation == BLOCK_MANAGER_STRICT_BLOCK_VALIDATION) + { + /*** strict mode -- invalid block position = corruption */ + return -1; + } + /*** permissive mode -- truncate to header */ + return truncate_to_header(bm); + } + + const uint64_t block_start = file_size - min_required; + + unsigned char header_size_buf[BLOCK_MANAGER_SIZE_FIELD_SIZE]; + if (pread(bm->fd, header_size_buf, BLOCK_MANAGER_SIZE_FIELD_SIZE, (off_t)block_start) != + BLOCK_MANAGER_SIZE_FIELD_SIZE) + { + /* we cant read block header = I/O error (fail in both modes) */ + return -1; + } + + const uint32_t header_size = decode_uint32_le_compat(header_size_buf); + if (header_size != footer_size) + { + /* size mismatch = corruption (fail in both modes, this is unrecoverable) */ + return -1; + } + + /* the last block is valid, no truncation needed */ + return 0; +} + +/* + * convert_sync_mode + * converts tidesdb sync mode to block manager sync mode 
+ * @param tdb_sync_mode the tidesdb sync mode + * @return the corresponding block manager sync mode + */ +block_manager_sync_mode_t convert_sync_mode(const int tdb_sync_mode) +{ + switch (tdb_sync_mode) + { + case 0: + return BLOCK_MANAGER_SYNC_NONE; + case 1: + return BLOCK_MANAGER_SYNC_FULL; + default: + return BLOCK_MANAGER_SYNC_NONE; + } +} + +void block_manager_set_sync_mode(block_manager_t *bm, const int sync_mode) +{ + if (!bm) return; + bm->sync_mode = convert_sync_mode(sync_mode); + bm->sync_full_cached = (bm->sync_mode == BLOCK_MANAGER_SYNC_FULL); +} + +int block_manager_get_block_size_at_offset(block_manager_t *bm, const uint64_t offset, + uint32_t *size) +{ + if (!bm || !size) return -1; + + /* we read the size field from block header */ + unsigned char size_buf[BLOCK_MANAGER_SIZE_FIELD_SIZE]; + const ssize_t nread = pread(bm->fd, size_buf, BLOCK_MANAGER_SIZE_FIELD_SIZE, (off_t)offset); + if (nread != BLOCK_MANAGER_SIZE_FIELD_SIZE) + { + return -1; + } + + *size = decode_uint32_le_compat(size_buf); + if (*size == 0) return -1; /* invalid block */ + + return 0; +} + +int block_manager_read_at_offset(block_manager_t *bm, const uint64_t offset, const size_t size, + uint8_t *data) +{ + if (!bm || !data || size == 0) return -1; + + /* we do a simple pread at the specified offset */ + const ssize_t nread = pread(bm->fd, data, size, (off_t)offset); + if (nread != (ssize_t)size) + { + return -1; + } + + return 0; +} + +int block_manager_read_block_data_at_offset(block_manager_t *bm, const uint64_t offset, + uint8_t **data, uint32_t *data_size) +{ + if (!bm || !data || !data_size) return -1; + + /* offset points at a known-good block (vlog lookup), so no extent/budget check; + * the single optimistic pread + checksum verify happen inside the helper */ + uint32_t block_size = 0; + uint8_t *payload = bm_read_block_tls(bm->fd, offset, 0, 0, &block_size); + if (BM_UNLIKELY(!payload)) return -1; + + uint8_t *block_data = malloc(block_size); + if 
(BM_UNLIKELY(!block_data)) return -1; + + memcpy(block_data, payload, block_size); + *data = block_data; + *data_size = block_size; + return 0; +} + +int block_manager_open(block_manager_t **bm, const char *file_path, const int sync_mode) +{ + if (!bm || !file_path) return -1; + return block_manager_open_internal(bm, file_path, convert_sync_mode(sync_mode)); +} diff --git a/storage/tidesdb/libtidesdb/src/block_manager.h b/storage/tidesdb/libtidesdb/src/block_manager.h new file mode 100644 index 0000000000000..180e6cc4cac86 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/block_manager.h @@ -0,0 +1,541 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __BLOCK_MANAGER_H__ +#define __BLOCK_MANAGER_H__ +#include "compat.h" + +/* max file path length for block manager file(s) */ +#define MAX_FILE_PATH_LENGTH (1024 * 4) + +/* TDB in hex */ +#define BLOCK_MANAGER_MAGIC 0x544442 +/* 3-byte mask for magic number validation */ +#define BLOCK_MANAGER_MAGIC_MASK 0xFFFFFF +#define BLOCK_MANAGER_VERSION 7 + +/* header field sizes */ +/* magic number size in bytes */ +#define BLOCK_MANAGER_MAGIC_SIZE 3 +/* version field size in bytes */ +#define BLOCK_MANAGER_VERSION_SIZE 1 +#define BLOCK_MANAGER_HEADER_SIZE 8 + +/* block field sizes */ +/* block size field (uint32_t) -- supports blocks up to 4GB, though try to keep it under! 
*/ +#define BLOCK_MANAGER_SIZE_FIELD_SIZE 4 +/* xxHash32 = 4 bytes (sufficient for block-level checksums) */ +#define BLOCK_MANAGER_CHECKSUM_LENGTH 4 + +/* block header is now just size + checksum */ +#define BLOCK_MANAGER_BLOCK_HEADER_SIZE \ + (BLOCK_MANAGER_SIZE_FIELD_SIZE + BLOCK_MANAGER_CHECKSUM_LENGTH) + +/* block footer for fast validation -- size + magic */ +#define BLOCK_MANAGER_FOOTER_MAGIC 0x42445442 /* "BTDB" reversed */ +#define BLOCK_MANAGER_FOOTER_SIZE 8 /* 4-byte size + 4-byte magic */ + +/* number of iovecs we emit per block in pwritev ( header, payload, footer ) */ +#define BLOCK_MANAGER_IOVECS_PER_BLOCK 3 + +/* default file permissions (rw-r--r--) */ +#define BLOCK_MANAGER_FILE_MODE 0644 + +/* preallocation tunables -- controls how aggressively we extend on-disk allocation + * ahead of writes to avoid the kernel's file-extending lock on every pwrite. + * extending writes serialize on the per-inode lock (e.g., ext4 i_rwsem) regardless + * of disjoint offsets, so we preallocate in chunks and let pwrites land in-place. 
*/ +#define BLOCK_MANAGER_PREALLOC_CHUNK (64ull * 1024 * 1024) /* extend by 64 MB at a time */ +#define BLOCK_MANAGER_PREALLOC_LOWWATER (4ull * 1024 * 1024) /* trigger extend when 4 MB left */ + +typedef enum +{ + BLOCK_MANAGER_SYNC_NONE, + BLOCK_MANAGER_SYNC_FULL, +} block_manager_sync_mode_t; + +typedef enum +{ + BLOCK_MANAGER_PERMISSIVE_BLOCK_VALIDATION = + 0, /* no error on validation, we truncate to last valid block */ + BLOCK_MANAGER_STRICT_BLOCK_VALIDATION = 1 /* error on validation */ +} tidesdb_block_validation_mode_t; + +/** + * block_manager_t + * block manager struct + * used for block managers in TidesDB + * @param fd the file descriptor the block manager is managing + * @param file_path the path of the file + * @param sync_mode sync mode for this block manager + * @param sync_full_cached cached result of (sync_mode == BLOCK_MANAGER_SYNC_FULL) + * @param current_file_size track file size in memory to avoid syscalls + * @param preallocated_size on-disk allocation high water mark; pwrites within + * [HEADER_SIZE, preallocated_size) avoid extending the file + * and skip the kernel's per-inode write lock fast path. + * set to UINT64_MAX if preallocation is unsupported on this + * platform/fs to disable further attempts. 
+ * @param group_durable_size bytes of the file confirmed fdatasync'd, used by group-commit + * callers to tell whether their write is already durable + * @param group_sync_active set while a group-commit leader is mid-fdatasync on this file + */ +typedef struct +{ + int fd; + char file_path[MAX_FILE_PATH_LENGTH]; + block_manager_sync_mode_t sync_mode; + int sync_full_cached; /* cached result of (sync_mode == BLOCK_MANAGER_SYNC_FULL) */ + /* explicit alignment for atomic uint64_t to avoid ABI issues on 32-bit platforms */ + ATOMIC_ALIGN(8) _Atomic uint64_t current_file_size; + ATOMIC_ALIGN(8) _Atomic uint64_t preallocated_size; + ATOMIC_ALIGN(8) _Atomic uint64_t group_durable_size; + /* atomic so concurrent group-commit leaders don't race on this flag */ + _Atomic int group_sync_active; +} block_manager_t; + +/** + * block_t + * block struct + * used for blocks in TidesDB + * @param size the size of the data in the block + * @param data the data in the block + * @param ref_count atomic reference count for safe concurrent access + * @param inline_data 1 if data is allocated inline with this struct (single allocation) + */ +typedef struct +{ + uint64_t size; + void *data; + _Atomic(uint32_t) ref_count; + uint8_t inline_data; +} block_manager_block_t; + +/** + * block_cursor_t + * block cursor struct + * used for block cursors in TidesDB + * @param bm the block manager + * @param current_pos the current position of the cursor + * @param current_block_size the size of the current block + * @param block_index current index in shared position cache (-1 if before first block) + * @param block_size_valid 1 if current_block_size is cached and valid, 0 otherwise + */ +typedef struct +{ + block_manager_t *bm; + uint64_t current_pos; + uint64_t current_block_size; + int block_index; + int block_size_valid; +} block_manager_cursor_t; + +/** + * block_manager_open + * opens a block manager + * @param bm the block manager to open + * @param file_path the path of the file + * 
@param sync_mode the sync mode (BLOCK_MANAGER_SYNC_NONE, BLOCK_MANAGER_SYNC_FULL) + * @return 0 if successful, -1 if not + */ +int block_manager_open(block_manager_t **bm, const char *file_path, int sync_mode); + +/** + * block_manager_close + * closes a block manager gracefully + * @param bm the block manager to close + * @return 0 if successful, -1 if not + */ +int block_manager_close(block_manager_t *bm); + +/** + * block_manager_block_create + * creates a new block + * @param size the size of the data in block + * @param data the data to be placed in block + * @return a new block + */ +block_manager_block_t *block_manager_block_create(uint64_t size, const void *data); + +/** + * block_manager_block_create_from_buffer + * creates a new block taking ownership of buffer (no copy) + * @param size the size of the data in block + * @param data the data buffer (will be freed with block) + * @return a new block + */ +block_manager_block_t *block_manager_block_create_from_buffer(uint64_t size, void *data); + +/** + * block_manager_block_write + * @param bm the block manager to write the block to + * @param block the block to write + * @return block offset if successful, -1 if not + */ +int64_t block_manager_block_write(block_manager_t *bm, block_manager_block_t *block); + +/** + * block_manager_write_raw + * write raw data directly to the block manager without allocating a block_manager_block_t. + * avoids the malloc/memcpy/free cycle of block_create + block_write + block_release. + * the data pointer only needs to be valid during this call. 
+ * @param bm the block manager + * @param data pointer to the data to write + * @param size size of the data in bytes + * @return the offset where the block was written, or -1 on failure + */ +int64_t block_manager_write_raw(block_manager_t *bm, const void *data, uint32_t size); + +/** + * block_manager_block_write_batch + * writes multiple blocks in a single I/O operation for better performance + * @param bm the block manager to write the blocks to + * @param blocks array of blocks to write + * @param count number of blocks + * @param offsets output array for block offsets (must be pre-allocated with count elements) + * @return number of successfully written blocks, -1 on critical failure + */ +int block_manager_block_write_batch(block_manager_t *bm, block_manager_block_t **blocks, + size_t count, int64_t *offsets); + +/** + * block_manager_write_at + * writes raw bytes at a specific offset (for patching existing data) + * WARNING: use with care -- this bypasses block checksums + * @param bm the block manager + * @param offset the file offset to write at + * @param data the data to write + * @param size the size of data to write + * @return 0 if successful, -1 if not + */ +int block_manager_write_at(block_manager_t *bm, int64_t offset, const uint8_t *data, size_t size); + +/** + * block_manager_update_checksum + * recalculates and updates the checksum of a block after in-place modification + * use this after block_manager_write_at to fix the checksum + * @param bm the block manager + * @param block_offset the file offset of the block (start of block header) + * @return 0 if successful, -1 if not + */ +int block_manager_update_checksum(block_manager_t *bm, int64_t block_offset); + +/** + * block_manager_block_free + * frees a block + * @param block the block to free + */ +void block_manager_block_free(block_manager_block_t *block); + +/** + * block_manager_block_acquire + * increments reference count for a block + * @param block the block to acquire + * @return 1 
if successful, 0 if block is being freed + */ +int block_manager_block_acquire(block_manager_block_t *block); + +/** + * block_manager_block_release + * decrements reference count and frees block when count reaches 0 + * @param block the block to release + */ +void block_manager_block_release(block_manager_block_t *block); + +/** + * block_manager_cursor_init + * initializes a block manager cursor (heap allocated) + * @param cursor the cursor to initialize + * @param bm the block manager to initialize the cursor on + * @return 0 if successful, -1 if not + */ +int block_manager_cursor_init(block_manager_cursor_t **cursor, block_manager_t *bm); + +/** + * block_manager_cursor_init_stack + * initializes a pre-allocated block manager cursor (stack or caller-allocated) + * avoids heap allocation in hot paths + * @param cursor pointer to pre-allocated cursor struct + * @param bm the block manager to initialize the cursor on + * @return 0 if successful, -1 if not + */ +int block_manager_cursor_init_stack(block_manager_cursor_t *cursor, block_manager_t *bm); + +/** + * block_manager_cursor_next + * moves the cursor to the next block + * @param cursor the cursor to move + * @return 0 if successful, -1 if not + */ +int block_manager_cursor_next(block_manager_cursor_t *cursor); + +/** + * block_manager_cursor_read + * reads the block at the cursor current position + * @param cursor the cursor to read from + * @return the block read from the cursor, or NULL if the read fails + */ +block_manager_block_t *block_manager_cursor_read(block_manager_cursor_t *cursor); + +/** + * block_manager_cursor_read_partial + * reads only the first max_bytes of a block at cursor position + * useful for reading header+key without reading large values + * @param cursor the cursor to read from + * @param max_bytes maximum bytes to read (0 = read full block) + * @return the partial block read from the cursor + */ +block_manager_block_t *block_manager_cursor_read_partial(block_manager_cursor_t *cursor, + size_t max_bytes); + +/** + * 
block_manager_cursor_read_and_advance + * reads the block at cursor position and advances cursor to next block in one operation + * this is more efficient than separate read + next calls as it avoids redundant pread + * @param cursor the cursor to read from and advance + * @return the block read from the cursor, NULL on error or EOF + */ +block_manager_block_t *block_manager_cursor_read_and_advance(block_manager_cursor_t *cursor); + +/** + * block_manager_cursor_free + * frees a cursor + * @param cursor the cursor to free + */ +void block_manager_cursor_free(block_manager_cursor_t *cursor); + +/** + * block_manager_cursor_prev + * moves the cursor to the previous block + * @param cursor the cursor to move + * @return 0 if successful, -1 if not + */ +int block_manager_cursor_prev(block_manager_cursor_t *cursor); + +/** + * block_manager_cursor_skip_corrupt + * advances the cursor past a partially-written block at the current position. + * + * distinguishes two failure modes: + * partial write (size > 0, footer magic absent); advances cursor, returns 0. + * genuine corruption (size > 0, footer magic valid but checksum bad), returns -1. + * zero-filled hole (size == 0) -- cannot determine block extent, returns -1. + * + * only call after block_manager_cursor_read returns NULL to attempt recovery. 
+ * @param cursor the cursor positioned at the suspect block + * @return 0 if cursor was advanced past a partial write, -1 otherwise + */ +int block_manager_cursor_skip_corrupt(block_manager_cursor_t *cursor); + +/** + * block_manager_truncate + * truncates a block manager to 0 removing all blocks + * @param bm the block manager to truncate + * @return 0 if successful, -1 if not + */ +int block_manager_truncate(block_manager_t *bm); + +/** + * block_manager_last_modified + * gets the last modified time of a block manager file + * @param bm the block manager to get the last modified time of + * @return the last modified time of the block manager + */ +time_t block_manager_last_modified(block_manager_t *bm); + +/** + * block_manager_count_blocks + * counts the number of blocks in a block managed file + * @param bm the block manager to count the blocks of + * @return the number of blocks in the block manager + */ +int block_manager_count_blocks(block_manager_t *bm); + +/** + * block_manager_cursor_has_next + * checks if the cursor has a next block + * @param cursor the cursor to check + * @return 1 if the cursor has a next block, 0 if not. Can return -1 if error + */ +int block_manager_cursor_has_next(block_manager_cursor_t *cursor); + +/** + * block_manager_cursor_has_prev + * checks if the cursor has a previous block + * @param cursor the cursor to check + * @return 1 if the cursor has a previous block, 0 if not. Can return -1 if error + */ +int block_manager_cursor_has_prev(block_manager_cursor_t *cursor); + +/** + * block_manager_cursor_goto_last + * moves the cursor to the last block + * @param cursor the cursor to move + * @return 0 if successful, -1 if not + */ +int block_manager_cursor_goto_last(block_manager_cursor_t *cursor); + +/** + * block_manager_cursor_goto_last_before + * moves the cursor to the last block whose footer ends at end_offset, using + * footer-based O(1) positioning. lets callers seek to the last block of a + * logical region (e.g. 
an sstable's data blocks) without walking past + * trailing blocks appended after that region. + * @param cursor the cursor to move + * @param end_offset byte offset immediately after the target block's footer + * @return 0 if successful, -1 if not + */ +int block_manager_cursor_goto_last_before(block_manager_cursor_t *cursor, uint64_t end_offset); + +/** + * block_manager_cursor_goto + * moves the cursor to a specific block + * @param cursor the cursor to move + * @param pos the position to move the cursor to + * @return 0 if successful, -1 if not + */ +int block_manager_cursor_goto(block_manager_cursor_t *cursor, uint64_t pos); + +/** + * block_manager_cursor_goto_first + * moves the cursor to the first block + * @param cursor the cursor to move + * @return 0 if successful, -1 if not + */ +int block_manager_cursor_goto_first(block_manager_cursor_t *cursor); + +/** + * block_manager_get_size + * gets the total size of a block manager file + * @param bm the block manager to get the size of + * @param size the size of the block manager + * @return 0 if successful, -1 if not + */ +int block_manager_get_size(block_manager_t *bm, uint64_t *size); + +/** + * block_manager_escalate_fsync + * escalates an fsync syscall to the underlying block manager file + * @param bm the block manager to fsync + * @return 0 if successful, -1 if not + */ +int block_manager_escalate_fsync(block_manager_t *bm); + +/** + * block_manager_cursor_at_last + * checks if the cursor is at the last block + * @param cursor the cursor to check + * @return 1 if the cursor is at the last block, 0 if not. can return -1 if error + */ +int block_manager_cursor_at_last(block_manager_cursor_t *cursor); + +/** + * block_manager_cursor_at_first + * checks if the cursor is at the first block + * @param cursor the cursor to check + * @return 1 if the cursor is at the first block, 0 if not. 
can return -1 if error + */ +int block_manager_cursor_at_first(block_manager_cursor_t *cursor); + +/** + * block_manager_cursor_at_second + * checks if the cursor is at the second block from start + * @param cursor the cursor to check + * @return 1 if the cursor is at the second block, 0 if not. can return -1 if error + */ +int block_manager_cursor_at_second(block_manager_cursor_t *cursor); + +/** + * block_manager_validate_last_block + * validates the integrity of the last block in a block manager file + * @param bm the block manager + * @param validation the type of validation to apply, either strict or permissive + * @return 0 if valid or successfully recovered, -1 if validation fails + * + * In strict mode -- any corruption returns -1, file is not modified + * In permissive mode -- truncates to last valid block on corruption + */ +int block_manager_validate_last_block(block_manager_t *bm, + tidesdb_block_validation_mode_t validation); + +/** + * block_manager_set_max_safe_block_bytes + * sets a process-wide upper bound (bytes) on the size of a single block the + * reader will allocate. a block whose claimed size exceeds this budget is + * refused with a warning instead of allocating (graceful degradation, not OOM). + * pushed down from the tidesdb layer (derived from resolved_memory_limit) so the + * read path never makes a memory syscall. 0 disables the memory-based refusal. 
+ * @param bytes the budget in bytes, or 0 to disable + */ +void block_manager_set_max_safe_block_bytes(uint64_t bytes); + +/** + * convert_sync_mode + * converts TidesDB sync mode enum values to block manager sync mode enum values + * this method provides compatibility between the public TidesDB API (which uses + * TDB_SYNC_NONE/TDB_SYNC_FULL) and the internal block manager API (which uses + * BLOCK_MANAGER_SYNC_NONE/BLOCK_MANAGER_SYNC_FULL) + * @param tdb_sync_mode the TidesDB sync mode (TDB_SYNC_NONE=0, TDB_SYNC_FULL=1) + * @return the corresponding block manager sync mode enum value + */ +block_manager_sync_mode_t convert_sync_mode(int tdb_sync_mode); + +/** + * block_manager_set_sync_mode + * updates the sync mode of an existing block manager + * @param bm the block manager to update + * @param sync_mode the new sync mode (TDB_SYNC_NONE=0, TDB_SYNC_FULL=1) + */ +void block_manager_set_sync_mode(block_manager_t *bm, int sync_mode); + +/** + * block_manager_get_block_size_at_offset + * reads the size of a block at a specific file offset + * useful for determining allocation size before reading block data + * @param bm the block manager to read from + * @param offset the file offset of the block (start of block header) + * @param size output parameter for block data size (not including header) + * @return 0 if successful, -1 if not + */ +int block_manager_get_block_size_at_offset(block_manager_t *bm, uint64_t offset, uint32_t *size); + +/** + * block_manager_read_at_offset + * reads data at a specific file offset (not block-aligned) + * useful for reading values from vlog where offset points to data within a block + * @param bm the block manager to read from + * @param offset the file offset to read from (absolute position in file) + * @param size the number of bytes to read + * @param data output buffer (caller must allocate) + * @return 0 if successful, -1 if not + */ +int block_manager_read_at_offset(block_manager_t *bm, uint64_t offset, size_t size, uint8_t 
*data); + +/** + * block_manager_read_block_data_at_offset + * reads a complete block (header + data) at a specific file offset in one I/O operation + * optimized for vlog reads -- combines size lookup and data read into single pread + * @param bm the block manager to read from + * @param offset the file offset of the block (start of block header) + * @param data output buffer pointer (allocated by function, caller must free) + * @param data_size output parameter for actual data size (not including header) + * @return 0 if successful, -1 if not + */ +int block_manager_read_block_data_at_offset(block_manager_t *bm, uint64_t offset, uint8_t **data, + uint32_t *data_size); + +#endif /* __BLOCK_MANAGER_H__ */ diff --git a/storage/tidesdb/libtidesdb/src/bloom_filter.c b/storage/tidesdb/libtidesdb/src/bloom_filter.c new file mode 100644 index 0000000000000..390324cdbb696 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/bloom_filter.c @@ -0,0 +1,552 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "bloom_filter.h" + +#include +#include + +#define BF_UNLIKELY(x) TDB_UNLIKELY(x) +#define BF_LIKELY(x) TDB_LIKELY(x) + +/* bit manipulation macros for packed bitset */ +#define BF_BITS_PER_WORD 64 +#define BF_WORD_INDEX(bit) ((bit) / BF_BITS_PER_WORD) +#define BF_BIT_INDEX(bit) ((bit) % BF_BITS_PER_WORD) +#define BF_SET_BIT(bitset, bit) ((bitset)[BF_WORD_INDEX(bit)] |= (1ULL << BF_BIT_INDEX(bit))) +#define BF_GET_BIT(bitset, bit) (((bitset)[BF_WORD_INDEX(bit)] >> BF_BIT_INDEX(bit)) & 1ULL) + +/* hash mixing prime (murmur-family). chosen for good avalanche behavior in + * the multiplicative mix below. */ +#define BF_HASH_PRIME 0xc6a4a793u + +/* index-derivation hash versions. version 1 is the original hash. version 2 + * appends a murmur3 fmix32 finalizer so short keys fully avalanche, which + * decorrelates h1/h2 and lowers the false-positive rate on small structured + * keys. the version is stored per filter; a filter is always queried with the + * same hash that built it, so existing on-disk (v1) filters stay correct. */ +#define BF_HASH_VERSION_LEGACY 1u +#define BF_HASH_VERSION_CURRENT 2u +/* serialized v2 filters carry a 0x00 sentinel + version byte. a v1 filter can + * never start with 0x00 because its first field is varint32(m) and m >= 1. */ +#define BF_SERIALIZE_VERSION_SENTINEL 0x00u +#define BF_SERIALIZE_VERSION_BYTES 2 + +/* upper bound on the number of hash functions accepted by bloom_filter_new. + * derived h grows logarithmically with target false-positive rate; even at + * p = 1e-30 the formula yields h ~ 100, so this is a generous sanity ceiling + * to reject pathological configs (negative or absurdly large values from + * floating-point edge cases). typical real-world h is 7-15. 
*/ +#define BF_MAX_HASH_FUNCTIONS 100 + +/* varint worst-case sizes for serialization buffer math */ +#define BF_VARINT32_MAX_BYTES 5 +#define BF_VARINT64_MAX_BYTES 10 +/* serialized header is 3 varint32s -- m, h, non_zero_count */ +#define BF_SERIALIZE_HEADER_MAX_BYTES (3 * BF_VARINT32_MAX_BYTES) +/* each non-zero word is encoded as varint32 index + varint64 value */ +#define BF_SERIALIZE_WORD_MAX_BYTES (BF_VARINT32_MAX_BYTES + BF_VARINT64_MAX_BYTES) + +/* lemire's fast range reduction maps a uniform uint32_t hash into [0, range) + * without integer division. it uses a single 64-bit multiply + shift. + * not a true modulo but produces a uniform distribution, which is all + * a bloom filter needs. */ +static inline uint32_t bf_fast_range(const uint32_t hash, const uint32_t range) +{ + return (uint32_t)(((uint64_t)hash * (uint64_t)range) >> 32); +} + +/** + * bf_hash_inline + * static inline version of bloom_filter_hash for internal use + * allows compiler to inline in hot paths (add/contains) + */ +static inline uint32_t bf_hash_inline(const uint8_t *entry, const size_t size, const uint32_t seed) +{ + const uint32_t prime = BF_HASH_PRIME; + const uint8_t *limit = entry + size; /* one past the end; loops test (limit - entry) so no pointer beyond it is formed */ + uint32_t h = seed ^ ((uint32_t)size * prime); + +#if UINTPTR_MAX == UINT64_MAX + while ((size_t)(limit - entry) >= 8) + { + uint32_t w1, w2; + memcpy(&w1, entry, sizeof(w1)); + memcpy(&w2, entry + 4, sizeof(w2)); + entry += 8; + h += w1; + h *= prime; + h ^= (h >> 16); + h += w2; + h *= prime; + h ^= (h >> 16); + } + if ((size_t)(limit - entry) >= 4) + { + uint32_t w; + memcpy(&w, entry, sizeof(w)); + entry += 4; + h += w; + h *= prime; + h ^= (h >> 16); + } +#else + while ((size_t)(limit - entry) >= 4) + { + uint32_t w; + memcpy(&w, entry, sizeof(w)); + entry += 4; + h += w; + h *= prime; + h ^= (h >> 16); + } +#endif + + switch (limit - entry) + { + case 3: + h += (uint32_t)entry[2] << 16; + /* fall through */ + case 2: + h += (uint32_t)entry[1] << 8; + /* fall through */ + case 1: + h += entry[0]; + h *= prime; + h 
^= (h >> 24); + break; + default: + break; + } + return h; +} + +/* murmur3 fmix32 -- full-avalanche finalizer. applied by the v2 hash so even a + * short key whose base hash had weak mixing produces well-spread index bits. */ +static inline uint32_t bf_fmix32(uint32_t h) +{ + h ^= h >> 16; + h *= 0x85ebca6bu; + h ^= h >> 13; + h *= 0xc2b2ae35u; + h ^= h >> 16; + return h; +} + +/* v2 index hash base hash plus the fmix32 finalizer */ +static inline uint32_t bf_hash_v2_inline(const uint8_t *entry, const size_t size, + const uint32_t seed) +{ + return bf_fmix32(bf_hash_inline(entry, size, seed)); +} + +/* derive the two base hashes for a filter using the hash version it was built with, + * so a filter is always queried with the same scheme that set its bits */ +static inline void bf_derive_hashes(const bloom_filter_t *bf, const uint8_t *entry, + const size_t size, uint32_t *h1, uint32_t *h2) +{ + if (bf->hash_version >= BF_HASH_VERSION_CURRENT) + { + *h1 = bf_hash_v2_inline(entry, size, 0); + *h2 = bf_hash_v2_inline(entry, size, 1); + } + else + { + *h1 = bf_hash_inline(entry, size, 0); + *h2 = bf_hash_inline(entry, size, 1); + } +} + +int bloom_filter_new(bloom_filter_t **bf, double p, const int n) +{ + /* reject non-finite p explicitly -- a NaN slips past the range comparisons + * (all false for NaN) and would reach an undefined (unsigned)NaN cast below */ + if (!isfinite(p) || p <= 0.0 || p >= 1.0 || n <= 0) + { + return -1; + } + + *bf = malloc(sizeof(bloom_filter_t)); + if (*bf == NULL) + { + return -1; + } + + /**** we calculate the size of the bitset (m) using the formula + *** m = -n * ln(p) / (ln(2)^2) + ** + */ + const double m_double = ceil(-((double)n) * log(p) / (M_LN2 * M_LN2)); + + /* we validate m is within valid range */ + if (m_double <= 0.0 || m_double > (double)UINT32_MAX) + { + free(*bf); + *bf = NULL; + return -1; + } + + (*bf)->m = (unsigned int)m_double; + + /* we calculate the number of hash functions (h) using the formula + * h = (m / n) * 
ln(2) + * + */ + const double h_double = ceil(((double)(*bf)->m) / n * M_LN2); + + /* we validate h is reasonable -- typical real-world values are 7-15; + * BF_MAX_HASH_FUNCTIONS rejects pathological configs from FP edge cases */ + if (h_double <= 0.0 || h_double > (double)BF_MAX_HASH_FUNCTIONS) + { + free(*bf); + *bf = NULL; + return -1; + } + + (*bf)->h = (unsigned int)h_double; + + /* we calculate number of 64-bit words needed for packed bitset */ + (*bf)->size_in_words = ((*bf)->m + BF_BITS_PER_WORD - 1) / BF_BITS_PER_WORD; + + /* we validate size_in_words to prevent overflow */ + if ((*bf)->size_in_words == 0 || (*bf)->size_in_words > UINT32_MAX / sizeof(uint64_t)) + { + free(*bf); + *bf = NULL; + return -1; + } + + /* we alloc memory for the packed bitset and initialize it to 0 */ + (*bf)->bitset = calloc((size_t)(*bf)->size_in_words, sizeof(uint64_t)); + if ((*bf)->bitset == NULL) + { + free(*bf); + *bf = NULL; + return -1; + } + + /* freshly built filters use the current (best) index hash */ + (*bf)->hash_version = BF_HASH_VERSION_CURRENT; + + return 0; +} + +void bloom_filter_add(const bloom_filter_t *bf, const uint8_t *entry, const size_t size) +{ + if (BF_UNLIKELY(bf == NULL)) return; + if (BF_UNLIKELY(entry == NULL || size == 0)) return; + + /* we cache struct fields to avoid repeated memory access */ + const unsigned int h = bf->h; + const unsigned int m = bf->m; + uint64_t *const bitset = bf->bitset; + + uint32_t h1, h2; + bf_derive_hashes(bf, entry, size, &h1, &h2); + + for (unsigned int i = 0; i < h; i++) + { + const uint32_t hash = h1 + i * h2; + const uint32_t index = bf_fast_range(hash, m); + BF_SET_BIT(bitset, index); + } +} + +int bloom_filter_contains(const bloom_filter_t *bf, const uint8_t *entry, const size_t size) +{ + if (BF_UNLIKELY(bf == NULL)) return -1; + if (BF_UNLIKELY(entry == NULL || size == 0)) return -1; + + /* we cache struct fields to avoid repeated memory access */ + const unsigned int h = bf->h; + const unsigned int m = 
bf->m; + const uint64_t *const bitset = bf->bitset; + + /* k-mitzenmacher + fast range reduction + * 2 hashes + h cheap probes instead of h full hashes + h divisions */ + uint32_t h1, h2; + bf_derive_hashes(bf, entry, size, &h1, &h2); + + for (unsigned int i = 0; i < h; i++) + { + const uint32_t hash = h1 + i * h2; + const uint32_t index = bf_fast_range(hash, m); + if (BF_LIKELY(!BF_GET_BIT(bitset, index))) + { + return 0; /* definitely not in set */ + } + } + return 1; /* probably in set */ +} + +int bloom_filter_is_full(const bloom_filter_t *bf) +{ + if (BF_UNLIKELY(bf == NULL)) return -1; + if (BF_UNLIKELY(bf->bitset == NULL)) return -1; + + const uint64_t *const bitset = bf->bitset; + const unsigned int size_in_words = bf->size_in_words; + + /*** prevents `size_in_words - 1` from underflowing as unsigned. + ** the constructor rejects size_in_words == 0, but a future refactor or a + * deserialized filter that bypasses the constructor could produce one. */ + if (BF_UNLIKELY(size_in_words == 0)) return -1; + + /* we check if all words are fully set */ + for (unsigned int i = 0; i < size_in_words - 1; i++) + { + if (bitset[i] != UINT64_MAX) + { + return 0; + } + } + + /* we check last word (may be partial) */ + const unsigned int remaining_bits = bf->m % BF_BITS_PER_WORD; + if (remaining_bits == 0) + { + return (bitset[size_in_words - 1] == UINT64_MAX); + } + const uint64_t mask = (1ULL << remaining_bits) - 1; + return ((bitset[size_in_words - 1] & mask) == mask); +} + +unsigned int bloom_filter_hash(const uint8_t *entry, const size_t size, const int seed) +{ + if (BF_UNLIKELY(entry == NULL || size == 0)) return 0; + + return bf_hash_inline(entry, size, (uint32_t)seed); +} + +uint8_t *bloom_filter_serialize(const bloom_filter_t *bf, size_t *out_size) +{ + if (bf == NULL) + { + return NULL; + } + + /* we count non-zero words for sparse encoding */ + unsigned int non_zero_count = 0; + for (unsigned int i = 0; i < bf->size_in_words; i++) + { + if (bf->bitset[i] != 0) 
non_zero_count++; + } + + /* we allocate worst-case size + * -- header 3 varint32s (m, h, non_zero_count) + * -- sparse data each non-zero word = varint32 index + varint64 value + */ + const size_t max_size = BF_SERIALIZE_VERSION_BYTES + BF_SERIALIZE_HEADER_MAX_BYTES + + (size_t)non_zero_count * BF_SERIALIZE_WORD_MAX_BYTES; + uint8_t *buffer = malloc(max_size); + if (buffer == NULL) + { + return NULL; + } + + uint8_t *ptr = buffer; + + /* any non-legacy filter leads with a 0x00 sentinel (impossible for a v1 filter, + * whose first byte is varint32(m) with m >= 1) followed by the hash version + * byte, so deserialize routes the filter back to the hash that built it. keyed + * off "> LEGACY" rather than a specific version so a future bump stays recorded. */ + if (bf->hash_version > BF_HASH_VERSION_LEGACY) + { + *ptr++ = BF_SERIALIZE_VERSION_SENTINEL; + *ptr++ = (uint8_t)bf->hash_version; + } + + /* we write header with varint encoding */ + ptr = encode_varint32(ptr, (uint32_t)bf->m); + ptr = encode_varint32(ptr, (uint32_t)bf->h); + ptr = encode_varint32(ptr, (uint32_t)non_zero_count); + + /* we write sparse bitset -- only non-zero words with their indices */ + for (unsigned int i = 0; i < bf->size_in_words; i++) + { + if (bf->bitset[i] != 0) + { + ptr = encode_varint32(ptr, (uint32_t)i); /* word index */ + ptr = encode_varint64(ptr, bf->bitset[i]); /* word value */ + } + } + + /* we return actual size used, no realloc shrink since the overallocation + * is at most 15 bytes per non-zero word and glibc typically won't release it anyway */ + *out_size = ptr - buffer; + return buffer; +} + +/* bounded varint decoders -- read at most the bytes a 32/64-bit value can occupy + * and never past `end`. return 0 and advance *pp on success, -1 on truncation or a + * malformed (unterminated) varint. these replace the unbounded compat decoders on + * the parse-untrusted-bytes path so a corrupt buffer cannot drive an over-read. 
*/ +static int bf_get_varint32(const uint8_t **pp, const uint8_t *end, uint32_t *out) +{ + uint32_t result = 0; + int shift = 0; + const uint8_t *p = *pp; + for (int i = 0; i < BF_VARINT32_MAX_BYTES; i++) + { + if (p >= end) return -1; + const uint8_t b = *p++; + result |= (uint32_t)(b & 0x7Fu) << shift; + if (!(b & 0x80u)) + { + *pp = p; + *out = result; + return 0; + } + shift += 7; + } + return -1; /* no terminator within the max byte budget */ +} + +static int bf_get_varint64(const uint8_t **pp, const uint8_t *end, uint64_t *out) +{ + uint64_t result = 0; + int shift = 0; + const uint8_t *p = *pp; + for (int i = 0; i < BF_VARINT64_MAX_BYTES; i++) + { + if (p >= end) return -1; + const uint8_t b = *p++; + result |= (uint64_t)(b & 0x7Fu) << shift; + if (!(b & 0x80u)) + { + *pp = p; + *out = result; + return 0; + } + shift += 7; + } + return -1; +} + +bloom_filter_t *bloom_filter_deserialize(const uint8_t *data, const size_t len) +{ + if (data == NULL || len == 0) + { + return NULL; + } + + const uint8_t *ptr = data; + const uint8_t *const end = data + len; + + /* a leading 0x00 marks the versioned format (v1 can never start with 0x00, + * its first field is varint32(m) with m >= 1). absent it, this is a legacy + * v1 filter that must keep being queried with the v1 hash. 
*/ + unsigned int hash_version = BF_HASH_VERSION_LEGACY; + if (ptr[0] == BF_SERIALIZE_VERSION_SENTINEL) + { + if (end - ptr < BF_SERIALIZE_VERSION_BYTES) return NULL; /* sentinel + version */ + ptr++; /* skip sentinel */ + hash_version = (unsigned int)*ptr++; /* read hash version */ + /* reject an unknown version -- querying with an undefined scheme would + * silently produce false negatives on an otherwise valid filter */ + if (hash_version < BF_HASH_VERSION_LEGACY || hash_version > BF_HASH_VERSION_CURRENT) + { + return NULL; + } + } + + /* we read header with bounded varint decoding */ + uint32_t m_u32, h_u32, non_zero_count; + if (bf_get_varint32(&ptr, end, &m_u32) != 0) return NULL; + if (bf_get_varint32(&ptr, end, &h_u32) != 0) return NULL; + if (bf_get_varint32(&ptr, end, &non_zero_count) != 0) return NULL; + + const unsigned int m = m_u32; + const unsigned int h = h_u32; + + /* we validate deserialized values */ + if (m == 0 || h == 0) + { + return NULL; + } + + /* we check for potential integer overflow in size calculation */ + if (m > UINT32_MAX - BF_BITS_PER_WORD) + { + return NULL; + } + + const unsigned int size_in_words = (m + BF_BITS_PER_WORD - 1) / BF_BITS_PER_WORD; + + /* a valid filter never has more non-zero words than total words; reject a + * corrupt count up front so the loop below can't be driven past the buffer */ + if (non_zero_count > size_in_words) + { + return NULL; + } + + /* we allocate and zero-initialize bitset */ + uint64_t *bitset = calloc((size_t)size_in_words, sizeof(uint64_t)); + if (bitset == NULL) + { + return NULL; + } + + /* we read sparse bitset -- only non-zero words */ + for (uint32_t i = 0; i < non_zero_count; i++) + { + uint32_t index; + uint64_t value; + if (bf_get_varint32(&ptr, end, &index) != 0 || bf_get_varint64(&ptr, end, &value) != 0) + { + free(bitset); + return NULL; + } + + /* we validate index is within bounds */ + if (index >= (uint32_t)size_in_words) + { + free(bitset); + return NULL; + } + + bitset[index] = 
value; + } + + bloom_filter_t *bf = malloc(sizeof(bloom_filter_t)); + if (bf == NULL) + { + free(bitset); + return NULL; + } + + bf->m = m; + bf->h = h; + bf->bitset = bitset; + bf->size_in_words = size_in_words; + bf->hash_version = hash_version; + + return bf; +} + +void bloom_filter_free(bloom_filter_t *bf) +{ + if (bf == NULL) + { + return; + } + + free(bf->bitset); + free(bf); +} diff --git a/storage/tidesdb/libtidesdb/src/bloom_filter.h b/storage/tidesdb/libtidesdb/src/bloom_filter.h new file mode 100644 index 0000000000000..b2a22ea4842f7 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/bloom_filter.h @@ -0,0 +1,125 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __BLOOM_FILTER_H__ +#define __BLOOM_FILTER_H__ +#include "compat.h" + +/** + * bloom_filter_t + * bloom filter struct (optimized with packed bits) + * @param bitset the bloom filter bitset (packed in uint64_t words) + * @param m the size of the bloom filter in bits + * @param h the number of hash functions + * @param size_in_words number of uint64_t words in bitset + * @param hash_version index-derivation hash version 1 = legacy, 2 = fmix-finalized + * (better avalanche / lower FPR on short keys). carried with the + * filter and honored by add/contains so on-disk filters built with + * an older hash keep querying with that same hash (no false negatives). 
+ * + * a filter is single-writer during build (add) and immutable after. + * once frozen it may be queried (contains) concurrently by any number of threads -- + * the query path is pure-read. add() concurrent with add()/contains() is a data race + * (the bitset words are non-atomic read-modify-write) and is not supported. + */ +typedef struct +{ + uint64_t *bitset; + unsigned int m; + unsigned int h; + unsigned int size_in_words; + unsigned int hash_version; +} bloom_filter_t; + +/** + * bloom_filter_new + * creates a new bloom filter + * @param bf the bloom filter to create + * @param p the false positive rate + * @param n the number of elements + * @return 0 if successful, -1 if not + */ +int bloom_filter_new(bloom_filter_t **bf, double p, int n); + +/** + * bloom_filter_add + * adds an entry to the bloom filter + * @param bf the bloom filter to add to + * @param entry the entry to add + * @param size the size of the entry + */ +void bloom_filter_add(const bloom_filter_t *bf, const uint8_t *entry, size_t size); + +/** + * bloom_filter_contains + * checks if an entry is in the bloom filter + * @param bf the bloom filter to check + * @param entry the entry to check + * @param size the size of the entry + * @return 1 if the entry is in the bloom filter, 0 if not + */ +int bloom_filter_contains(const bloom_filter_t *bf, const uint8_t *entry, size_t size); + +/** + * bloom_filter_is_full + * checks if the bloom filter is full + * @param bf the bloom filter to check + * @return 1 if the bloom filter is full, 0 if not + */ +int bloom_filter_is_full(const bloom_filter_t *bf); + +/** + * bloom_filter_hash + * hashes an entry + * @param entry the entry to hash + * @param size the size of the entry + * @param seed the seed for the hash + * @return the hash + */ +unsigned int bloom_filter_hash(const uint8_t *entry, size_t size, int seed); + +/** + * bloom_filter_serialize + * serializes a bloom filter to compact binary format using: + * -- varint encoding for header fields 
(m, h, non_zero_count) + * -- sparse encoding -- only stores non-zero words with their indices + * typical space savings -- 70-90% for low fill rates (< 50%) + * @param bf the bloom filter to serialize + * @param out_size the size of the serialized bloom filter + * @return the serialized bloom filter + */ +uint8_t *bloom_filter_serialize(const bloom_filter_t *bf, size_t *out_size); + +/** + * bloom_filter_deserialize + * deserializes a bloom filter. every field read is bounded by len, so a + * truncated or corrupt buffer is rejected (NULL) rather than over-read. + * @param data the serialized bloom filter + * @param len the length in bytes of the serialized buffer + * @return the deserialized bloom filter, or NULL on malformed/truncated input + */ +bloom_filter_t *bloom_filter_deserialize(const uint8_t *data, size_t len); + +/** + * bloom_filter_free + * frees a bloom filter + * @param bf the bloom filter to free + */ +void bloom_filter_free(bloom_filter_t *bf); + +#endif /* __BLOOM_FILTER_H__ */ \ No newline at end of file diff --git a/storage/tidesdb/libtidesdb/src/btree.c b/storage/tidesdb/libtidesdb/src/btree.c new file mode 100644 index 0000000000000..6538aa636a72e --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/btree.c @@ -0,0 +1,3003 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "btree.h" + +#include + +#include "compress.h" +#include "xxhash.h" + +/* arena alignment in bytes -- every allocation is rounded up so unaligned typed access + * inside an arena slot is safe on platforms that fault on misaligned uint64_t loads */ +#define BTREE_ARENA_ALIGNMENT 8 + +/* upper bound on hex digits for a uint64 -- 16 nibbles, used as the local stack buffer + * by btree_u64_to_hex when building a cache key */ +#define BTREE_U64_HEX_MAX 16 + +/* initial entry capacity of a pending leaf during btree construction; the array doubles + * on overflow so this only sets the smallest meaningful allocation */ +#define BTREE_PENDING_LEAF_INITIAL_CAP 64 + +/* small malloc safety pads added on top of the precomputed est_size in the leaf and + * internal-node serializers, to absorb any conservative undercount without realloc */ +#define BTREE_LEAF_SERIALIZE_SAFETY_PAD 64 +#define BTREE_INTERNAL_SERIALIZE_SAFETY_PAD 32 + +/* fixed-size empty-leaf encoding -- type byte, num_entries=0 varint, prev/next int64 */ +#define BTREE_LEAF_EMPTY_BUF_SIZE 32 + +/* suffix for the temp file uncompressed leaves are staged into before compression */ +#define BTREE_LEAF_STAGE_SUFFIX ".lstmp" + +/* compressed-node block layout written by btree_node_serialize_with_compression and read + * back by btree_node_read_with_compression. 
format is + * [original_size:u32][prev_offset:i64][next_offset:i64][compressed_data] */ +#define BTREE_COMPRESSED_NODE_PREV_OFF 4 +#define BTREE_COMPRESSED_NODE_NEXT_OFF 12 +#define BTREE_COMPRESSED_NODE_HEADER_SIZE 20 + +/** + * varint encoding utilities + * uses LEB128-style encoding -- 7 bits per byte, high bit = continuation + */ + +/** + * btree_varint_size + * returns the size of a varint encoding for a given value + * @param val the value to encode + * @return the size of the varint encoding + */ +static inline size_t btree_varint_size(const uint64_t val) +{ + if (val < (1ULL << 7)) return 1; + if (val < (1ULL << 14)) return 2; + if (val < (1ULL << 21)) return 3; + if (val < (1ULL << 28)) return 4; + if (val < (1ULL << 35)) return 5; + if (val < (1ULL << 42)) return 6; + if (val < (1ULL << 49)) return 7; + if (val < (1ULL << 56)) return 8; + if (val < (1ULL << 63)) return 9; + return 10; +} + +/** + * btree_varint_encode + * encodes a varint value into a buffer + * @param buf the buffer to encode into + * @param val the value to encode + * @return the number of bytes encoded + */ +static inline size_t btree_varint_encode(uint8_t *buf, uint64_t val) +{ + size_t i = 0; + while (val >= 0x80) + { + buf[i++] = (uint8_t)(val | 0x80); + val >>= 7; + } + buf[i++] = (uint8_t)val; + return i; +} + +/** + * btree_varint_decode + * decodes a varint value from a buffer + * @param buf the buffer to decode from + * @param val the value to decode + * @return the number of bytes decoded + */ +static inline size_t btree_varint_decode(const uint8_t *buf, uint64_t *val) +{ + uint64_t result = 0; + size_t shift = 0; + size_t i = 0; + while (buf[i] & 0x80) + { + result |= (uint64_t)(buf[i] & 0x7F) << shift; + shift += 7; + i++; + if (i >= 10) break; + } + result |= (uint64_t)buf[i] << shift; + *val = result; + return i + 1; +} + +/** + * btree_signed_varint_encode + * encodes a signed integer using zigzag encoding then varint + * @param buf the buffer to encode into + * @param 
val the signed value to encode + * @return the number of bytes encoded + */ +static inline size_t btree_signed_varint_encode(uint8_t *buf, const int64_t val) +{ + const uint64_t uval = ((uint64_t)val << 1) ^ (uint64_t)(val >> 63); + return btree_varint_encode(buf, uval); +} + +/** + * btree_signed_varint_decode + * decodes a zigzag-encoded signed varint from a buffer + * @param buf the buffer to decode from + * @param val output parameter for the decoded signed value + * @return the number of bytes decoded + */ +static inline size_t btree_signed_varint_decode(const uint8_t *buf, int64_t *val) +{ + uint64_t uval; + const size_t n = btree_varint_decode(buf, &uval); + *val = (int64_t)((uval >> 1) ^ (~(uval & 1) + 1)); + return n; +} + +/* bounded LEB128 decode for parsing on-disk (untrusted) node bytes in which reads at most + * the bytes remaining before `end` and at most 10. returns bytes consumed, or 0 on + * truncation / overlong encoding so the caller can reject a malformed node. */ +static inline size_t btree_varint_decode_bounded(const uint8_t *buf, const uint8_t *end, + uint64_t *val) +{ + uint64_t result = 0; + size_t shift = 0; + for (size_t i = 0; i < 10; i++) + { + if (buf + i >= end) return 0; + const uint8_t b = buf[i]; + result |= (uint64_t)(b & 0x7F) << shift; + if (!(b & 0x80)) + { + *val = result; + return i + 1; + } + shift += 7; + } + return 0; +} + +static inline size_t btree_signed_varint_decode_bounded(const uint8_t *buf, const uint8_t *end, + int64_t *val) +{ + uint64_t uval; + const size_t n = btree_varint_decode_bounded(buf, end, &uval); + if (n == 0) return 0; + *val = (int64_t)((uval >> 1) ^ (~(uval & 1) + 1)); + return n; +} + +/** + * btree_compute_prefix_len + * computes the common prefix length between two keys + * @param key1 first key data + * @param len1 length of first key + * @param key2 second key data + * @param len2 length of second key + * @return the number of common prefix bytes + */ +static inline size_t 
btree_compute_prefix_len(const uint8_t *key1, const size_t len1, + const uint8_t *key2, const size_t len2) +{ + const size_t min_len = (len1 < len2) ? len1 : len2; + size_t prefix_len = 0; + while (prefix_len < min_len && key1[prefix_len] == key2[prefix_len]) + { + prefix_len++; + } + return prefix_len; +} + +/** + * btree_arena_create + * creates a new arena allocator for bulk memory management + * @return new arena or NULL on failure + */ +btree_arena_t *btree_arena_create(void) +{ + btree_arena_t *arena = calloc(1, sizeof(btree_arena_t)); + if (!arena) return NULL; + + btree_arena_block_t *block = calloc(1, sizeof(btree_arena_block_t)); + if (!block) + { + free(arena); + return NULL; + } + + block->data = malloc(BTREE_ARENA_BLOCK_SIZE); + if (!block->data) + { + free(block); + free(arena); + return NULL; + } + + block->size = BTREE_ARENA_BLOCK_SIZE; + block->used = 0; + block->next = NULL; + + arena->current = block; + arena->blocks = block; + arena->total_allocated = BTREE_ARENA_BLOCK_SIZE; + + return arena; +} + +btree_arena_t *btree_arena_create_sized(size_t initial_capacity) +{ + if (initial_capacity < BTREE_ARENA_MIN_BLOCK_SIZE) + initial_capacity = BTREE_ARENA_MIN_BLOCK_SIZE; + + initial_capacity = (initial_capacity + 7) & ~(size_t)7; + + btree_arena_t *arena = malloc(sizeof(btree_arena_t)); + if (!arena) return NULL; + + btree_arena_block_t *block = malloc(sizeof(btree_arena_block_t)); + if (!block) + { + free(arena); + return NULL; + } + + block->data = malloc(initial_capacity); + if (!block->data) + { + free(block); + free(arena); + return NULL; + } + + block->size = initial_capacity; + block->used = 0; + block->next = NULL; + + arena->current = block; + arena->blocks = block; + arena->total_allocated = initial_capacity; + + return arena; +} + +/** + * btree_arena_alloc + * allocates memory from the arena with 8-byte alignment + * @param arena the arena to allocate from + * @param size number of bytes to allocate + * @return pointer to allocated memory 
or NULL on failure + */ +void *btree_arena_alloc(btree_arena_t *arena, size_t size) +{ + if (!arena || size == 0) return NULL; + + size = (size + (BTREE_ARENA_ALIGNMENT - 1)) & ~(size_t)(BTREE_ARENA_ALIGNMENT - 1); + + /* we check if current block has space */ + if (arena->current->used + size <= arena->current->size) + { + void *ptr = arena->current->data + arena->current->used; + arena->current->used += size; + return ptr; + } + + /* we need new block thus we allocate at least BTREE_ARENA_BLOCK_SIZE or size if larger */ + const size_t block_size = (size > BTREE_ARENA_BLOCK_SIZE) ? size : BTREE_ARENA_BLOCK_SIZE; + + btree_arena_block_t *block = calloc(1, sizeof(btree_arena_block_t)); + if (!block) return NULL; + + block->data = malloc(block_size); + if (!block->data) + { + free(block); + return NULL; + } + + block->size = block_size; + block->used = size; + block->next = arena->blocks; + arena->blocks = block; + arena->current = block; + arena->total_allocated += block_size; + + return block->data; +} + +/** + * btree_arena_destroy + * destroys an arena and frees all associated memory + * @param arena the arena to destroy + */ +void btree_arena_destroy(btree_arena_t *arena) +{ + if (!arena) return; + + btree_arena_block_t *block = arena->blocks; + while (block) + { + btree_arena_block_t *next = block->next; + free(block->data); + free(block); + block = next; + } + + free(arena); +} + +/** + * btree_arena_reset + * resets an arena for reuse without freeing memory + * @param arena the arena to reset + */ +void btree_arena_reset(btree_arena_t *arena) +{ + if (!arena) return; + + btree_arena_block_t *block = arena->blocks; + while (block) + { + block->used = 0; + block = block->next; + } + + arena->current = arena->blocks; +} + +/** + * btree_compare_keys_numeric_inline + * fast inline comparison for 8-byte numeric keys + * @param key1 first key (8 bytes) + * @param key2 second key (8 bytes) + * @return -1 if key1 < key2, 1 if key1 > key2, 0 if equal + */ +static 
inline int btree_compare_keys_numeric_inline(const uint8_t *key1, const uint8_t *key2) +{ + uint64_t v1, v2; + memcpy(&v1, key1, sizeof(uint64_t)); + memcpy(&v2, key2, sizeof(uint64_t)); + return (v1 < v2) ? -1 : (v1 > v2); +} + +#if defined(__GNUC__) || defined(__clang__) +#define BTREE_BSWAP64(x) __builtin_bswap64(x) +#elif defined(_MSC_VER) +#define BTREE_BSWAP64(x) _byteswap_uint64(x) +#else +static inline uint64_t BTREE_BSWAP64(uint64_t x) +{ + return ((x & 0xFF00000000000000ULL) >> 56) | ((x & 0x00FF000000000000ULL) >> 40) | + ((x & 0x0000FF0000000000ULL) >> 24) | ((x & 0x000000FF00000000ULL) >> 8) | + ((x & 0x00000000FF000000ULL) << 8) | ((x & 0x0000000000FF0000ULL) << 24) | + ((x & 0x000000000000FF00ULL) << 40) | ((x & 0x00000000000000FFULL) << 56); +} +#endif +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define BTREE_IS_BIG_ENDIAN 1 +#else +#define BTREE_IS_BIG_ENDIAN 0 +#endif + +/* lexicographic (memcmp-order) compare of two 8-byte keys via byte-swapped integer + * compare -- matches memcmp and the skip_list 8-byte path. distinct from + * btree_compare_keys_numeric_inline, which is native-endian for CMP_NUMERIC. */ +static inline int btree_compare_keys_8_memcmp_inline(const uint8_t *key1, const uint8_t *key2) +{ + uint64_t a, b; + memcpy(&a, key1, sizeof(uint64_t)); + memcpy(&b, key2, sizeof(uint64_t)); +#if !BTREE_IS_BIG_ENDIAN + a = BTREE_BSWAP64(a); + b = BTREE_BSWAP64(b); +#endif + return (a < b) ? 
-1 : (a > b); +} + +/** + * btree_compare_keys_inline + * inline comparator for hot paths + * @param config btree configuration containing comparator settings + * @param key1 first key + * @param key1_size size of first key + * @param key2 second key + * @param key2_size size of second key + * @return negative if key1 < key2, 0 if equal, positive if key1 > key2 + */ +static inline int btree_compare_keys_inline(const btree_config_t *config, const uint8_t *key1, + const size_t key1_size, const uint8_t *key2, + const size_t key2_size) +{ + if (BTREE_LIKELY(config->cmp_type == BTREE_CMP_MEMCMP)) + { + if (BTREE_LIKELY(key1_size == key2_size)) + { + if (key1_size == 8) + { + return btree_compare_keys_8_memcmp_inline(key1, key2); + } + const int cmp = memcmp(key1, key2, key1_size); + return (cmp == 0) ? 0 : ((cmp < 0) ? -1 : 1); + } + return btree_comparator_memcmp(key1, key1_size, key2, key2_size, NULL); + } + + switch (config->cmp_type) + { + case BTREE_CMP_NUMERIC: + return btree_compare_keys_numeric_inline(key1, key2); + case BTREE_CMP_STRING: + return btree_comparator_string(key1, key1_size, key2, key2_size, NULL); + case BTREE_CMP_CUSTOM: + default: + return config->comparator(key1, key1_size, key2, key2_size, config->comparator_ctx); + } +} + +int btree_comparator_memcmp(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx) +{ + (void)ctx; + const size_t min_size = key1_size < key2_size ? key1_size : key2_size; + const int cmp = memcmp(key1, key2, min_size); + if (cmp != 0) return cmp < 0 ? -1 : 1; + return (key1_size < key2_size) ? -1 : (key1_size > key2_size) ? 1 : 0; +} + +int btree_comparator_string(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx) +{ + (void)ctx; + /* length-bounded compare, keys are byte buffers, not guaranteed NUL-terminated. + * strcmp here would read past the buffer on a non-terminated key. 
memcmp over the + * shorter length plus a length tie-break gives the same order as strcmp for + * well-formed C-string keys while staying in bounds. */ + const size_t min_size = key1_size < key2_size ? key1_size : key2_size; + const int cmp = memcmp(key1, key2, min_size); + if (cmp != 0) return cmp < 0 ? -1 : 1; + if (key1_size < key2_size) return -1; + if (key1_size > key2_size) return 1; + return 0; +} + +int btree_comparator_numeric(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx) +{ + (void)key1_size; + (void)key2_size; + (void)ctx; + uint64_t val1, val2; + memcpy(&val1, key1, sizeof(uint64_t)); + memcpy(&val2, key2, sizeof(uint64_t)); + if (val1 < val2) return -1; + if (val1 > val2) return 1; + return 0; +} + +/** + * btree_pending_leaf_t + * a leaf node being built during tree construction + * @param entries array of entry metadata + * @param keys array of key pointers + * @param values array of value pointers + * @param num_entries current number of entries + * @param capacity maximum capacity of arrays + * @param current_size current serialized size estimate + * @param first_key first key in this leaf (for separator) + * @param first_key_size size of first key + * @param last_key last key in this leaf + * @param last_key_size size of last key + */ +typedef struct btree_pending_leaf_t +{ + btree_entry_t *entries; + uint8_t **keys; + uint8_t **values; + uint32_t num_entries; + uint32_t capacity; + size_t current_size; + uint8_t *first_key; + size_t first_key_size; + uint8_t *last_key; + size_t last_key_size; +} btree_pending_leaf_t; + +/** + * btree_level_entry_t + * entry for building internal nodes (separator key + child offset) + * @param key separator key data + * @param key_size size of separator key + * @param child_offset offset of child node in storage + */ +typedef struct btree_level_entry_t +{ + uint8_t *key; + size_t key_size; + int64_t child_offset; +} btree_level_entry_t; + +/** + * btree_builder_t + * 
builder state for constructing B+tree from sorted data + * @param bm block manager for storage + * @param config btree configuration + * @param current_leaf leaf node currently being built + * @param first_leaf_offset offset of first leaf in tree + * @param last_leaf_offset offset of last leaf in tree + * @param prev_leaf_offset offset of previously written leaf + * @param leaf_offsets array of all leaf offsets for backpatching + * @param num_leaf_offsets number of leaf offsets + * @param leaf_offsets_capacity capacity of leaf_offsets array + * @param level_entries entries for building internal nodes + * @param num_level_entries number of level entries + * @param level_entries_capacity capacity of level_entries array + * @param entry_count total number of entries added + * @param node_count total number of nodes written + * @param max_seq maximum sequence number seen + * @param min_key minimum key in tree + * @param min_key_size size of minimum key + * @param max_key maximum key in tree + * @param max_key_size size of maximum key + */ +struct btree_builder_t +{ + block_manager_t *bm; + block_manager_t *leaf_bm; /* uncompressed leaves stage here -- a temp file + * when compression is on, so the real klog never + * keeps the discarded pre-compression copies */ + btree_config_t config; + + btree_pending_leaf_t *current_leaf; + int64_t first_leaf_offset; + int64_t last_leaf_offset; + int64_t prev_leaf_offset; + + int64_t *leaf_offsets; + uint32_t num_leaf_offsets; + uint32_t leaf_offsets_capacity; + + btree_level_entry_t *level_entries; + uint32_t num_level_entries; + uint32_t level_entries_capacity; + + uint64_t entry_count; + uint64_t node_count; + uint64_t max_seq; + uint32_t height; + + uint8_t *min_key; + size_t min_key_size; + uint8_t *max_key; + size_t max_key_size; +}; + +/** + * btree_leaf_serialize + * serializes a leaf node with optimized format: + * -- varint encoding for sizes and metadata + * -- prefix compression for keys + * -- key indirection table for 
O(1) access + * -- delta encoding for sequence numbers + * + * format: + * [type:1][num_entries:varint][prev_offset:8][next_offset:8] + * [key_offsets_table: num_entries * 2 bytes] -- offset from keys_start to each key + * [base_seq:varint][entries: prefix_len:varint, suffix_len:varint, value_size:varint, + * vlog_offset:varint, seq_delta:signed_varint, ttl:signed_varint, + * flags:1] [keys: prefix-compressed][values] + * + * @param leaf the pending leaf to serialize + * @param prev_offset offset of previous leaf node (-1 if first) + * @param next_offset offset of next leaf node (-1 if last) + * @param out output buffer (caller must free) + * @param out_size output size of serialized data + * @return 0 on success, -1 on failure + */ +static int btree_leaf_serialize(const btree_pending_leaf_t *leaf, const int64_t prev_offset, + const int64_t next_offset, uint8_t **out, size_t *out_size) +{ + if (!leaf || !out || !out_size) return -1; + if (leaf->num_entries == 0) + { + /* empty leaf -- minimal format */ + uint8_t *buffer = malloc(BTREE_LEAF_EMPTY_BUF_SIZE); + if (!buffer) return -1; + size_t off = 0; + buffer[off++] = BTREE_NODE_LEAF; + off += btree_varint_encode(buffer + off, 0); + encode_int64_le_compat(buffer + off, prev_offset); + off += 8; + encode_int64_le_compat(buffer + off, next_offset); + off += 8; + *out = buffer; + *out_size = off; + return 0; + } + + /* we compute prefix lengths and compressed key sizes */ + size_t *prefix_lens = malloc(leaf->num_entries * sizeof(size_t)); + size_t *suffix_lens = malloc(leaf->num_entries * sizeof(size_t)); + if (!prefix_lens || !suffix_lens) + { + free(prefix_lens); + free(suffix_lens); + return -1; + } + + /* first key has no prefix compression */ + prefix_lens[0] = 0; + suffix_lens[0] = leaf->entries[0].key_size; + + for (uint32_t i = 1; i < leaf->num_entries; i++) + { + prefix_lens[i] = btree_compute_prefix_len(leaf->keys[i - 1], leaf->entries[i - 1].key_size, + leaf->keys[i], leaf->entries[i].key_size); + 
suffix_lens[i] = leaf->entries[i].key_size - prefix_lens[i]; + } + + /* we find base sequence number (minimum) for delta encoding */ + uint64_t base_seq = leaf->entries[0].seq; + for (uint32_t i = 1; i < leaf->num_entries; i++) + { + if (leaf->entries[i].seq < base_seq) base_seq = leaf->entries[i].seq; + } + + /* we calculate total size needed */ + size_t est_size = 1; /* type */ + est_size += btree_varint_size(leaf->num_entries); /* num_entries */ + est_size += 16; /* prev/next offsets */ + est_size += leaf->num_entries * 2; /* key indirection table */ + est_size += btree_varint_size(base_seq); /* base_seq */ + + size_t keys_total = 0; + size_t values_total = 0; + for (uint32_t i = 0; i < leaf->num_entries; i++) + { + est_size += btree_varint_size(prefix_lens[i]); + est_size += btree_varint_size(suffix_lens[i]); + est_size += btree_varint_size(leaf->entries[i].value_size); + est_size += btree_varint_size(leaf->entries[i].vlog_offset); + const int64_t seq_delta = (int64_t)(leaf->entries[i].seq - base_seq); + est_size += btree_varint_size(((uint64_t)seq_delta << 1) ^ (uint64_t)(seq_delta >> 63)); + est_size += btree_varint_size(((uint64_t)leaf->entries[i].ttl << 1) ^ + (uint64_t)(leaf->entries[i].ttl >> 63)); + est_size += 1; /* flags */ + keys_total += suffix_lens[i]; + if (leaf->entries[i].vlog_offset == 0 && leaf->values[i]) + { + values_total += leaf->entries[i].value_size; + } + } + est_size += keys_total + values_total; + + uint8_t *buffer = malloc(est_size + BTREE_LEAF_SERIALIZE_SAFETY_PAD); + if (!buffer) + { + free(prefix_lens); + free(suffix_lens); + return -1; + } + + size_t off = 0; + + /* header */ + buffer[off++] = BTREE_NODE_LEAF; + off += btree_varint_encode(buffer + off, leaf->num_entries); + encode_int64_le_compat(buffer + off, prev_offset); + off += 8; + encode_int64_le_compat(buffer + off, next_offset); + off += 8; + + /* key indirection table placeholder -- we'll fill this after writing keys */ + const size_t indirection_table_pos = off; + off 
+= leaf->num_entries * 2; + + /* base sequence number */ + off += btree_varint_encode(buffer + off, base_seq); + + /* entry metadata (varint encoded) */ + for (uint32_t i = 0; i < leaf->num_entries; i++) + { + off += btree_varint_encode(buffer + off, prefix_lens[i]); + off += btree_varint_encode(buffer + off, suffix_lens[i]); + off += btree_varint_encode(buffer + off, leaf->entries[i].value_size); + off += btree_varint_encode(buffer + off, leaf->entries[i].vlog_offset); + int64_t seq_delta = (int64_t)(leaf->entries[i].seq - base_seq); + off += btree_signed_varint_encode(buffer + off, seq_delta); + off += btree_signed_varint_encode(buffer + off, leaf->entries[i].ttl); + buffer[off++] = leaf->entries[i].flags; + } + + /* keys (prefix-compressed -- only suffix stored) */ + size_t keys_start = off; + for (uint32_t i = 0; i < leaf->num_entries; i++) + { + /* we write key offset as little-endian uint16. if the keys section exceeds + * 64KB the offset wraps and deserialization will read garbage. 
*/ + const size_t raw_off = off - keys_start; + if (raw_off > UINT16_MAX) + { + free(prefix_lens); + free(suffix_lens); + return -1; + } + const uint16_t key_off = (uint16_t)raw_off; + buffer[indirection_table_pos + i * 2] = (uint8_t)(key_off & 0xFF); + buffer[indirection_table_pos + i * 2 + 1] = (uint8_t)((key_off >> 8) & 0xFF); + memcpy(buffer + off, leaf->keys[i] + prefix_lens[i], suffix_lens[i]); + off += suffix_lens[i]; + } + + /* values (inline only) */ + for (uint32_t i = 0; i < leaf->num_entries; i++) + { + if (leaf->entries[i].vlog_offset == 0 && leaf->values[i]) + { + memcpy(buffer + off, leaf->values[i], leaf->entries[i].value_size); + off += leaf->entries[i].value_size; + } + } + + free(prefix_lens); + free(suffix_lens); + + *out = buffer; + *out_size = off; + return 0; +} + +/** + * btree_internal_serialize + * serializes an internal node with optimized format: + * -- varint encoding for counts and key sizes + * -- delta encoding for child offsets + * -- prefix compression for separator keys + * + * format: + * [type:1][num_keys:varint][base_offset:8][child_offset_deltas:signed_varint*N] + * [key_sizes:varint*(N-1)][keys:prefix-compressed] + * + * @param entries internal node entries + * @param num_entries number of entries + * @param out output parameter for serialized node + * @param out_size output parameter for serialized node size + * @return 0 on success, -1 on failure + */ +static int btree_internal_serialize(const btree_level_entry_t *entries, const uint32_t num_entries, + uint8_t **out, size_t *out_size) +{ + if (!entries || num_entries == 0 || !out || !out_size) return -1; + + const uint32_t num_keys = (num_entries > 1) ? 
num_entries - 1 : 0; + const uint32_t num_children = num_entries; + + /* we estimate size needed */ + size_t est_size = 1; /* type */ + est_size += btree_varint_size(num_keys); /* num_keys */ + est_size += 8; /* base_offset */ + est_size += num_children * 10; /* child offset deltas (worst case) */ + + size_t keys_size = 0; + for (uint32_t i = 1; i < num_entries; i++) + { + est_size += btree_varint_size(entries[i].key_size); + keys_size += entries[i].key_size; + } + est_size += keys_size; + + uint8_t *buffer = malloc(est_size + BTREE_INTERNAL_SERIALIZE_SAFETY_PAD); + if (!buffer) return -1; + + size_t off = 0; + + buffer[off++] = BTREE_NODE_INTERNAL; + off += btree_varint_encode(buffer + off, num_keys); + + /* we base offset is the first child offset */ + const int64_t base_offset = entries[0].child_offset; + encode_int64_le_compat(buffer + off, base_offset); + off += 8; + + /* child offset deltas */ + int64_t prev_offset = base_offset; + for (uint32_t i = 0; i < num_children; i++) + { + const int64_t delta = entries[i].child_offset - prev_offset; + off += btree_signed_varint_encode(buffer + off, delta); + prev_offset = entries[i].child_offset; + } + + /* we separator key sizes (varint) */ + for (uint32_t i = 1; i < num_entries; i++) + { + off += btree_varint_encode(buffer + off, entries[i].key_size); + } + + for (uint32_t i = 1; i < num_entries; i++) + { + memcpy(buffer + off, entries[i].key, entries[i].key_size); + off += entries[i].key_size; + } + + *out = buffer; + *out_size = off; + return 0; +} + +/** + * btree_node_deserialize_arena + * deserializes a node from optimized format using arena allocation + * all memory is allocated from the arena for O(1) bulk deallocation + * @param data node bytes + * @param data_size node size + * @param node output parameter for deserialized node + * @param arena arena allocator to use + * @return 0 on success, -1 on failure + */ +static int btree_node_deserialize_arena(const uint8_t *data, const size_t data_size, + 
btree_node_t **node, btree_arena_t *arena) +{ + if (!data || data_size < 2 || !node || !arena) return -1; + + const uint8_t *const end = data + data_size; + + btree_node_t *n = btree_arena_alloc(arena, sizeof(btree_node_t)); + if (!n) return -1; + memset(n, 0, sizeof(btree_node_t)); + n->arena = arena; + + size_t off = 0; + n->type = data[off++]; /* data_size >= 2 guarantees this byte */ + + /* every read below is bounds-checked against data_size -- on-disk node bytes are + * untrusted (a malformed/truncated node must be rejected, never over-read). on a + * violation the caller destroys the arena, so we just return -1. */ +#define BT_NEED(want) \ + do \ + { \ + if (off > data_size || (size_t)(want) > data_size - off) return -1; \ + } while (0) +#define BT_VARINT(dst) \ + do \ + { \ + const size_t _vn = btree_varint_decode_bounded(data + off, end, &(dst)); \ + if (_vn == 0) return -1; \ + off += _vn; \ + } while (0) +#define BT_SVARINT(dst) \ + do \ + { \ + const size_t _vn = btree_signed_varint_decode_bounded(data + off, end, &(dst)); \ + if (_vn == 0) return -1; \ + off += _vn; \ + } while (0) + + uint64_t num_entries_u64; + BT_VARINT(num_entries_u64); + if (num_entries_u64 > UINT32_MAX) return -1; + n->num_entries = (uint32_t)num_entries_u64; + + if (n->type == BTREE_NODE_LEAF) + { + BT_NEED(16); + n->prev_offset = decode_int64_le_compat(data + off); + off += 8; + n->next_offset = decode_int64_le_compat(data + off); + off += 8; + + if (n->num_entries > 0) + { + const uint32_t ne = n->num_entries; + + /* the indirection table alone needs ne*2 bytes -- reject an ne that can't + * fit before allocating ne-sized arrays */ + BT_NEED((size_t)ne * 2); + + /* single arena alloc for all 4 metadata arrays */ + const size_t entries_sz = ne * sizeof(btree_entry_t); + const size_t keys_ptr_sz = ne * sizeof(uint8_t *); + const size_t key_sizes_sz = ne * sizeof(size_t); + const size_t values_ptr_sz = ne * sizeof(uint8_t *); + const size_t meta_total = entries_sz + keys_ptr_sz + 
key_sizes_sz + values_ptr_sz; + uint8_t *meta_buf = btree_arena_alloc(arena, meta_total); + if (!meta_buf) return -1; + + n->entries = (btree_entry_t *)meta_buf; + n->keys = (uint8_t **)(meta_buf + entries_sz); + n->key_sizes = (size_t *)(meta_buf + entries_sz + keys_ptr_sz); + n->values = (uint8_t **)(meta_buf + entries_sz + keys_ptr_sz + key_sizes_sz); + + /* only values needs zeroing (sparse -- vlog entries have no inline value) */ + memset(n->values, 0, values_ptr_sz); + + /* single arena alloc for all 3 temp arrays (align offsets_sz so size_t arrays + * start on an 8-byte boundary) */ + const size_t offsets_sz = ((ne * sizeof(uint16_t)) + 7) & ~(size_t)7; + const size_t lens_sz = ne * sizeof(size_t); + const size_t temp_total = offsets_sz + lens_sz + lens_sz; + uint8_t *temp_buf = btree_arena_alloc(arena, temp_total); + if (!temp_buf) return -1; + + uint16_t *key_offsets = (uint16_t *)temp_buf; + size_t *prefix_lens = (size_t *)(temp_buf + offsets_sz); + size_t *suffix_lens = (size_t *)(temp_buf + offsets_sz + lens_sz); + + /* we read key indirection table (stored as little-endian uint16) -- bounded + * by the BT_NEED(ne*2) above */ + for (uint32_t i = 0; i < ne; i++) + { + key_offsets[i] = (uint16_t)(data[off] | (data[off + 1] << 8)); + off += 2; + } + + /* we read base sequence number */ + uint64_t base_seq; + BT_VARINT(base_seq); + + /* we read entry metadata */ + for (uint32_t i = 0; i < ne; i++) + { + uint64_t prefix_len, suffix_len, value_size, vlog_offset; + int64_t seq_delta, ttl; + + BT_VARINT(prefix_len); + BT_VARINT(suffix_len); + BT_VARINT(value_size); + BT_VARINT(vlog_offset); + BT_SVARINT(seq_delta); + BT_SVARINT(ttl); + BT_NEED(1); /* flags byte */ + + /* key_size must fit uint32; prefix can't exceed the previous key's + * length (the prefix is copied from it during reconstruction) */ + const uint64_t key_size = prefix_len + suffix_len; + if (key_size > UINT32_MAX) return -1; + if (i == 0 ? 
(prefix_len != 0) : (prefix_len > n->entries[i - 1].key_size)) + return -1; + + prefix_lens[i] = (size_t)prefix_len; + suffix_lens[i] = (size_t)suffix_len; + n->entries[i].key_size = (uint32_t)key_size; + n->entries[i].value_size = (uint32_t)value_size; + n->entries[i].vlog_offset = vlog_offset; + n->entries[i].seq = base_seq + (uint64_t)seq_delta; + n->entries[i].ttl = ttl; + n->entries[i].flags = data[off++]; + n->key_sizes[i] = n->entries[i].key_size; + } + + /* single arena alloc for all key data, then carve up with pointers */ + size_t total_key_bytes = 0; + for (uint32_t i = 0; i < ne; i++) + { + total_key_bytes += ((size_t)n->entries[i].key_size + 7) & ~(size_t)7; + } + + uint8_t *key_buf = btree_arena_alloc(arena, total_key_bytes); + if (!key_buf) return -1; + + /* we reconstruct keys from prefix-compressed format */ + const size_t keys_start = off; + size_t key_buf_off = 0; + for (uint32_t i = 0; i < ne; i++) + { + n->keys[i] = key_buf + key_buf_off; + + /* we copy prefix from previous key (prefix_len validated <= prev key_size) */ + if (i > 0 && prefix_lens[i] > 0) + { + memcpy(n->keys[i], n->keys[i - 1], prefix_lens[i]); + } + + /* we copy suffix from serialized data -- the suffix region must lie + * entirely within the node */ + const size_t suffix_pos = keys_start + key_offsets[i]; + if (suffix_pos > data_size || suffix_lens[i] > data_size - suffix_pos) return -1; + memcpy(n->keys[i] + prefix_lens[i], data + suffix_pos, suffix_lens[i]); + + key_buf_off += ((size_t)n->entries[i].key_size + 7) & ~(size_t)7; + } + + /* we advance past all key data */ + for (uint32_t i = 0; i < ne; i++) + { + off += suffix_lens[i]; + } + if (off > data_size) return -1; /* keys section overran the node */ + + /* single arena alloc for all inline values, then point each into it */ + size_t total_inline_bytes = 0; + for (uint32_t i = 0; i < ne; i++) + { + if (n->entries[i].vlog_offset == 0 && n->entries[i].value_size > 0) + { + total_inline_bytes += n->entries[i].value_size; 
+ if (total_inline_bytes > data_size) return -1; /* cap + overflow guard */ + } + } + + if (total_inline_bytes > 0) + { + BT_NEED(total_inline_bytes); + uint8_t *val_buf = btree_arena_alloc(arena, total_inline_bytes); + if (!val_buf) return -1; + memcpy(val_buf, data + off, total_inline_bytes); + + size_t val_off = 0; + for (uint32_t i = 0; i < ne; i++) + { + if (n->entries[i].vlog_offset == 0 && n->entries[i].value_size > 0) + { + n->values[i] = val_buf + val_off; + val_off += n->entries[i].value_size; + } + } + } + off += total_inline_bytes; + } + } + else if (n->type == BTREE_NODE_INTERNAL) + { + const uint32_t num_keys = n->num_entries; + const uint32_t num_children = num_keys + 1; + + /* single arena alloc for child_offsets + keys ptrs + key_sizes */ + const size_t child_sz = num_children * sizeof(int64_t); + const size_t ikeys_ptr_sz = num_keys * sizeof(uint8_t *); + const size_t ikey_sizes_sz = num_keys * sizeof(size_t); + const size_t internal_total = child_sz + ikeys_ptr_sz + ikey_sizes_sz; + uint8_t *ibuf = btree_arena_alloc(arena, internal_total); + if (!ibuf) return -1; + + n->child_offsets = (int64_t *)ibuf; + n->keys = (num_keys > 0) ? (uint8_t **)(ibuf + child_sz) : NULL; + n->key_sizes = (num_keys > 0) ? 
(size_t *)(ibuf + child_sz + ikeys_ptr_sz) : NULL; + + BT_NEED(8); + int64_t base_offset = decode_int64_le_compat(data + off); + off += 8; + + /* we decode delta-encoded child offsets */ + int64_t prev_offset = base_offset; + for (uint32_t i = 0; i < num_children; i++) + { + int64_t delta; + BT_SVARINT(delta); + n->child_offsets[i] = prev_offset + delta; + prev_offset = n->child_offsets[i]; + } + + /* we read key sizes (varint) */ + for (uint32_t i = 0; i < num_keys; i++) + { + uint64_t key_size; + BT_VARINT(key_size); + if (key_size > UINT32_MAX) return -1; + n->key_sizes[i] = (size_t)key_size; + } + + /* single arena alloc for all separator key data */ + size_t total_ikey_bytes = 0; + for (uint32_t i = 0; i < num_keys; i++) + { + total_ikey_bytes += (n->key_sizes[i] + 7) & ~(size_t)7; + } + + if (total_ikey_bytes > 0) + { + uint8_t *ikey_buf = btree_arena_alloc(arena, total_ikey_bytes); + if (!ikey_buf) return -1; + + size_t ikey_off = 0; + for (uint32_t i = 0; i < num_keys; i++) + { + n->keys[i] = ikey_buf + ikey_off; + BT_NEED(n->key_sizes[i]); + memcpy(n->keys[i], data + off, n->key_sizes[i]); + off += n->key_sizes[i]; + ikey_off += (n->key_sizes[i] + 7) & ~(size_t)7; + } + } + } + +#undef BT_NEED +#undef BT_VARINT +#undef BT_SVARINT + + *node = n; + return 0; +} + +void btree_node_free(btree_node_t *node) +{ + if (!node) return; + + /* for arena-allocated nodes we destroy arena for O(1) bulk deallocation + * for uncached nodes only -- cached nodes use btree_cached_node_release */ + if (node->arena) + { + btree_arena_destroy(node->arena); + return; + } + + if (node->keys) + { + for (uint32_t i = 0; i < node->num_entries; i++) + { + free(node->keys[i]); + } + free(node->keys); + } + + if (node->values) + { + for (uint32_t i = 0; i < node->num_entries; i++) + { + free(node->values[i]); + } + free(node->values); + } + + free(node->entries); + free(node->key_sizes); + free(node->child_offsets); + free(node); +} + +/** + * btree_cached_node_release + * release a 
reference to a cached btree node + * frees the node when the last reference is released + * @param node the cached node to release + */ +static void btree_cached_node_release(btree_node_t *node) +{ + if (!node) return; + if (atomic_fetch_sub_explicit(&node->rc_count, 1, memory_order_acq_rel) == 1) + { + if (node->arena) + { + btree_arena_destroy(node->arena); + } + else + { + btree_node_free(node); + } + } +} + +/** + * btree_node_done + * release a node returned by btree_node_read_cached + * handles both cached (ref-counted) and non-cached (direct free) nodes + * @param node the node to release + * @param cached 1 if node came from cache, 0 if direct read + */ +static inline void btree_node_done(btree_node_t *node, const int cached) +{ + if (!node) return; + if (cached) + btree_cached_node_release(node); + else + btree_node_free(node); +} + +static void btree_node_cache_evict_callback(void *payload, size_t payload_len) +{ + if (payload && payload_len == sizeof(btree_node_t *)) + { + btree_node_t *node; + memcpy(&node, payload, sizeof(btree_node_t *)); + if (node) btree_cached_node_release(node); + } +} + +int btree_node_read(block_manager_t *bm, const int64_t offset, btree_node_t **node) +{ + return btree_node_read_with_compression(bm, offset, node, TDB_COMPRESS_NONE); +} + +int btree_node_read_with_compression(block_manager_t *bm, const int64_t offset, btree_node_t **node, + const int compression_algo) +{ + if (!bm || offset < 0 || !node) return -1; + + block_manager_cursor_t cursor; + if (block_manager_cursor_init_stack(&cursor, bm) != 0) return -1; + + if (block_manager_cursor_goto(&cursor, (uint64_t)offset) != 0) return -1; + + block_manager_block_t *block = block_manager_cursor_read(&cursor); + if (!block) return -1; + + /* we decompress if compression is enabled + * format -- [original_size:4][prev_offset:8][next_offset:8][compressed_data] */ + const uint8_t *data = block->data; + size_t data_size = block->size; + uint8_t *decompressed = NULL; + + if 
(compression_algo != TDB_COMPRESS_NONE && block->size > BTREE_COMPRESSED_NODE_HEADER_SIZE) + { + const uint8_t *block_data = (const uint8_t *)block->data; + const uint32_t original_size = decode_uint32_le_compat(block_data); + const int64_t header_prev_offset = + decode_int64_le_compat(block_data + BTREE_COMPRESSED_NODE_PREV_OFF); + const int64_t header_next_offset = + decode_int64_le_compat(block_data + BTREE_COMPRESSED_NODE_NEXT_OFF); + const uint8_t *compressed_data = block_data + BTREE_COMPRESSED_NODE_HEADER_SIZE; + const size_t compressed_size = block->size - BTREE_COMPRESSED_NODE_HEADER_SIZE; + + size_t decompressed_size; + decompressed = decompress_data(compressed_data, compressed_size, &decompressed_size, + (compression_algorithm)compression_algo); + if (decompressed && decompressed_size == original_size) + { + /* we only patch prev_offset and next_offset for leaf nodes, not internal nodes */ + if (decompressed[0] == BTREE_NODE_LEAF) + { + /* we calculate position -- type(1) + num_entries(varint) */ + size_t pos = 1; + uint64_t num_entries; + pos += btree_varint_decode(decompressed + pos, &num_entries); + /* now pos points to prev_offset -- we write in little-endian format */ + encode_int64_le_compat(decompressed + pos, header_prev_offset); + encode_int64_le_compat(decompressed + pos + 8, header_next_offset); + } + data = decompressed; + data_size = decompressed_size; + } + else + { + free(decompressed); + block_manager_block_free(block); + return -1; + } + } + + /* we use arena allocation to eliminate N+7 individual malloc/free per node read + * btree_node_free will destroy the arena via O(1) bulk deallocation. + * we size the arena to data_size * 2 since deserialized form (pointers, arrays) + * is typically 1-2x the serialized size. avoids 64KB default for small nodes. 
*/ + btree_arena_t *arena = btree_arena_create_sized(data_size * 2); + if (!arena) + { + free(decompressed); + block_manager_block_free(block); + return -1; + } + + const int result = btree_node_deserialize_arena(data, data_size, node, arena); + if (result == 0) + { + (*node)->block_offset = offset; + } + else + { + btree_arena_destroy(arena); + } + + free(decompressed); + block_manager_block_free(block); + return result; +} + +/** + * btree_u64_to_hex + * fast uint64 to hex string conversion (avoids snprintf overhead) + * @param val value to convert + * @param buf output buffer (must be at least 17 bytes) + * @return number of characters written + */ +static inline int btree_u64_to_hex(uint64_t val, char *buf) +{ + static const char hex_chars[] = "0123456789abcdef"; + if (val == 0) + { + buf[0] = '0'; + return 1; + } + char tmp[BTREE_U64_HEX_MAX]; + int len = 0; + while (val > 0) + { + tmp[len++] = hex_chars[val & 0xF]; + val >>= 4; + } + for (int i = 0; i < len; i++) + { + buf[i] = tmp[len - 1 - i]; + } + return len; +} + +int btree_format_cache_key_prefix(const uint64_t cache_key_prefix, char *out) +{ + if (!out) return 0; + int len = btree_u64_to_hex(cache_key_prefix, out); + out[len++] = BTREE_CACHE_KEY_SEPARATOR; + return len; +} + +/** + * btree_node_read_cached + * reads a node with caching support + * caches deserialized nodes directly for maximum performance + * if cache hit, returns pointer to cached node (caller must not free) + * if cache miss, reads from disk, deserializes, and caches + * @param tree btree instance + * @param offset node offset + * @param node output parameter for deserialized node + * @return 0 on success, -1 on failure + */ +static int btree_node_read_cached(btree_t *tree, const int64_t offset, btree_node_t **node) +{ + if (!tree || !tree->bm || offset < 0 || !node) return -1; + + /* if no cache, we fall back to direct read with compression */ + if (!tree->node_cache) + { + return btree_node_read_with_compression(tree->bm, offset, 
node, + tree->config.compression_algo); + } + + char cache_key[BTREE_CACHE_KEY_SIZE]; + int key_len = btree_format_cache_key_prefix(tree->cache_key_prefix, cache_key); + key_len += btree_u64_to_hex((uint64_t)offset, cache_key + key_len); + + size_t cached_size = 0; + clock_cache_entry_t *entry = NULL; + const uint8_t *cached_ptr = clock_cache_get_zero_copy(tree->node_cache, cache_key, + (size_t)key_len, &cached_size, &entry); + + if (cached_ptr && cached_size == sizeof(btree_node_t *)) + { + /* cache hit -- acquire caller ref before releasing cache entry + * this prevents eviction from freeing the node while we use it */ + btree_node_t *cached_node; + memcpy(&cached_node, cached_ptr, sizeof(btree_node_t *)); + atomic_fetch_add_explicit(&cached_node->rc_count, 1, memory_order_relaxed); + clock_cache_release(entry); + *node = cached_node; + return 0; + } + + if (entry) clock_cache_release(entry); + + /* cache miss! we read from disk (block manager handles checksum verification) */ + block_manager_cursor_t cursor; + if (block_manager_cursor_init_stack(&cursor, tree->bm) != 0) return -1; + + if (block_manager_cursor_goto(&cursor, (uint64_t)offset) != 0) return -1; + + block_manager_block_t *block = block_manager_cursor_read(&cursor); + if (!block) return -1; + + /* we decompress if compression is enabled + * format -- [original_size:4][prev_offset:8][next_offset:8][compressed_data] */ + const uint8_t *data = block->data; + size_t data_size = block->size; + uint8_t *decompressed = NULL; + + if (tree->config.compression_algo != TDB_COMPRESS_NONE && block->size > 20) + { + const uint8_t *block_data = (const uint8_t *)block->data; + const uint32_t original_size = decode_uint32_le_compat(block_data); + int64_t header_prev_offset = decode_int64_le_compat(block_data + 4); + int64_t header_next_offset = decode_int64_le_compat(block_data + 12); + const uint8_t *compressed_data = block_data + 20; + const size_t compressed_size = block->size - 20; + + size_t decompressed_size; + 
decompressed = decompress_data(compressed_data, compressed_size, &decompressed_size, + (compression_algorithm)tree->config.compression_algo); + if (decompressed && decompressed_size == original_size) + { + /* we only patch prev_offset and next_offset for leaf nodes, not internal nodes */ + if (decompressed[0] == BTREE_NODE_LEAF) + { + /* we calculate position, type(1) + num_entries(varint) */ + size_t pos = 1; + uint64_t num_entries; + pos += btree_varint_decode(decompressed + pos, &num_entries); + /* now pos points to prev_offset - write in little-endian format */ + encode_int64_le_compat(decompressed + pos, header_prev_offset); + encode_int64_le_compat(decompressed + pos + 8, header_next_offset); + } + data = decompressed; + data_size = decompressed_size; + } + else + { + free(decompressed); + block_manager_block_free(block); + return -1; + } + } + + btree_node_t *new_node = NULL; + btree_arena_t *node_arena = btree_arena_create_sized(data_size * 2); + if (!node_arena) + { + free(decompressed); + block_manager_block_free(block); + return -1; + } + + const int result = btree_node_deserialize_arena(data, data_size, &new_node, node_arena); + free(decompressed); + block_manager_block_free(block); + + if (result != 0) + { + btree_arena_destroy(node_arena); + return -1; + } + + new_node->block_offset = offset; + new_node->arena = node_arena; + + /* rc_count = 2, 1 for cache ownership + 1 for caller */ + atomic_store_explicit(&new_node->rc_count, 2, memory_order_relaxed); + + /* we account for actual memory cost i.e node struct + arena allocations. + * without this the cache treats every node as 0 bytes and never evicts, + * causing unbounded memory growth under btree workloads. 
*/ + const size_t node_cost = sizeof(btree_node_t) + node_arena->total_allocated; + clock_cache_put(tree->node_cache, cache_key, (size_t)key_len, &new_node, sizeof(btree_node_t *), + node_cost); + + *node = new_node; + return 0; +} + +/** + * btree_pending_leaf_create + * creates a new pending leaf for building during tree construction + * @return new pending leaf or NULL on failure + */ +static btree_pending_leaf_t *btree_pending_leaf_create(void) +{ + btree_pending_leaf_t *leaf = calloc(1, sizeof(btree_pending_leaf_t)); + if (!leaf) return NULL; + + leaf->capacity = BTREE_PENDING_LEAF_INITIAL_CAP; + leaf->entries = calloc(leaf->capacity, sizeof(btree_entry_t)); + leaf->keys = calloc(leaf->capacity, sizeof(uint8_t *)); + leaf->values = calloc(leaf->capacity, sizeof(uint8_t *)); + + if (!leaf->entries || !leaf->keys || !leaf->values) + { + free(leaf->entries); + free(leaf->keys); + free(leaf->values); + free(leaf); + return NULL; + } + + return leaf; +} + +/** + * btree_pending_leaf_free + * frees a pending leaf and all associated memory + * @param leaf the pending leaf to free + */ +static void btree_pending_leaf_free(btree_pending_leaf_t *leaf) +{ + if (!leaf) return; + + for (uint32_t i = 0; i < leaf->num_entries; i++) + { + free(leaf->keys[i]); + free(leaf->values[i]); + } + + free(leaf->entries); + free(leaf->keys); + free(leaf->values); + free(leaf->first_key); + free(leaf->last_key); + free(leaf); +} + +/** + * btree_pending_leaf_add + * adds an entry to a pending leaf during tree construction + * @param leaf the pending leaf to add to + * @param key key data + * @param key_size size of key + * @param value value data (may be NULL if vlog_offset > 0) + * @param value_size size of value + * @param vlog_offset offset in value log (0 for inline values) + * @param seq sequence number + * @param ttl time-to-live (-1 for no expiry) + * @param flags entry flags (tombstone, etc.) 
+ * @return 0 on success, -1 on failure + */ +static int btree_pending_leaf_add(btree_pending_leaf_t *leaf, const uint8_t *key, + const size_t key_size, const uint8_t *value, + const size_t value_size, const uint64_t vlog_offset, + const uint64_t seq, const int64_t ttl, const uint8_t flags) +{ + if (leaf->num_entries >= leaf->capacity) + { + const uint32_t new_capacity = leaf->capacity * 2; + btree_entry_t *new_entries = realloc(leaf->entries, new_capacity * sizeof(btree_entry_t)); + uint8_t **new_keys = realloc(leaf->keys, new_capacity * sizeof(uint8_t *)); + uint8_t **new_values = realloc(leaf->values, new_capacity * sizeof(uint8_t *)); + + if (!new_entries || !new_keys || !new_values) + { + return -1; + } + + leaf->entries = new_entries; + leaf->keys = new_keys; + leaf->values = new_values; + leaf->capacity = new_capacity; + + for (uint32_t i = leaf->num_entries; i < new_capacity; i++) + { + leaf->keys[i] = NULL; + leaf->values[i] = NULL; + } + } + + const uint32_t idx = leaf->num_entries; + + leaf->keys[idx] = malloc(key_size); + if (!leaf->keys[idx]) return -1; + memcpy(leaf->keys[idx], key, key_size); + + if (vlog_offset == 0 && value && value_size > 0) + { + leaf->values[idx] = malloc(value_size); + if (!leaf->values[idx]) + { + free(leaf->keys[idx]); + leaf->keys[idx] = NULL; + return -1; + } + memcpy(leaf->values[idx], value, value_size); + } + else + { + leaf->values[idx] = NULL; + } + + leaf->entries[idx].key_size = (uint32_t)key_size; + leaf->entries[idx].value_size = (uint32_t)value_size; + leaf->entries[idx].vlog_offset = vlog_offset; + leaf->entries[idx].seq = seq; + leaf->entries[idx].ttl = ttl; + leaf->entries[idx].flags = flags; + + if (leaf->num_entries == 0) + { + leaf->first_key = malloc(key_size); + if (leaf->first_key) + { + memcpy(leaf->first_key, key, key_size); + leaf->first_key_size = key_size; + } + } + + free(leaf->last_key); + leaf->last_key = malloc(key_size); + if (leaf->last_key) + { + memcpy(leaf->last_key, key, key_size); + 
leaf->last_key_size = key_size; + } + + leaf->current_size += key_size + (vlog_offset == 0 ? value_size : 0) + sizeof(btree_entry_t); + leaf->num_entries++; + + return 0; +} + +int btree_builder_new(btree_builder_t **builder, block_manager_t *bm, const btree_config_t *config) +{ + if (!builder || !bm || !config) return -1; + + btree_builder_t *b = calloc(1, sizeof(btree_builder_t)); + if (!b) return -1; + + b->bm = bm; + b->config = *config; + + if (!b->config.comparator) + { + b->config.comparator = btree_comparator_memcmp; + b->config.cmp_type = BTREE_CMP_MEMCMP; + } + + if (b->config.target_node_size == 0) + { + b->config.target_node_size = BTREE_DEFAULT_NODE_SIZE; + } + + b->current_leaf = btree_pending_leaf_create(); + if (!b->current_leaf) + { + free(b); + return -1; + } + + b->first_leaf_offset = -1; + b->last_leaf_offset = -1; + b->prev_leaf_offset = -1; + + b->leaf_offsets_capacity = 256; + b->leaf_offsets = calloc(b->leaf_offsets_capacity, sizeof(int64_t)); + if (!b->leaf_offsets) + { + btree_pending_leaf_free(b->current_leaf); + free(b); + return -1; + } + + b->level_entries_capacity = 256; + b->level_entries = calloc(b->level_entries_capacity, sizeof(btree_level_entry_t)); + if (!b->level_entries) + { + free(b->leaf_offsets); + btree_pending_leaf_free(b->current_leaf); + free(b); + return -1; + } + + /* uncompressed leaves are staged before compression. with compression on, + * stage them in a temp file so the klog receives only the final compressed + * leaves -- staging them in the klog would leave the discarded uncompressed + * copies behind as permanent dead weight. with compression off the first + * write is already final, so stage straight into the klog. 
*/ + b->leaf_bm = bm; + if (b->config.compression_algo != TDB_COMPRESS_NONE) + { + /* sizeof the suffix literal already includes its null terminator, so this + * holds a full-length file_path plus the suffix without truncation */ + char tmp_path[MAX_FILE_PATH_LENGTH + sizeof(BTREE_LEAF_STAGE_SUFFIX)]; + snprintf(tmp_path, sizeof(tmp_path), "%s" BTREE_LEAF_STAGE_SUFFIX, bm->file_path); + block_manager_t *tmp_bm = NULL; + if (block_manager_open(&tmp_bm, tmp_path, BLOCK_MANAGER_SYNC_NONE) == 0 && + block_manager_truncate(tmp_bm) == 0) + { + b->leaf_bm = tmp_bm; + } + else if (tmp_bm) + { + /* temp file unavailable -- fall back to staging in the klog so the + * build still succeeds (correctness over space) */ + block_manager_close(tmp_bm); + } + } + + *builder = b; + return 0; +} + +/** + * btree_builder_flush_leaf + * flushes the current pending leaf to storage + * @param builder the builder instance + * @return 0 on success, -1 on failure + */ +static int btree_builder_flush_leaf(btree_builder_t *builder) +{ + if (!builder || !builder->current_leaf || builder->current_leaf->num_entries == 0) + { + return 0; + } + + uint8_t *serialized = NULL; + size_t serialized_size = 0; + + if (btree_leaf_serialize(builder->current_leaf, builder->prev_leaf_offset, -1, &serialized, + &serialized_size) != 0) + { + return -1; + } + + /**** leaf nodes are written without compression during build phase + *** because we need to backpatch next_offset links after all leaves are written. + ** compression is applied during the backpatch phase after patching. 
+ * we use from_buffer to transfer ownership and avoid redundant malloc+memcpy */ + block_manager_block_t *block = + block_manager_block_create_from_buffer(serialized_size, serialized); + + if (!block) return -1; + + const int64_t offset = block_manager_block_write(builder->leaf_bm, block); + block_manager_block_free(block); + + if (offset < 0) return -1; + + /* we track leaf offset for bidirectional linking */ + if (builder->num_leaf_offsets >= builder->leaf_offsets_capacity) + { + const uint32_t new_cap = builder->leaf_offsets_capacity * 2; + int64_t *new_offsets = realloc(builder->leaf_offsets, new_cap * sizeof(int64_t)); + if (!new_offsets) return -1; + builder->leaf_offsets = new_offsets; + builder->leaf_offsets_capacity = new_cap; + } + builder->leaf_offsets[builder->num_leaf_offsets++] = offset; + + if (builder->first_leaf_offset < 0) + { + builder->first_leaf_offset = offset; + } + builder->last_leaf_offset = offset; + + if (builder->num_level_entries >= builder->level_entries_capacity) + { + const uint32_t new_cap = builder->level_entries_capacity * 2; + btree_level_entry_t *new_entries = + realloc(builder->level_entries, new_cap * sizeof(btree_level_entry_t)); + if (!new_entries) return -1; + builder->level_entries = new_entries; + builder->level_entries_capacity = new_cap; + } + + btree_level_entry_t *entry = &builder->level_entries[builder->num_level_entries]; + entry->key = malloc(builder->current_leaf->first_key_size); + if (!entry->key) return -1; + memcpy(entry->key, builder->current_leaf->first_key, builder->current_leaf->first_key_size); + entry->key_size = builder->current_leaf->first_key_size; + entry->child_offset = offset; + builder->num_level_entries++; + + builder->prev_leaf_offset = offset; + builder->node_count++; + + btree_pending_leaf_free(builder->current_leaf); + builder->current_leaf = btree_pending_leaf_create(); + + return builder->current_leaf ? 
0 : -1; +} + +int btree_builder_add(btree_builder_t *builder, const uint8_t *key, const size_t key_size, + const uint8_t *value, const size_t value_size, const uint64_t vlog_offset, + const uint64_t seq, const int64_t ttl, const uint8_t entry_flags) +{ + if (!builder || !key || key_size == 0) return -1; + + uint8_t flags = entry_flags & (BTREE_ENTRY_FLAG_TOMBSTONE | BTREE_ENTRY_FLAG_SINGLE_DELETE); + if (ttl != 0) flags |= BTREE_ENTRY_FLAG_HAS_TTL; + if (vlog_offset > 0) flags |= BTREE_ENTRY_FLAG_VLOG_REF; + + /* we flush the full leaf before adding -- but never across a run of entries + * that share a key. a key's versions must all stay within one leaf so + * internal-node routing lands on the single leaf holding them and btree_get + * can resolve the whole run. */ + if (builder->current_leaf->current_size >= builder->config.target_node_size && + builder->current_leaf->num_entries >= BTREE_MIN_ENTRIES_PER_LEAF) + { + const btree_pending_leaf_t *cur = builder->current_leaf; + const int same_key_as_last = cur->last_key != NULL && cur->last_key_size == key_size && + memcmp(cur->last_key, key, key_size) == 0; + if (!same_key_as_last && btree_builder_flush_leaf(builder) != 0) + { + return -1; + } + } + + if (btree_pending_leaf_add(builder->current_leaf, key, key_size, value, value_size, vlog_offset, + seq, ttl, flags) != 0) + { + return -1; + } + + if (builder->min_key == NULL) + { + builder->min_key = malloc(key_size); + if (builder->min_key) + { + memcpy(builder->min_key, key, key_size); + builder->min_key_size = key_size; + } + } + + free(builder->max_key); + builder->max_key = malloc(key_size); + if (builder->max_key) + { + memcpy(builder->max_key, key, key_size); + builder->max_key_size = key_size; + } + + if (seq > builder->max_seq) + { + builder->max_seq = seq; + } + + builder->entry_count++; + return 0; +} + +/** + * btree_builder_build_internal_levels + * builds internal node levels from leaf level entries + * @param builder the builder instance + * @param 
root_offset output parameter for the root node offset + * @return 0 on success, -1 on failure + */ +static int btree_builder_build_internal_levels(btree_builder_t *builder, int64_t *root_offset) +{ + if (builder->num_level_entries == 0) + { + builder->height = 1; + *root_offset = -1; + return 0; + } + + if (builder->num_level_entries == 1) + { + builder->height = 1; /* a single leaf is the whole tree */ + *root_offset = builder->level_entries[0].child_offset; + return 0; + } + + btree_level_entry_t *current_level = builder->level_entries; + uint32_t current_count = builder->num_level_entries; + + /* each pass of the loop builds one internal level above the leaf level */ + uint32_t internal_levels = 0; + + while (current_count > 1) + { + const uint32_t next_capacity = (current_count / BTREE_DEFAULT_FANOUT) + 1; + btree_level_entry_t *next_level = calloc(next_capacity, sizeof(btree_level_entry_t)); + if (!next_level) return -1; + + uint32_t next_count = 0; + uint32_t i = 0; + + while (i < current_count) + { + uint32_t node_entries = BTREE_DEFAULT_FANOUT; + if (i + node_entries > current_count) + { + node_entries = current_count - i; + } + + uint8_t *serialized = NULL; + size_t serialized_size = 0; + + if (btree_internal_serialize(¤t_level[i], node_entries, &serialized, + &serialized_size) != 0) + { + for (uint32_t j = 0; j < next_count; j++) + { + free(next_level[j].key); + } + free(next_level); + return -1; + } + + /**** we compress if compression is enabled + *** format -- [original_size:4][prev_offset:8][next_offset:8][compressed_data] + ** internal nodes use prev_offset=-1 and next_offset=-1 (unused) for consistent + *format + */ + const uint8_t *final_data = serialized; + size_t final_size = serialized_size; + uint8_t *block_with_header = NULL; + + if (builder->config.compression_algo != TDB_COMPRESS_NONE) + { + size_t compressed_size; + uint8_t *compressed = + compress_data(serialized, serialized_size, &compressed_size, + 
(compression_algorithm)builder->config.compression_algo); + if (compressed) + { + /** we create block with header: + * [original_size:4][prev_offset:8][next_offset:8][compressed_data] */ + const size_t header_size = 4 + 8 + 8; + final_size = header_size + compressed_size; + block_with_header = malloc(final_size); + if (block_with_header) + { + encode_uint32_le_compat(block_with_header, (uint32_t)serialized_size); + int64_t unused_prev = -1; + int64_t unused_next = -1; + encode_int64_le_compat(block_with_header + 4, unused_prev); + encode_int64_le_compat(block_with_header + 12, unused_next); + memcpy(block_with_header + header_size, compressed, compressed_size); + final_data = block_with_header; + } + free(compressed); + } + } + + block_manager_block_t *block = block_manager_block_create(final_size, final_data); + free(serialized); + free(block_with_header); + + if (!block) + { + for (uint32_t j = 0; j < next_count; j++) + { + free(next_level[j].key); + } + free(next_level); + return -1; + } + + const int64_t offset = block_manager_block_write(builder->bm, block); + block_manager_block_free(block); + + if (offset < 0) + { + for (uint32_t j = 0; j < next_count; j++) + { + free(next_level[j].key); + } + free(next_level); + return -1; + } + + next_level[next_count].key = malloc(current_level[i].key_size); + if (next_level[next_count].key) + { + memcpy(next_level[next_count].key, current_level[i].key, current_level[i].key_size); + next_level[next_count].key_size = current_level[i].key_size; + } + next_level[next_count].child_offset = offset; + next_count++; + + builder->node_count++; + i += node_entries; + } + + if (current_level != builder->level_entries) + { + for (uint32_t j = 0; j < current_count; j++) + { + free(current_level[j].key); + } + free(current_level); + } + + current_level = next_level; + current_count = next_count; + internal_levels++; + } + + builder->height = 1 + internal_levels; + *root_offset = current_level[0].child_offset; + + if (current_level != 
builder->level_entries) + { + for (uint32_t j = 0; j < current_count; j++) + { + free(current_level[j].key); + } + free(current_level); + } + + return 0; +} + +/** + * btree_builder_backpatch_leaf_links + * patches next_offset in each leaf to point to the next leaf + * this enables O(1) forward iteration through leaves + * + * block format -- [size(4)][checksum(4)][data][size(4)][magic(4)] + * leaf data format -- [type:1][num_entries:varint][prev_offset:8][next_offset:8]... + * + * @param builder the builder instance + * @return 0 on success, -1 on failure + */ +static int btree_builder_backpatch_leaf_links(btree_builder_t *builder) +{ + if (!builder || builder->num_leaf_offsets == 0) return 0; + + /* block header -- [size(4)][checksum(4)] = 8 bytes before data */ + const size_t block_header_size = BLOCK_MANAGER_BLOCK_HEADER_SIZE; + + /* we backpatch all leaves in place (theyre uncompressed at this point) + * only needed if there are 2+ leaves */ + for (uint32_t i = 0; i + 1 < builder->num_leaf_offsets; i++) + { + const int64_t leaf_offset = builder->leaf_offsets[i]; + int64_t next_leaf_offset = builder->leaf_offsets[i + 1]; + + block_manager_cursor_t cursor; + cursor.bm = builder->leaf_bm; + cursor.current_pos = leaf_offset; + cursor.block_size_valid = 0; + + block_manager_block_t *block = block_manager_cursor_read(&cursor); + if (!block) return -1; + + /* we calculate next_offset position type(1) + num_entries(varint) + prev_offset(8) */ + uint8_t *block_data = (uint8_t *)block->data; + size_t off = 1; /* skip type byte */ + uint64_t num_entries; + off += btree_varint_decode(block_data + off, &num_entries); + off += 8; /* skip prev_offset, now at next_offset position */ + + memcpy(block_data + off, &next_leaf_offset, sizeof(int64_t)); + + const uint32_t new_checksum = XXH32(block->data, block->size, 0); + + uint8_t checksum_bytes[4]; + encode_uint32_le_compat(checksum_bytes, new_checksum); + if (block_manager_write_at(builder->leaf_bm, leaf_offset + 
BLOCK_MANAGER_SIZE_FIELD_SIZE, + checksum_bytes, 4) != 0) + { + block_manager_block_free(block); + return -1; + } + + if (block_manager_write_at(builder->leaf_bm, leaf_offset + block_header_size + off, + (uint8_t *)&next_leaf_offset, sizeof(int64_t)) != 0) + { + block_manager_block_free(block); + return -1; + } + + block_manager_block_free(block); + } + + /* if compression enabled, compress all leaves and write to new locations + * format is [original_size:4][next_offset:8][compressed_data] stored in block + * next_offset is stored in header so it can be patched without decompression */ + if (builder->config.compression_algo != TDB_COMPRESS_NONE) + { + int64_t *new_offsets = malloc(builder->num_leaf_offsets * sizeof(int64_t)); + if (!new_offsets) return -1; + + /* we compress and write all leaves with placeholder next_offset=-1 */ + for (uint32_t i = 0; i < builder->num_leaf_offsets; i++) + { + block_manager_cursor_t cursor; + cursor.bm = builder->leaf_bm; + cursor.current_pos = builder->leaf_offsets[i]; + cursor.block_size_valid = 0; + + block_manager_block_t *block = block_manager_cursor_read(&cursor); + if (!block) + { + free(new_offsets); + return -1; + } + + /* we compress data (includes next_offset in the serialized leaf data) */ + size_t compressed_size; + uint8_t *compressed = + compress_data(block->data, block->size, &compressed_size, + (compression_algorithm)builder->config.compression_algo); + const uint32_t original_size = (uint32_t)block->size; + block_manager_block_free(block); + + if (!compressed) + { + free(new_offsets); + return -1; + } + + /* we create block with header: + * [original_size:4][prev_offset:8][next_offset:8][compressed_data] */ + const size_t header_size = 4 + 8 + 8; /* original_size + prev_offset + next_offset */ + const size_t total_size = header_size + compressed_size; + uint8_t *block_data = malloc(total_size); + if (!block_data) + { + free(compressed); + free(new_offsets); + return -1; + } + encode_uint32_le_compat(block_data, 
original_size); + int64_t placeholder_prev = -1; + int64_t placeholder_next = -1; + encode_int64_le_compat(block_data + 4, placeholder_prev); + encode_int64_le_compat(block_data + 12, placeholder_next); + memcpy(block_data + header_size, compressed, compressed_size); + free(compressed); + + block_manager_block_t *new_block = block_manager_block_create(total_size, block_data); + free(block_data); + + if (!new_block) + { + free(new_offsets); + return -1; + } + + const int64_t new_offset = block_manager_block_write(builder->bm, new_block); + block_manager_block_free(new_block); + + if (new_offset < 0) + { + free(new_offsets); + return -1; + } + + new_offsets[i] = new_offset; + } + + /* we patch prev_offset and next_offset in header and update checksum */ + for (uint32_t i = 0; i < builder->num_leaf_offsets; i++) + { + /* header format -- [original_size:4][prev_offset:8][next_offset:8][compressed_data] */ + /* block format -- [block_size:4][checksum:4][data...] where data starts with our + * header + */ + const int64_t prev_patch_offset = new_offsets[i] + BLOCK_MANAGER_BLOCK_HEADER_SIZE + 4; + const int64_t next_patch_offset = new_offsets[i] + BLOCK_MANAGER_BLOCK_HEADER_SIZE + 12; + + /* we patch prev_offset (first leaf has prev=-1, others point to previous new offset) */ + int64_t prev_leaf_offset = (i == 0) ? -1 : new_offsets[i - 1]; + if (block_manager_write_at(builder->bm, prev_patch_offset, (uint8_t *)&prev_leaf_offset, + 8) != 0) + { + free(new_offsets); + return -1; + } + + /* we patch next_offset (last leaf has next=-1, others point to next new offset) */ + int64_t next_leaf_offset = + (i + 1 < builder->num_leaf_offsets) ? 
new_offsets[i + 1] : -1; + if (block_manager_write_at(builder->bm, next_patch_offset, (uint8_t *)&next_leaf_offset, + 8) != 0) + { + free(new_offsets); + return -1; + } + + /* we update checksum after patching the block data */ + if (block_manager_update_checksum(builder->bm, new_offsets[i]) != 0) + { + free(new_offsets); + return -1; + } + } + + /* we must update leaf_offsets and level_entries with new locations */ + for (uint32_t i = 0; i < builder->num_leaf_offsets; i++) + { + builder->leaf_offsets[i] = new_offsets[i]; + } + for (uint32_t i = 0; i < builder->num_level_entries && i < builder->num_leaf_offsets; i++) + { + builder->level_entries[i].child_offset = new_offsets[i]; + } + + builder->first_leaf_offset = new_offsets[0]; + builder->last_leaf_offset = new_offsets[builder->num_leaf_offsets - 1]; + + free(new_offsets); + } + + return 0; +} + +int btree_builder_finish(btree_builder_t *builder, btree_t **tree) +{ + if (!builder || !tree) return -1; + + if (builder->current_leaf && builder->current_leaf->num_entries > 0) + { + if (btree_builder_flush_leaf(builder) != 0) + { + return -1; + } + } + + if (btree_builder_backpatch_leaf_links(builder) != 0) + { + return -1; + } + + int64_t root_offset = -1; + if (btree_builder_build_internal_levels(builder, &root_offset) != 0) + { + return -1; + } + + btree_t *t = calloc(1, sizeof(btree_t)); + if (!t) return -1; + + t->bm = builder->bm; + t->config = builder->config; + t->root_offset = root_offset; + t->first_leaf_offset = builder->first_leaf_offset; + t->last_leaf_offset = builder->last_leaf_offset; + t->entry_count = builder->entry_count; + t->node_count = builder->node_count; + t->max_seq = builder->max_seq; + t->height = builder->height ? 
builder->height : 1; + + if (builder->min_key) + { + t->min_key = builder->min_key; + t->min_key_size = builder->min_key_size; + builder->min_key = NULL; + } + + if (builder->max_key) + { + t->max_key = builder->max_key; + t->max_key_size = builder->max_key_size; + builder->max_key = NULL; + } + + *tree = t; + return 0; +} + +void btree_builder_free(btree_builder_t *builder) +{ + if (!builder) return; + + /* drop the temp leaf-staging file (only created when compression is on) */ + if (builder->leaf_bm && builder->leaf_bm != builder->bm) + { + char tmp_path[MAX_FILE_PATH_LENGTH]; + snprintf(tmp_path, sizeof(tmp_path), "%s", builder->leaf_bm->file_path); + block_manager_close(builder->leaf_bm); + remove(tmp_path); + } + + btree_pending_leaf_free(builder->current_leaf); + + free(builder->leaf_offsets); + + if (builder->level_entries) + { + for (uint32_t i = 0; i < builder->num_level_entries; i++) + { + free(builder->level_entries[i].key); + } + free(builder->level_entries); + } + + free(builder->min_key); + free(builder->max_key); + free(builder); +} + +int btree_open(btree_t **tree, block_manager_t *bm, const btree_config_t *config, + const int64_t root_offset, const int64_t first_leaf_offset, + const int64_t last_leaf_offset) +{ + if (!tree || !bm || !config) return -1; + + btree_t *t = calloc(1, sizeof(btree_t)); + if (!t) return -1; + + t->bm = bm; + t->config = *config; + t->root_offset = root_offset; + t->first_leaf_offset = first_leaf_offset; + t->last_leaf_offset = last_leaf_offset; + + if (!t->config.comparator) + { + t->config.comparator = btree_comparator_memcmp; + t->config.cmp_type = BTREE_CMP_MEMCMP; + } + + *tree = t; + return 0; +} + +int btree_get_at_seq(btree_t *tree, const uint8_t *key, const size_t key_size, + const uint64_t seq_ceiling, uint8_t **value, size_t *value_size, + uint64_t *vlog_offset, uint64_t *seq, int64_t *ttl, uint8_t *deleted) +{ + if (!tree || !key || key_size == 0) return -1; + + if (tree->root_offset < 0) return -1; + + const 
int using_cache = (tree->node_cache != NULL); + + btree_node_t *node = NULL; + if (btree_node_read_cached(tree, tree->root_offset, &node) != 0) + { + return -1; + } + + while (node->type == BTREE_NODE_INTERNAL) + { + /* we utilize binary search for child index in internal node + * find the largest i where key >= keys[i], then child_idx = i + 1 + * if key < keys[0], child_idx = 0. separator keys are strictly + * increasing -- the builder never splits a key's run across leaves -- + * so a key's whole run lives in the one child this routes to. */ + uint32_t child_idx = 0; + if (node->num_entries > 0) + { + int32_t lo = 0; + int32_t hi = (int32_t)node->num_entries - 1; + while (lo <= hi) + { + const int32_t mid = lo + (hi - lo) / 2; + const int cmp = btree_compare_keys_inline(&tree->config, key, key_size, + node->keys[mid], node->key_sizes[mid]); + if (cmp < 0) + { + hi = mid - 1; + } + else + { + lo = mid + 1; + } + } + child_idx = (uint32_t)lo; + } + + const int64_t child_offset = node->child_offsets[child_idx]; + + btree_node_done(node, using_cache); + + if (btree_node_read_cached(tree, child_offset, &node) != 0) + { + return -1; + } + } + + /* lower_bound -- leftmost index whose key is >= the search key */ + int32_t lo = 0; + int32_t hi = (int32_t)node->num_entries; + while (lo < hi) + { + const int32_t mid = lo + (hi - lo) / 2; + const int cmp = btree_compare_keys_inline(&tree->config, key, key_size, node->keys[mid], + node->key_sizes[mid]); + if (cmp <= 0) + { + hi = mid; + } + else + { + lo = mid + 1; + } + } + + /* scan the run of entries that share the search key, keeping the highest + * seq that does not exceed seq_ceiling. a key may have several versions -- + * a flush or compaction retains a version chain -- and they all live in + * this one leaf, so the resolved version is the one visible at the + * caller's snapshot. 
*/ + int32_t found_idx = -1; + for (int32_t i = lo; i < (int32_t)node->num_entries; i++) + { + if (btree_compare_keys_inline(&tree->config, key, key_size, node->keys[i], + node->key_sizes[i]) != 0) + { + break; + } + const uint64_t entry_seq = node->entries[i].seq; + if (entry_seq > seq_ceiling) continue; + if (found_idx < 0 || entry_seq > node->entries[found_idx].seq) + { + found_idx = i; + } + } + + if (found_idx < 0) + { + btree_node_done(node, using_cache); + return -1; + } + + const btree_entry_t *entry = &node->entries[found_idx]; + + if (value && value_size) + { + if (entry->vlog_offset == 0 && node->values[found_idx]) + { + *value = malloc(entry->value_size); + if (*value) + { + memcpy(*value, node->values[found_idx], entry->value_size); + } + *value_size = entry->value_size; + } + else + { + *value = NULL; + *value_size = entry->value_size; + } + } + + if (vlog_offset) *vlog_offset = entry->vlog_offset; + if (seq) *seq = entry->seq; + if (ttl) *ttl = entry->ttl; + /* deleted returns the persisted tombstone/single-delete bits so compaction + * can distinguish single-delete from regular delete. the low bit still + * equals BTREE_ENTRY_FLAG_TOMBSTONE, so callers that treat *deleted as a + * bool keep working unchanged. */ + if (deleted) + *deleted = entry->flags & (BTREE_ENTRY_FLAG_TOMBSTONE | BTREE_ENTRY_FLAG_SINGLE_DELETE); + + btree_node_done(node, using_cache); + return 0; +} + +int btree_get(btree_t *tree, const uint8_t *key, const size_t key_size, uint8_t **value, + size_t *value_size, uint64_t *vlog_offset, uint64_t *seq, int64_t *ttl, + uint8_t *deleted) +{ + return btree_get_at_seq(tree, key, key_size, UINT64_MAX, value, value_size, vlog_offset, seq, + ttl, deleted); +} + +uint64_t btree_get_entry_count(const btree_t *tree) +{ + return tree ? 
tree->entry_count : 0; +} + +int btree_get_min_key(btree_t *tree, uint8_t **key, size_t *key_size) +{ + if (!tree || !key || !key_size) return -1; + if (!tree->min_key) return -1; + + *key = malloc(tree->min_key_size); + if (!*key) return -1; + memcpy(*key, tree->min_key, tree->min_key_size); + *key_size = tree->min_key_size; + return 0; +} + +int btree_get_max_key(btree_t *tree, uint8_t **key, size_t *key_size) +{ + if (!tree || !key || !key_size) return -1; + if (!tree->max_key) return -1; + + *key = malloc(tree->max_key_size); + if (!*key) return -1; + memcpy(*key, tree->max_key, tree->max_key_size); + *key_size = tree->max_key_size; + return 0; +} + +uint64_t btree_get_max_seq(const btree_t *tree) +{ + return tree ? tree->max_seq : 0; +} + +int btree_get_stats(const btree_t *tree, btree_stats_t *stats) +{ + if (!tree || !stats) return -1; + + stats->entry_count = tree->entry_count; + stats->node_count = tree->node_count; + stats->height = tree->height; + + /* we get serialized size from block manager if available */ + stats->serialized_size = 0; + if (tree->bm) + { + uint64_t size; + if (block_manager_get_size(tree->bm, &size) == 0) + { + stats->serialized_size = size; + } + } + + return 0; +} + +void btree_free(btree_t *tree) +{ + if (!tree) return; + free(tree->min_key); + free(tree->max_key); + if (tree->node_arena) + { + btree_arena_destroy(tree->node_arena); + } + free(tree); +} + +void btree_set_node_cache(btree_t *tree, clock_cache_t *cache) +{ + if (tree) + { + tree->node_cache = cache; + } +} + +/** + * btree_create_node_cache + * creates a node cache with the proper eviction callback + * @param max_bytes maximum cache size in bytes + * @return new cache or NULL on failure + */ +clock_cache_t *btree_create_node_cache(const size_t max_bytes) +{ + cache_config_t config = {0}; + config.avg_entry_size = BTREE_DEFAULT_NODE_SIZE; + clock_cache_compute_config(max_bytes, &config); + config.evict_callback = btree_node_cache_evict_callback; + return 
clock_cache_create(&config); +} + +/** + * btree_print_node + * recursively prints a node and its children for debugging + * @param tree the btree instance + * @param offset node offset in storage + * @param depth current depth for indentation + */ +static void btree_print_node(btree_t *tree, const int64_t offset, const int depth) +{ + if (offset < 0) return; + + btree_node_t *node = NULL; + if (btree_node_read_with_compression(tree->bm, offset, &node, tree->config.compression_algo) != + 0) + { + printf("%*s[ERROR reading node at offset %" PRId64 "]\n", depth * 2, "", offset); + return; + } + + if (node->type == BTREE_NODE_INTERNAL) + { + printf("%*sINTERNAL (offset=%" PRId64 ", keys=%u, children=%u)\n", depth * 2, "", offset, + node->num_entries, node->num_entries + 1); + + for (uint32_t i = 0; i < node->num_entries; i++) + { + printf("%*s key[%u]: \"%.20s%s\" (size=%zu)\n", depth * 2, "", i, + (char *)node->keys[i], node->key_sizes[i] > 20 ? "..." : "", node->key_sizes[i]); + } + + for (uint32_t i = 0; i <= node->num_entries; i++) + { + printf("%*s child[%u] -> offset %" PRId64 "\n", depth * 2, "", i, + node->child_offsets[i]); + btree_print_node(tree, node->child_offsets[i], depth + 1); + } + } + else + { + printf("%*sLEAF (offset=%" PRId64 ", entries=%u, prev=%" PRId64 ", next=%" PRId64 ")\n", + depth * 2, "", offset, node->num_entries, node->prev_offset, node->next_offset); + + for (uint32_t i = 0; i < node->num_entries && i < 5; i++) + { + printf("%*s [%u] key=\"%.20s%s\" seq=%" PRIu64 "\n", depth * 2, "", i, + (char *)node->keys[i], node->key_sizes[i] > 20 ? "..." : "", + node->entries[i].seq); + } + if (node->num_entries > 5) + { + printf("%*s ... 
(%u more entries)\n", depth * 2, "", node->num_entries - 5); + } + } + + btree_node_free(node); +} + +void btree_print_tree(btree_t *tree) +{ + if (!tree) + { + printf("btree_print_tree: NULL tree\n"); + return; + } + + printf("--- B+Tree Structure ---\n"); + printf("entry_count: %" PRIu64 "\n", tree->entry_count); + printf("node_count: %" PRIu64 "\n", tree->node_count); + printf("height: %u\n", tree->height); + printf("root_offset: %" PRId64 "\n", tree->root_offset); + printf("first_leaf_offset: %" PRId64 "\n", tree->first_leaf_offset); + printf("last_leaf_offset: %" PRId64 "\n", tree->last_leaf_offset); + + if (tree->min_key) + { + printf("min_key: \"%.30s%s\"\n", (char *)tree->min_key, + tree->min_key_size > 30 ? "..." : ""); + } + if (tree->max_key) + { + printf("max_key: \"%.30s%s\"\n", (char *)tree->max_key, + tree->max_key_size > 30 ? "..." : ""); + } + + printf("\nTree structure:\n"); + btree_print_node(tree, tree->root_offset, 0); + printf("-----------------------\n"); +} + +int btree_cursor_init(btree_cursor_t **cursor, btree_t *tree) +{ + if (!cursor || !tree) return -1; + + btree_cursor_t *c = calloc(1, sizeof(btree_cursor_t)); + if (!c) return -1; + + c->tree = tree; + c->current_node = NULL; + c->current_index = -1; + c->current_leaf_offset = -1; + c->at_end = 0; + c->at_begin = 0; + c->using_cache = (tree->node_cache != NULL); + + *cursor = c; + + return btree_cursor_goto_first(c); +} + +int btree_cursor_goto_first(btree_cursor_t *cursor) +{ + if (!cursor || !cursor->tree) return -1; + + if (cursor->current_node) + { + btree_node_done(cursor->current_node, cursor->using_cache); + cursor->current_node = NULL; + } + + if (cursor->tree->first_leaf_offset < 0) + { + cursor->at_end = 1; + return -1; + } + + cursor->current_leaf_offset = cursor->tree->first_leaf_offset; + if (btree_node_read_cached(cursor->tree, cursor->current_leaf_offset, &cursor->current_node) != + 0) + { + return -1; + } + + cursor->current_index = 0; + cursor->at_end = 
(cursor->current_node->num_entries == 0); + cursor->at_begin = 0; + return cursor->at_end ? -1 : 0; +} + +int btree_cursor_goto_last(btree_cursor_t *cursor) +{ + if (!cursor || !cursor->tree) return -1; + + if (cursor->current_node) + { + btree_node_done(cursor->current_node, cursor->using_cache); + cursor->current_node = NULL; + } + + if (cursor->tree->last_leaf_offset < 0) + { + cursor->at_end = 1; + return -1; + } + + cursor->current_leaf_offset = cursor->tree->last_leaf_offset; + if (btree_node_read_cached(cursor->tree, cursor->current_leaf_offset, &cursor->current_node) != + 0) + { + return -1; + } + + cursor->current_index = (int32_t)cursor->current_node->num_entries - 1; + cursor->at_end = (cursor->current_index < 0); + cursor->at_begin = 0; + return cursor->at_end ? -1 : 0; +} + +int btree_cursor_next(btree_cursor_t *cursor) +{ + if (!cursor || cursor->at_end) return -1; + + if (!cursor->current_node) + { + return btree_cursor_goto_first(cursor); + } + + cursor->current_index++; + + if ((uint32_t)cursor->current_index >= cursor->current_node->num_entries) + { + const int64_t next_leaf_offset = cursor->current_node->next_offset; + + if (next_leaf_offset < 0) + { + cursor->at_end = 1; + return -1; + } + + btree_node_done(cursor->current_node, cursor->using_cache); + cursor->current_node = NULL; + + cursor->current_leaf_offset = next_leaf_offset; + if (btree_node_read_cached(cursor->tree, cursor->current_leaf_offset, + &cursor->current_node) != 0) + { + cursor->at_end = 1; + return -1; + } + + cursor->current_index = 0; + + if (cursor->current_node->num_entries == 0) + { + cursor->at_end = 1; + return -1; + } + } + + return 0; +} + +int btree_cursor_prev(btree_cursor_t *cursor) +{ + if (!cursor) return -1; + + if (!cursor->current_node) + { + return btree_cursor_goto_last(cursor); + } + + cursor->current_index--; + + if (cursor->current_index < 0) + { + const int64_t prev_leaf_offset = cursor->current_node->prev_offset; + + if (prev_leaf_offset < 0) + { + /* 
we reached beginning */ + cursor->current_index = -1; + cursor->at_begin = 1; + return -1; + } + + btree_node_done(cursor->current_node, cursor->using_cache); + cursor->current_node = NULL; + + cursor->current_leaf_offset = prev_leaf_offset; + if (btree_node_read_cached(cursor->tree, cursor->current_leaf_offset, + &cursor->current_node) != 0) + { + cursor->at_begin = 1; + return -1; + } + + cursor->current_index = (int32_t)cursor->current_node->num_entries - 1; + + if (cursor->current_index < 0) + { + cursor->at_begin = 1; + return -1; + } + } + + return 0; +} + +int btree_cursor_seek(btree_cursor_t *cursor, const uint8_t *key, const size_t key_size) +{ + if (!cursor || !cursor->tree || !key || key_size == 0) return -1; + + if (cursor->current_node) + { + btree_node_done(cursor->current_node, cursor->using_cache); + } + cursor->current_node = NULL; + + if (cursor->tree->root_offset < 0) + { + cursor->at_end = 1; + return -1; + } + + btree_node_t *node = NULL; + if (btree_node_read_cached(cursor->tree, cursor->tree->root_offset, &node) != 0) + { + return -1; + } + + while (node->type == BTREE_NODE_INTERNAL) + { + /* we utilize binary search for child index in internal node */ + uint32_t child_idx = 0; + if (node->num_entries > 0) + { + int32_t lo = 0; + int32_t hi = (int32_t)node->num_entries - 1; + while (lo <= hi) + { + const int32_t mid = lo + (hi - lo) / 2; + const int cmp = btree_compare_keys_inline(&cursor->tree->config, key, key_size, + node->keys[mid], node->key_sizes[mid]); + if (cmp < 0) + { + hi = mid - 1; + } + else + { + lo = mid + 1; + } + } + child_idx = (uint32_t)lo; + } + + const int64_t child_offset = node->child_offsets[child_idx]; + btree_node_done(node, cursor->using_cache); + + if (btree_node_read_cached(cursor->tree, child_offset, &node) != 0) + { + return -1; + } + } + + int32_t left = 0; + int32_t right = (int32_t)node->num_entries - 1; + int32_t found_idx = -1; + + while (left <= right) + { + const int32_t mid = left + (right - left) / 2; + 
const int cmp = btree_compare_keys_inline(&cursor->tree->config, key, key_size, + node->keys[mid], node->key_sizes[mid]); + if (cmp == 0) + { + found_idx = mid; + break; + } + if (cmp < 0) + { + right = mid - 1; + } + else + { + left = mid + 1; + } + } + + if (found_idx < 0) + { + found_idx = left; + } + + if ((uint32_t)found_idx >= node->num_entries) + { + if (node->next_offset >= 0) + { + const int64_t next_off = node->next_offset; + btree_node_done(node, cursor->using_cache); + if (btree_node_read_cached(cursor->tree, next_off, &node) != 0) + { + cursor->at_end = 1; + return -1; + } + found_idx = 0; + } + else + { + btree_node_done(node, cursor->using_cache); + cursor->at_end = 1; + return -1; + } + } + + cursor->current_node = node; + cursor->current_index = found_idx; + cursor->current_leaf_offset = node->block_offset; + cursor->at_end = 0; + cursor->at_begin = 0; + return 0; +} + +int btree_cursor_seek_for_prev(btree_cursor_t *cursor, const uint8_t *key, const size_t key_size) +{ + if (!cursor || !cursor->tree || !key || key_size == 0) return -1; + + if (btree_cursor_seek(cursor, key, key_size) != 0) + { + return btree_cursor_goto_last(cursor); + } + + const int cmp = btree_compare_keys_inline( + &cursor->tree->config, key, key_size, cursor->current_node->keys[cursor->current_index], + cursor->current_node->key_sizes[cursor->current_index]); + + if (cmp < 0) + { + return btree_cursor_prev(cursor); + } + + return 0; +} + +int btree_cursor_valid(btree_cursor_t *cursor) +{ + if (!cursor) return -1; + if (cursor->at_end) return 0; + if (!cursor->current_node) return 0; + if (cursor->current_index < 0) return 0; + if ((uint32_t)cursor->current_index >= cursor->current_node->num_entries) return 0; + return 1; +} + +int btree_cursor_get(btree_cursor_t *cursor, uint8_t **key, size_t *key_size, uint8_t **value, + size_t *value_size, uint64_t *vlog_offset, uint64_t *seq, int64_t *ttl, + uint8_t *deleted) +{ + if (!cursor || !cursor->current_node) return -1; + if 
(cursor->current_index < 0 || + (uint32_t)cursor->current_index >= cursor->current_node->num_entries) + { + return -1; + } + + const uint32_t idx = (uint32_t)cursor->current_index; + const btree_entry_t *entry = &cursor->current_node->entries[idx]; + + if (key) *key = cursor->current_node->keys[idx]; + if (key_size) *key_size = cursor->current_node->key_sizes[idx]; + if (value) *value = cursor->current_node->values[idx]; + if (value_size) *value_size = entry->value_size; + if (vlog_offset) *vlog_offset = entry->vlog_offset; + if (seq) *seq = entry->seq; + if (ttl) *ttl = entry->ttl; + /* deleted returns the persisted tombstone/single-delete bits so compaction + * can distinguish single-delete from regular delete. the low bit still + * equals BTREE_ENTRY_FLAG_TOMBSTONE, so callers that treat *deleted as a + * bool keep working unchanged. */ + if (deleted) + *deleted = entry->flags & (BTREE_ENTRY_FLAG_TOMBSTONE | BTREE_ENTRY_FLAG_SINGLE_DELETE); + + return 0; +} + +int btree_cursor_has_next(btree_cursor_t *cursor) +{ + if (!cursor) return -1; + if (cursor->at_end) return 0; + if (!cursor->current_node) return 1; + + if ((uint32_t)(cursor->current_index + 1) < cursor->current_node->num_entries) + { + return 1; + } + + return (cursor->current_node->next_offset >= 0) ? 1 : 0; +} + +int btree_cursor_has_prev(btree_cursor_t *cursor) +{ + if (!cursor) return -1; + if (!cursor->current_node) return 0; + + if (cursor->current_index > 0) + { + return 1; + } + + return (cursor->current_node->prev_offset >= 0) ? 
1 : 0; +} + +void btree_cursor_free(btree_cursor_t *cursor) +{ + if (!cursor) return; + btree_node_done(cursor->current_node, cursor->using_cache); + free(cursor); +} diff --git a/storage/tidesdb/libtidesdb/src/btree.h b/storage/tidesdb/libtidesdb/src/btree.h new file mode 100644 index 0000000000000..ae76fb47e497b --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/btree.h @@ -0,0 +1,689 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __BTREE_H__ +#define __BTREE_H__ + +#include "block_manager.h" +#include "clock_cache.h" +#include "compat.h" + +/* branch prediction hints */ +#if defined(__GNUC__) || defined(__clang__) +#define BTREE_LIKELY(x) __builtin_expect(!!(x), 1) +#define BTREE_UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +#define BTREE_LIKELY(x) (x) +#define BTREE_UNLIKELY(x) (x) +#endif + +/* magic number "BTR+" in hex */ +#define BTREE_MAGIC 0x4254522B +#define BTREE_VERSION 1 + +/* btree clock-cache key layout. each key is "" where + * cache_key_prefix is set by tidesdb_sstable_create. exposed in the header so cache + * invalidation paths in tidesdb.c can build matching prefixes. 
*/ +#define BTREE_CACHE_KEY_SIZE 32 +#define BTREE_CACHE_KEY_SEPARATOR ':' + +/* node type flags */ +#define BTREE_NODE_LEAF 0x01 +#define BTREE_NODE_INTERNAL 0x02 + +/* entry flags (matching TidesDB kv flags) */ +#define BTREE_ENTRY_FLAG_TOMBSTONE 0x01 +#define BTREE_ENTRY_FLAG_HAS_TTL 0x02 +#define BTREE_ENTRY_FLAG_VLOG_REF 0x04 /* value is in vlog, not inline */ +#define BTREE_ENTRY_FLAG_SINGLE_DELETE \ + 0x10 /* single-delete tombstone subtype, \ + * always set alongside \ + * BTREE_ENTRY_FLAG_TOMBSTONE */ + +/* default configuration */ +#define BTREE_DEFAULT_NODE_SIZE (64 * 1024) /* 64KB target node size */ +#define BTREE_DEFAULT_FANOUT 256 /* target keys per internal node */ +#define BTREE_MIN_ENTRIES_PER_LEAF 2 + +/* block types for metadata */ +#define BTREE_BLOCK_TYPE_META 0x00 +#define BTREE_BLOCK_TYPE_LEAF 0x01 +#define BTREE_BLOCK_TYPE_INTERNAL 0x02 + +/* forward declarations */ +typedef struct btree_t btree_t; +typedef struct btree_builder_t btree_builder_t; +typedef struct btree_cursor_t btree_cursor_t; +typedef struct btree_node_t btree_node_t; +typedef struct btree_entry_t btree_entry_t; +typedef struct btree_arena_t btree_arena_t; + +/** + * btree_arena_t + * simple arena allocator for btree nodes to reduce malloc/free overhead + * allocations are bump-pointer style, freed all at once when arena is destroyed + */ +#define BTREE_ARENA_BLOCK_SIZE (64 * 1024) /* 64KB blocks */ +#define BTREE_ARENA_MIN_BLOCK_SIZE 256 /* minimum arena block size */ + +typedef struct btree_arena_block_t +{ + uint8_t *data; + size_t size; + size_t used; + struct btree_arena_block_t *next; +} btree_arena_block_t; + +/* + * btree_arena_t + * simple arena allocator for btree nodes to reduce malloc/free overhead + * allocations are bump-pointer style, freed all at once when arena is destroyed + * @param current current block + * @param blocks linked list of blocks + * @param total_allocated total bytes allocated + */ +struct btree_arena_t +{ + btree_arena_block_t *current; + 
/**
 * btree_cmp_type_t
 * comparator type enum (mirrors skip_list)
 * BTREE_CMP_MEMCMP is pinned to 0, so a zero-initialized cmp_type selects the
 * memcmp-based comparison
 */
typedef enum
{
    BTREE_CMP_MEMCMP = 0, /* default memcmp-based comparison */
    BTREE_CMP_STRING,     /* string-based comparison */
    BTREE_CMP_NUMERIC,    /* numeric comparison (8-byte keys) */
    BTREE_CMP_CUSTOM      /* custom comparator function */
} btree_cmp_type_t;
/**
 * btree_entry_t
 * a single key-value entry in a leaf node
 * @param key_size size of key
 * @param value_size size of value (inline or in vlog)
 * @param vlog_offset offset in vlog if value is external (0 = inline)
 * @param seq sequence number
 * @param ttl time-to-live (0 = no expiry)
 * @param flags entry flags, a bitmask of BTREE_ENTRY_FLAG_* (tombstone, has_ttl, vlog_ref,
 *              single_delete)
 */
struct btree_entry_t
{
    uint32_t key_size;
    uint32_t value_size;
    uint64_t vlog_offset;
    uint64_t seq;
    int64_t ttl;
    uint8_t flags;
};
/**
 * btree_t
 * immutable B+tree structure (read-only after construction)
 * @param bm block manager for storage
 * @param root_offset offset of root node
 * @param first_leaf_offset offset of first leaf (for forward iteration)
 * @param last_leaf_offset offset of last leaf (for backward iteration)
 * @param entry_count total number of entries
 * @param node_count total number of nodes
 * @param height tree height
 * @param config configuration
 * @param min_key minimum key in tree
 * @param min_key_size size of minimum key
 * @param max_key maximum key in tree
 * @param max_key_size size of maximum key
 * @param max_seq maximum sequence number
 * @param node_cache node cache for fast lookups (optional, can be NULL). per the
 *                   btree_set_node_cache contract the cache is not owned by the btree;
 *                   the caller manages its lifetime
 * @param node_arena arena for cached node allocations (owned by btree, created with cache)
 * @param cache_key_prefix precomputed cache key prefix for this btree's node cache entries
 */
struct btree_t
{
    block_manager_t *bm;
    int64_t root_offset;
    int64_t first_leaf_offset;
    int64_t last_leaf_offset;
    uint64_t entry_count;
    uint64_t node_count;
    uint32_t height;
    btree_config_t config;
    uint8_t *min_key;
    size_t min_key_size;
    uint8_t *max_key;
    size_t max_key_size;
    uint64_t max_seq;
    clock_cache_t *node_cache;
    btree_arena_t *node_arena;
    uint64_t cache_key_prefix;
};
serialized_size; +} btree_stats_t; + +/** + * btree_cursor_t + * cursor for iterating through the B+tree + * uses tree traversal for leaf-to-leaf navigation (memory efficient) + * @param tree pointer to the B+tree + * @param current_node current leaf node + * @param current_index index within current node + * @param current_leaf_offset offset of current leaf node + * @param at_end flag indicating cursor is past end + * @param at_begin flag indicating cursor is before begin + * @param using_cache flag indicating current node was loaded from cache + */ +struct btree_cursor_t +{ + btree_t *tree; + btree_node_t *current_node; + int32_t current_index; + int64_t current_leaf_offset; + int at_end; + int at_begin; + int using_cache; +}; + +/** + * btree_comparator_memcmp + * default memcmp-based comparator + * @param key1 first key + * @param key1_size size of first key + * @param key2 second key + * @param key2_size size of second key + * @param ctx context pointer (unused) + * @return negative if key1 < key2, 0 if equal, positive if key1 > key2 + */ +int btree_comparator_memcmp(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/** + * btree_comparator_string + * string-based comparator + * @param key1 first key + * @param key1_size size of first key + * @param key2 second key + * @param key2_size size of second key + * @param ctx context pointer (unused) + * @return negative if key1 < key2, 0 if equal, positive if key1 > key2 + */ +int btree_comparator_string(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/** + * btree_comparator_numeric + * numeric comparator for 8-byte keys + * @param key1 first key + * @param key1_size size of first key + * @param key2 second key + * @param key2_size size of second key + * @param ctx context pointer (unused) + * @return negative if key1 < key2, 0 if equal, positive if key1 > key2 + */ +int btree_comparator_numeric(const uint8_t *key1, size_t 
key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/** + * btree_builder_new + * creates a new B+tree builder for sorted data insertion + * @param builder output pointer to builder + * @param bm block manager for storage + * @param config configuration (comparator, node size, value threshold) + * @return 0 on success, -1 on failure + */ +int btree_builder_new(btree_builder_t **builder, block_manager_t *bm, const btree_config_t *config); + +/** + * btree_builder_add + * adds an entry to the B+tree (must be called in sorted key order) + * @param builder the builder + * @param key key data + * @param key_size size of key + * @param value value data (NULL for tombstones) + * @param value_size size of value + * @param vlog_offset vlog offset if value is external (0 = inline) + * @param seq sequence number + * @param ttl time-to-live (0 = no expiry) + * @param entry_flags bitmask of BTREE_ENTRY_FLAG_* to persist on this entry + * (TOMBSTONE, SINGLE_DELETE). HAS_TTL and VLOG_REF are + * derived from ttl and vlog_offset. passing 1 (a bare + * tombstone) stays valid because 1 == TOMBSTONE. 
+ * @return 0 on success, -1 on failure + */ +int btree_builder_add(btree_builder_t *builder, const uint8_t *key, size_t key_size, + const uint8_t *value, size_t value_size, uint64_t vlog_offset, uint64_t seq, + int64_t ttl, uint8_t entry_flags); + +/** + * btree_builder_finish + * finalizes the B+tree construction + * @param builder the builder + * @param tree output pointer to completed tree + * @return 0 on success, -1 on failure + */ +int btree_builder_finish(btree_builder_t *builder, btree_t **tree); + +/** + * btree_builder_free + * frees builder resources (call after finish or on error) + * @param builder the builder to free + */ +void btree_builder_free(btree_builder_t *builder); + +/** + * btree_open + * opens an existing B+tree from storage + * tidesdb core reads sstable metadata and passes offsets to btree + * @param tree output pointer to tree + * @param bm block manager containing the tree + * @param config configuration (comparator must match what was used to build) + * @param root_offset offset of root node (from sstable metadata) + * @param first_leaf_offset offset of first leaf for forward iteration + * @param last_leaf_offset offset of last leaf for backward iteration + * @return 0 on success, -1 on failure + */ +int btree_open(btree_t **tree, block_manager_t *bm, const btree_config_t *config, + int64_t root_offset, int64_t first_leaf_offset, int64_t last_leaf_offset); + +/** + * btree_get_at_seq + * retrieves the version of a key visible at a sequence ceiling. a key may have + * several versions in one tree (a flush or compaction retains a version chain); + * this returns the one with the highest seq that does not exceed seq_ceiling, + * or -1 if the key has no version at or below it. 
/**
 * btree_get_entry_count
 * returns total number of entries
 * @param tree the B+tree
 * @return total number of entries in the tree
 */
uint64_t btree_get_entry_count(const btree_t *tree);
/**
 * btree_get_max_seq
 * returns maximum sequence number in tree
 * @param tree the B+tree
 * @return maximum sequence number in the tree
 */
uint64_t btree_get_max_seq(const btree_t *tree);
start + */ +int btree_cursor_prev(btree_cursor_t *cursor); + +/** + * btree_cursor_seek + * positions cursor at first key >= target + * @param cursor the cursor + * @param key target key + * @param key_size size of target key + * @return 0 on success, -1 on failure + */ +int btree_cursor_seek(btree_cursor_t *cursor, const uint8_t *key, size_t key_size); + +/** + * btree_cursor_seek_for_prev + * positions cursor at last key <= target + * @param cursor the cursor + * @param key target key + * @param key_size size of target key + * @return 0 on success, -1 on failure + */ +int btree_cursor_seek_for_prev(btree_cursor_t *cursor, const uint8_t *key, size_t key_size); + +/** + * btree_cursor_goto_first + * moves cursor to first entry + * @param cursor the cursor + * @return 0 on success, -1 on failure + */ +int btree_cursor_goto_first(btree_cursor_t *cursor); + +/** + * btree_cursor_goto_last + * moves cursor to last entry + * @param cursor the cursor + * @return 0 on success, -1 on failure + */ +int btree_cursor_goto_last(btree_cursor_t *cursor); + +/** + * btree_cursor_valid + * checks if cursor is at a valid position + * @param cursor the cursor + * @return 1 if valid, 0 if not, -1 on error + */ +int btree_cursor_valid(btree_cursor_t *cursor); + +/** + * btree_cursor_get + * gets entry at current cursor position + * @param cursor the cursor + * @param key output key pointer (do not free, valid until cursor moves) + * @param key_size output key size + * @param value output value pointer (do not free, valid until cursor moves) + * @param value_size output value size + * @param vlog_offset output vlog offset (0 if inline) + * @param seq output sequence number + * @param ttl output time-to-live + * @param deleted output tombstone flag + * @return 0 on success, -1 on failure + */ +int btree_cursor_get(btree_cursor_t *cursor, uint8_t **key, size_t *key_size, uint8_t **value, + size_t *value_size, uint64_t *vlog_offset, uint64_t *seq, int64_t *ttl, + uint8_t *deleted); + +/** 
+ * btree_cursor_has_next + * checks if cursor has next entry + * @param cursor the cursor + * @return 1 if has next, 0 if not, -1 on error + */ +int btree_cursor_has_next(btree_cursor_t *cursor); + +/** + * btree_cursor_has_prev + * checks if cursor has previous entry + * @param cursor the cursor + * @return 1 if has prev, 0 if not, -1 on error + */ +int btree_cursor_has_prev(btree_cursor_t *cursor); + +/** + * btree_cursor_free + * frees cursor resources + * @param cursor the cursor to free + */ +void btree_cursor_free(btree_cursor_t *cursor); + +/** + * btree_node_free + * frees a node and its contents + * @param node the node to free + */ +void btree_node_free(btree_node_t *node); + +/** + * btree_node_read + * reads a node from storage + * @param bm block manager + * @param offset block offset + * @param node output pointer to node + * @return 0 on success, -1 on failure + */ +int btree_node_read(block_manager_t *bm, int64_t offset, btree_node_t **node); + +/** + * btree_node_read_with_compression + * reads a node from storage with decompression support + * @param bm block manager + * @param offset node offset + * @param node output pointer to node + * @param compression_algo compression algorithm (0=none, 2=lz4, 3=zstd, 4=lz4_fast) + * @return 0 on success, -1 on failure + */ +int btree_node_read_with_compression(block_manager_t *bm, int64_t offset, btree_node_t **node, + int compression_algo); + +/** + * btree_format_cache_key_prefix + * write the producer-side cache key prefix for a btree owned by an sstable whose + * cache_key_prefix value is the given uint64. the bytes produced match what + * btree_node_read_cached prepends before the per-node offset, so callers can pass them + * to clock_cache_delete_by_prefix to invalidate every cache entry for one btree. 
+ * + * @param cache_key_prefix the precomputed prefix value (see tidesdb_sstable_t) + * @param out output buffer; must be at least BTREE_CACHE_KEY_SIZE bytes + * @return number of bytes written (no trailing null) + */ +int btree_format_cache_key_prefix(uint64_t cache_key_prefix, char *out); + +#endif /* __BTREE_H__ */ diff --git a/storage/tidesdb/libtidesdb/src/clock_cache.c b/storage/tidesdb/libtidesdb/src/clock_cache.c new file mode 100644 index 0000000000000..cf6af76ce1bde --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/clock_cache.c @@ -0,0 +1,1408 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "clock_cache.h" + +#include "../external/xxhash.h" + +#define CLOCK_CACHE_PARTITION_FULL_THRESHOLD 85 +#define CLOCK_CACHE_REF_BIT 1u +#define CLOCK_CACHE_READER_INC 2u +#define CLOCK_CACHE_REF_MASK ((uint8_t)(~1u & 0xFFu)) +#define CLOCK_CACHE_HAS_READERS(ref) (((ref)&CLOCK_CACHE_REF_MASK) != 0) +/* reader field (bits 1-7) is saturated, all reader bits set == 127 concurrent readers. + * one more READER_INC would carry out of the byte and wrap the field to 0, which would make + * HAS_READERS read false while readers are live -- free_entry would then free under them. 
/* upper bound on the number of distinct L3 groups detect_l3_groups will track and the
 * sysfs path buffer used to read each cpu's shared-cache id */
#define CLOCK_CACHE_MAX_L3_GROUPS  64
#define CLOCK_CACHE_SYSFS_PATH_MAX 128

/**
 * detect_l3_groups
 * detect L3 cache topology by reading sysfs on Linux
 * groups CPUs by shared L3 cache (CCX on AMD, monolithic on Intel)
 *
 * every value written to cpu_to_group is strictly less than the returned group count.
 * CPUs whose L3 id cannot be read stay in group 0. when more than
 * CLOCK_CACHE_MAX_L3_GROUPS distinct ids are seen, the extra ids are folded onto
 * existing groups (id % CLOCK_CACHE_MAX_L3_GROUPS) instead of being assigned a group
 * index equal to the group count.
 *
 * @param num_cpus number of CPUs to probe
 * @param cpu_to_group output array mapping CPU ID -> group ID (at least num_cpus bytes)
 * @return number of L3 groups (power of 2), or 1 if detection fails or the arguments
 *         are unusable (num_cpus <= 0 or cpu_to_group == NULL)
 */
static int detect_l3_groups(int num_cpus, uint8_t *cpu_to_group)
{
    /* a negative count would turn into a huge size_t in memset below */
    if (num_cpus <= 0 || cpu_to_group == NULL) return 1;

    memset(cpu_to_group, 0, (size_t)num_cpus);

#if defined(__linux__)
    int seen_ids[CLOCK_CACHE_MAX_L3_GROUPS];
    int num_groups = 0;

    for (int cpu = 0; cpu < num_cpus; cpu++)
    {
        char path[CLOCK_CACHE_SYSFS_PATH_MAX];
        snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cache/index3/id", cpu);
        FILE *f = fopen(path, "r");
        if (!f) continue;

        int id = -1;
        if (fscanf(f, "%d", &id) != 1) id = -1;
        fclose(f);
        if (id < 0) continue;

        /* look the id up among the groups seen so far */
        int g = 0;
        while (g < num_groups && seen_ids[g] != id) g++;

        if (g == num_groups)
        {
            if (num_groups < CLOCK_CACHE_MAX_L3_GROUPS)
            {
                seen_ids[num_groups++] = id;
            }
            else
            {
                /* table is full -- g would be one past the last valid group and the
                 * mapping is used as an index by get_local_partition. fold the id onto
                 * an existing group; CPUs sharing an id still land in the same group */
                g = id % CLOCK_CACHE_MAX_L3_GROUPS;
            }
        }
        cpu_to_group[cpu] = (uint8_t)g;
    }

    if (num_groups > 1)
    {
        /* we round down to power of 2 for masking */
        int p = 1;
        while (p * 2 <= num_groups) p <<= 1;
        if (p < num_groups)
        {
            for (int cpu = 0; cpu < num_cpus; cpu++)
                cpu_to_group[cpu] = (uint8_t)(cpu_to_group[cpu] % p);
        }
        return p;
    }
#endif
    return 1;
}
+ * @param cache the cache + * @return total bytes used + */ +static inline size_t clock_cache_sum_bytes(const clock_cache_t *cache) +{ + size_t total = 0; + for (size_t i = 0; i < cache->num_partitions; i++) + { + total += atomic_load_explicit(&cache->partitions[i].bytes_used, memory_order_relaxed); + } + return total; +} + +/** + * entry_size + * compute total entry size + * @param key_len key length + * @param payload_len payload length + * @return total entry size + */ +static inline size_t entry_size(const size_t key_len, const size_t payload_len) +{ + return CLOCK_CACHE_ALIGN_UP(key_len, CLOCK_CACHE_PAYLOAD_ALIGN) + payload_len + + sizeof(clock_cache_entry_t); +} + +/** + * compute_hash + * compute full hash for key + * @param key the key + * @param key_len the key length + * @return hash + */ +static inline uint64_t compute_hash(const char *key, const size_t key_len) +{ + return XXH3_64bits(key, key_len); +} + +/** + * hash_table_insert + * insert slot into hash index with linear probing + * @param partition the partition + * @param hash the hash + * @param slot_idx the slot index + */ +static void hash_table_insert(clock_cache_partition_t *partition, uint64_t hash, + const size_t slot_idx) +{ + clock_cache_entry_t *slot = &partition->slots[slot_idx]; + + /* we store hash in entry for verification */ + atomic_store_explicit(&slot->cached_hash, hash, memory_order_release); + + /* we insert into hash index with linear probing */ + const size_t idx = hash & partition->hash_mask; + const size_t max_probe = (partition->hash_index_size < CLOCK_CACHE_MAX_HASH_PROBE) + ? 
partition->hash_index_size + : CLOCK_CACHE_MAX_HASH_PROBE; + for (size_t probe = 0; probe < max_probe; probe++) + { + const size_t pos = (idx + probe) & partition->hash_mask; + int32_t expected = -1; + + /* we try to claim this hash index slot + * weak CAS is sufficient since we're in a probe loop -- spurious failure + * just advances to the next probe position */ + if (atomic_compare_exchange_weak(&partition->hash_index[pos], &expected, (int32_t)slot_idx)) + { + return; + } + + /* CAS failed; expected now holds the current value (CAS updates it on failure). + * we check if this slot already points to our entry (reuse case) */ + if (expected == (int32_t)slot_idx) + { + return; /* already indexed */ + } + } +} + +/** + * hash_table_remove + * remove slot from hash index + * @param partition the partition + * @param hash the hash + * @param slot_idx the slot index + */ +static void hash_table_remove(clock_cache_partition_t *partition, const uint64_t hash, + const size_t slot_idx) +{ + const size_t idx = hash & partition->hash_mask; + const size_t max_probe = (partition->hash_index_size < CLOCK_CACHE_MAX_HASH_PROBE) + ? 
partition->hash_index_size + : CLOCK_CACHE_MAX_HASH_PROBE; + size_t removed_pos = SIZE_MAX; + + /* we find the entry to remove */ + for (size_t probe = 0; probe < max_probe; probe++) + { + const size_t pos = (idx + probe) & partition->hash_mask; + int32_t current = atomic_load_explicit(&partition->hash_index[pos], memory_order_acquire); + + if (current == (int32_t)slot_idx) + { + removed_pos = pos; + break; + } + + if (current == -1) + { + return; /* entry not in index */ + } + } + + if (removed_pos == SIZE_MAX) return; + + /* backward-shift deletion, we shift subsequent entries back to fill the gap + * this preserves the linear probing chain so lookups don't break */ + size_t empty = removed_pos; + for (size_t step = 1; step < max_probe; step++) + { + const size_t candidate = (removed_pos + step) & partition->hash_mask; + int32_t cand_slot = + atomic_load_explicit(&partition->hash_index[candidate], memory_order_acquire); + + if (cand_slot == -1) + { + break; /* end of cluster */ + } + + /* we check if this entry's ideal position is at or before the empty slot + * if so, shift it back to fill the gap */ + uint64_t cand_hash = + atomic_load_explicit(&partition->slots[cand_slot].cached_hash, memory_order_relaxed); + const size_t cand_ideal = cand_hash & partition->hash_mask; + + /* entry belongs at or before the empty slot if moving it would bring it + * closer to (or keep it at) its ideal position. 
+ * with wrapping -- entry is displaced if its ideal position is in the range + * (empty, candidate] on the circular index, i.e., it does not need to pass + * through the empty slot to reach candidate from ideal position */ + int displaced; + if (empty <= candidate) + displaced = (cand_ideal <= empty || cand_ideal > candidate); + else + displaced = (cand_ideal <= empty && cand_ideal > candidate); + + if (displaced) + { + atomic_store_explicit(&partition->hash_index[empty], cand_slot, memory_order_release); + empty = candidate; + } + } + + /* we clear the final empty position */ + atomic_store_explicit(&partition->hash_index[empty], -1, memory_order_release); +} + +/** + * try_match_entry + * @param entry the entry + * @param key the key + * @param key_len the key length + * @param target_hash the target hash + * @return the entry or NULL if not found + */ +/* acquire a reader ref, refusing at saturation (see CLOCK_CACHE_READERS_SATURATED). + * returns 1 with a ref held, 0 if already at max readers */ +static inline int cc_try_pin_reader(clock_cache_entry_t *entry) +{ + uint8_t cur = atomic_load_explicit(&entry->ref_bit, memory_order_relaxed); + for (;;) + { + if (CLOCK_CACHE_READERS_SATURATED(cur)) return 0; + const uint8_t desired = (uint8_t)(cur + CLOCK_CACHE_READER_INC); + if (atomic_compare_exchange_weak_explicit(&entry->ref_bit, &cur, desired, + memory_order_acq_rel, memory_order_relaxed)) + return 1; + /* cur was reloaded with the current value on CAS failure -- retry */ + } +} + +static clock_cache_entry_t *try_match_entry(clock_cache_entry_t *entry, const char *key, + size_t key_len, uint64_t target_hash) +{ + uint8_t state = atomic_load_explicit(&entry->state, memory_order_relaxed); + if (state != ENTRY_VALID) return NULL; + + uint64_t entry_hash = atomic_load_explicit(&entry->cached_hash, memory_order_relaxed); + if (entry_hash != target_hash) return NULL; + + size_t entry_key_len = atomic_load_explicit(&entry->key_len, memory_order_relaxed); + if 
(entry_key_len != key_len) return NULL; + + if (!cc_try_pin_reader(entry)) return NULL; + + /* we re-validate state after acquiring ref (entry may have been evicted between + * our pre-checks and the ref acquisition) */ + state = atomic_load_explicit(&entry->state, memory_order_acquire); + if (state != ENTRY_VALID) + { + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, memory_order_acq_rel); + return NULL; + } + + char *entry_key = atomic_load_explicit(&entry->key, memory_order_acquire); + if (!entry_key) + { + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, memory_order_acq_rel); + return NULL; + } + + if (memcmp(entry_key, key, key_len) != 0) + { + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, memory_order_acq_rel); + return NULL; + } + + /* match! return with reader ref HELD -- caller must release via + * atomic_fetch_sub(ref_bit, CLOCK_CACHE_READER_INC) when done */ + return entry; +} + +static clock_cache_entry_t *find_entry_with_hash(clock_cache_partition_t *partition, + const char *key, const size_t key_len, + const uint64_t target_hash) +{ + /* we cache immutable struct fields in registers to prevent reloads across + * atomic barriers in try_match_entry (acq_rel on ref_bit acts as compiler barrier, + * forcing the compiler to reload partition->hash_mask etc. from memory each iteration) */ + const size_t hash_mask = partition->hash_mask; + _Atomic(int32_t) *const hash_index = partition->hash_index; + clock_cache_entry_t *const slots = partition->slots; + + const size_t idx = target_hash & hash_mask; + const size_t max_probe = (partition->hash_index_size < CLOCK_CACHE_MAX_HASH_PROBE) + ? 
partition->hash_index_size + : CLOCK_CACHE_MAX_HASH_PROBE; + /* we prefetch the first hash index entry before the loop to warm the cache line */ + PREFETCH_READ(&hash_index[idx]); + + for (size_t probe = 0; probe < max_probe; probe++) + { + const size_t pos = (idx + probe) & hash_mask; + int32_t slot_idx = atomic_load_explicit(&hash_index[pos], memory_order_relaxed); + + if (slot_idx == -1) + { + /* empty slot in index, entry not found */ + return NULL; + } + + /* we prefetch slot data + next hash index entry simultaneously + * this gives memory subsystem time to warm both cache lines + * before the next iteration's hash_index load and this iteration's try_match */ + PREFETCH_READ(&slots[slot_idx]); + if (probe + 1 < max_probe) + { + const size_t next_pos = (idx + probe + 1) & hash_mask; + PREFETCH_READ(&hash_index[next_pos]); + } + + clock_cache_entry_t *entry = &slots[slot_idx]; + clock_cache_entry_t *match = try_match_entry(entry, key, key_len, target_hash); + if (match) return match; + } + + return NULL; +} + +/** + * free_entry + * free entry contents -- lock-free with atomic state transitions + * @param cache the cache + * @param partition the partition + * @param entry the entry + */ +static void free_entry(clock_cache_t *cache, clock_cache_partition_t *partition, + clock_cache_entry_t *entry) +{ + /* we try to claim entry for deletion using CAS */ + uint8_t expected = ENTRY_VALID; + if (!atomic_compare_exchange_strong(&entry->state, &expected, ENTRY_DELETING)) + { + /* someone else is deleting or entry is already empty */ + return; + } + + char *key = atomic_load_explicit(&entry->key, memory_order_acquire); + void *payload = atomic_load_explicit(&entry->payload, memory_order_acquire); + const size_t plen = atomic_load_explicit(&entry->payload_len, memory_order_acquire); + const size_t klen = atomic_load_explicit(&entry->key_len, memory_order_acquire); + + if (!key || !payload) + { + /* invalid entry, just mark as empty */ + 
atomic_store_explicit(&entry->state, ENTRY_EMPTY, memory_order_release); + return; + } + + /* we check if entry is being read (upper bits indicate active readers) */ + uint8_t ref = atomic_load_explicit(&entry->ref_bit, memory_order_acquire); + if (CLOCK_CACHE_HAS_READERS(ref)) + { + /* entry is being read by active readers, revert state and abort */ + atomic_store_explicit(&entry->state, ENTRY_VALID, memory_order_release); + return; + } + + /* we mark hash entry as deleted (tombstone) -- but keep back-pointer for reuse + * we use cached hash to avoid redundant XXH3 recomputation */ + const uint64_t hash = atomic_load_explicit(&entry->cached_hash, memory_order_relaxed); + const size_t slot_idx = entry - partition->slots; + hash_table_remove(partition, hash, slot_idx); + + atomic_store_explicit(&entry->key, NULL, memory_order_release); + atomic_store_explicit(&entry->payload, NULL, memory_order_release); + + /* we must re-check ref_bit after clearing pointers, a reader may have snuck in + * between our first check and clearing pointers + * the release stores above + this acquire load form a release-acquire pair */ + ref = atomic_load_explicit(&entry->ref_bit, memory_order_acquire); + if (CLOCK_CACHE_HAS_READERS(ref)) + { + /* a reader incremented ref_bit after we started deleting + * restore pointers and revert state, we must let the reader finish */ + atomic_store_explicit(&entry->key, key, memory_order_release); + atomic_store_explicit(&entry->payload, payload, memory_order_release); + atomic_store_explicit(&entry->state, ENTRY_VALID, memory_order_release); + hash_table_insert(partition, hash, slot_idx); + return; + } + + if (cache->evict_callback) + { + cache->evict_callback(payload, plen); + } + + /* payload is embedded in same allocation as key -- single free */ + free(key); + atomic_store_explicit(&entry->key_len, 0, memory_order_release); + atomic_store_explicit(&entry->payload_len, 0, memory_order_release); + atomic_store_explicit(&entry->ref_bit, 0, 
memory_order_release); + + const size_t ext_bytes = atomic_load_explicit(&entry->external_bytes, memory_order_relaxed); + + const size_t freed_bytes = entry_size(klen, plen) + ext_bytes; + atomic_fetch_sub_explicit(&partition->occupied_count, 1, memory_order_relaxed); + atomic_fetch_sub_explicit(&partition->bytes_used, freed_bytes, memory_order_relaxed); + + /* we transition to empty state */ + atomic_store_explicit(&entry->state, ENTRY_EMPTY, memory_order_release); +} + +/** + * evict_for_space + * CLOCK eviction that targets VALID entries to free memory. + * skips EMPTY slots (unlike clock_evict which returns them). + * uses two passes -- first pass clears ref_bits, second pass evicts. + * @param cache the cache + * @param partition the partition + * @return 1 if an entry was evicted, 0 if no evictable entry found + */ +static int evict_for_space(clock_cache_t *cache, clock_cache_partition_t *partition) +{ + const size_t slots_mask = partition->slots_mask; + const size_t start = + atomic_fetch_add_explicit(&partition->clock_hand, 1, memory_order_relaxed) & slots_mask; + + /* we limit scan distance based on occupied count rather than total slots. + * when the partition is sparsely populated (e.g., 128 entries in 8192 slots), + * scanning all slots wastes 98%+ of iterations on EMPTY slots. + * we scan at most occupied_count * CLOCK_CACHE_EVICT_SCAN_MULT slots (gives high + * probability of finding a victim even with clustering) with a minimum of + * CLOCK_CACHE_EVICT_SCAN_MIN. 
*/ + const size_t occupied = atomic_load_explicit(&partition->occupied_count, memory_order_relaxed); + size_t scan_limit = occupied * CLOCK_CACHE_EVICT_SCAN_MULT; + if (scan_limit < CLOCK_CACHE_EVICT_SCAN_MIN) scan_limit = CLOCK_CACHE_EVICT_SCAN_MIN; + if (scan_limit > partition->num_slots) scan_limit = partition->num_slots; + + /* pass 0 clears ref_bits, pass 1 evicts entries with ref_bit=0 */ + for (int pass = 0; pass < 2; pass++) + { + for (size_t i = 0; i < scan_limit; i++) + { + const size_t hand = (start + i) & slots_mask; + clock_cache_entry_t *entry = &partition->slots[hand]; + + const uint8_t state = atomic_load_explicit(&entry->state, memory_order_acquire); + if (state != ENTRY_VALID) continue; + + const uint8_t ref = atomic_load_explicit(&entry->ref_bit, memory_order_acquire); + if (CLOCK_CACHE_HAS_READERS(ref)) continue; + + if ((ref & CLOCK_CACHE_REF_BIT) == 0) + { + /* no ref_bit, no readers -- evict */ + free_entry(cache, partition, entry); + atomic_store_explicit(&partition->clock_hand, hand + 1, memory_order_relaxed); + return 1; + } + + /* ref_bit set -- clear it (second chance), will evict on next pass */ + atomic_fetch_and_explicit(&entry->ref_bit, CLOCK_CACHE_REF_MASK, memory_order_relaxed); + } + } + + return 0; +} + +/** + * clock_evict + * CLOCK second-chance eviction -- finds or frees a slot for new entry insertion + * @param cache the cache + * @param partition the partition + * @return slot index of an available (empty or just-evicted) entry + */ +static size_t clock_evict(clock_cache_t *cache, clock_cache_partition_t *partition) +{ + size_t iterations = 0; + const size_t max_iterations = partition->num_slots; + + /* we start from thread-local position to reduce contention on clock_hand */ + static THREAD_LOCAL size_t thread_hand = 0; + if (thread_hand == 0) + { + thread_hand = (size_t)TDB_THREAD_ID(); + if (thread_hand == 0) thread_hand = 1; /* we ensure non-zero */ + } + const size_t slots_mask = partition->slots_mask; + const size_t 
start_pos = thread_hand & slots_mask; + + while (iterations < max_iterations) + { + /* we use local counter with occasional sync to global clock_hand */ + const size_t hand = (start_pos + iterations) & slots_mask; + clock_cache_entry_t *entry = &partition->slots[hand]; + + /* we prefetch 2 entries ahead to overlap memory latency with eviction logic */ + const size_t pf1 = (hand + 1) & slots_mask; + const size_t pf2 = (hand + 2) & slots_mask; + PREFETCH_READ(&partition->slots[pf1]); + PREFETCH_READ(&partition->slots[pf2]); + + /* we check state atomically */ + uint8_t state = atomic_load_explicit(&entry->state, memory_order_acquire); + + if (state == ENTRY_EMPTY) + { + /* found empty slot -- we update thread position for next time */ + thread_hand = hand + 1; + return hand; + } + + if (state != ENTRY_VALID) + { + iterations++; + continue; + } + + /* we check reference bit and active readers */ + uint8_t ref = atomic_load_explicit(&entry->ref_bit, memory_order_acquire); + if (CLOCK_CACHE_HAS_READERS(ref)) + { + if (ref & CLOCK_CACHE_REF_BIT) + { + atomic_fetch_and_explicit(&entry->ref_bit, CLOCK_CACHE_REF_MASK, + memory_order_relaxed); + } + iterations++; + continue; + } + + if ((ref & CLOCK_CACHE_REF_BIT) == 0) + { + /* found victim -- try to evict */ + PREFETCH_WRITE(entry); + free_entry(cache, partition, entry); + + /* we update thread position for next time */ + thread_hand = hand + 1; + return hand; + } + + atomic_fetch_and_explicit(&entry->ref_bit, CLOCK_CACHE_REF_MASK, memory_order_relaxed); + + iterations++; + } + + /* we try to evict at current position as a fallback*/ + size_t hand = atomic_load_explicit(&partition->clock_hand, memory_order_acquire) & slots_mask; + clock_cache_entry_t *entry = &partition->slots[hand]; + PREFETCH_WRITE(entry); + uint8_t state = atomic_load_explicit(&entry->state, memory_order_acquire); + + if (state == ENTRY_VALID) + { + free_entry(cache, partition, entry); + } + + return hand; +} + +/** + * ensure_space + * ensure space in 
partition + * @param cache the cache + * @param partition the partition + * @param required_bytes the required bytes + * @return 0 on success, -1 on failure + */ +static int ensure_space(clock_cache_t *cache, clock_cache_partition_t *partition, + const size_t required_bytes) +{ + const size_t occupied = atomic_load_explicit(&partition->occupied_count, memory_order_relaxed); + + /* we check global byte budget (not per-partition) to avoid premature eviction + * when hash distribution is uneven. a hot partition can use more than its "fair share" + * as long as the total cache stays within budget. + * we sum per-partition bytes_used instead of reading a single global atomic + * to eliminate contention on the put/evict hot paths at high core counts. */ + const size_t global_bytes = clock_cache_sum_bytes(cache); + if (global_bytes + required_bytes <= cache->max_bytes && occupied < partition->evict_threshold) + { + return 0; + } + + /* byte-based eviction -- we enforce global byte budget via local eviction. + * we evict from this partition to reduce global pressure. */ + if (cache->max_bytes > 0 && global_bytes + required_bytes > cache->max_bytes) + { + size_t cur_global = global_bytes; + size_t evict_rounds = 0; + const size_t max_evictions = occupied; + while (cur_global + required_bytes > cache->max_bytes && evict_rounds < max_evictions) + { + if (!evict_for_space(cache, partition)) break; + cur_global = clock_cache_sum_bytes(cache); + evict_rounds++; + } + + /* if local partition eviction wasn't enough, try other partitions. + * this handles the case where entries are spread across many partitions + * and a single partition can't free enough to meet the global byte budget + * (common with large external_bytes like btree nodes). 
*/ + if (cur_global + required_bytes > cache->max_bytes) + { + const size_t local_idx = (size_t)(partition - cache->partitions); + for (size_t p = 1; p < cache->num_partitions; p++) + { + const size_t other_idx = (local_idx + p) & (cache->num_partitions - 1); + clock_cache_partition_t *other = &cache->partitions[other_idx]; + const size_t other_occ = + atomic_load_explicit(&other->occupied_count, memory_order_relaxed); + if (other_occ == 0) continue; + + size_t rounds = 0; + while (rounds < other_occ) + { + if (!evict_for_space(cache, other)) break; + rounds++; + const size_t now = clock_cache_sum_bytes(cache); + if (now + required_bytes <= cache->max_bytes) goto eviction_done; + } + } + eviction_done:; + } + } + + /* slot-count-based eviction -- prevent hash table overload */ + if (occupied >= partition->evict_threshold) + { + clock_evict(cache, partition); + } + + return 0; +} + +void clock_cache_compute_config(const size_t max_bytes, cache_config_t *config) +{ + if (!config) return; + + const int num_cpus = tdb_get_cpu_count(); + + size_t num_partitions = (size_t)num_cpus * CLOCK_CACHE_PARTITIONS_PER_CPU; + if (num_partitions < CLOCK_CACHE_MIN_PARTITIONS) num_partitions = CLOCK_CACHE_MIN_PARTITIONS; + if (num_partitions > CLOCK_CACHE_MAX_PARTITIONS) num_partitions = CLOCK_CACHE_MAX_PARTITIONS; + + /* we round up to next power of 2 for efficient masking */ + size_t p = 1; + while (p < num_partitions) p <<= 1; + num_partitions = p; + + /* slot count is sized for hash table efficiency (low load factor), not for byte budget. + * when caller specifies avg_entry_size (e.g., btree 64KB nodes), use it to avoid + * creating vastly more slots than entries that will fit in the byte budget. + * otherwise use a small default so that many small entries probe efficiently. */ + const size_t avg_entry_size = + (config->avg_entry_size > 0) ? 
config->avg_entry_size : CLOCK_CACHE_AVG_ENTRY_SIZE; + size_t total_entries = max_bytes / avg_entry_size; + if (total_entries < num_partitions) total_entries = num_partitions; + + /* we distribute entries across partitions */ + size_t slots_per_partition = total_entries / num_partitions; + + /* we clamp to reasonable range -- 64-8192 slots per partition */ + if (slots_per_partition < CLOCK_CACHE_MIN_SLOTS_PER_PARTITION) + slots_per_partition = CLOCK_CACHE_MIN_SLOTS_PER_PARTITION; + if (slots_per_partition > CLOCK_CACHE_MAX_SLOTS_PER_PARTITION) + slots_per_partition = CLOCK_CACHE_MAX_SLOTS_PER_PARTITION; + + /* we round up to next power of 2 for better memory alignment */ + size_t s = CLOCK_CACHE_MIN_SLOTS_PER_PARTITION; + while (s < slots_per_partition) s <<= 1; + slots_per_partition = s; + + config->max_bytes = max_bytes; + config->num_partitions = num_partitions; + config->slots_per_partition = slots_per_partition; + config->evict_callback = NULL; /* no callback by default */ +} + +clock_cache_t *clock_cache_create(const cache_config_t *config) +{ + if (!config || config->num_partitions == 0 || config->slots_per_partition == 0) + { + return NULL; + } + + clock_cache_t *cache = (clock_cache_t *)calloc(1, sizeof(clock_cache_t)); + if (!cache) return NULL; + + cache->num_partitions = config->num_partitions; + cache->max_bytes = config->max_bytes; + cache->partition_mask = config->num_partitions - 1; /* assumes power of 2 */ + cache->evict_callback = config->evict_callback; /* store eviction callback */ + atomic_store_explicit(&cache->total_bytes, 0, memory_order_relaxed); + atomic_store_explicit(&cache->hits, 0, memory_order_relaxed); + atomic_store_explicit(&cache->misses, 0, memory_order_relaxed); + atomic_store_explicit(&cache->shutdown, 0, memory_order_relaxed); + + /** we detect L3/CCX topology for NUMA-aware partition routing */ + cache->max_cpus = tdb_get_cpu_count(); + if (cache->max_cpus > CLOCK_CACHE_MAX_CPUS) cache->max_cpus = CLOCK_CACHE_MAX_CPUS; + 
cache->cpu_to_group = (uint8_t *)calloc((size_t)cache->max_cpus, sizeof(uint8_t)); + if (!cache->cpu_to_group) + { + free(cache); + return NULL; + } + + cache->num_groups = (size_t)detect_l3_groups(cache->max_cpus, cache->cpu_to_group); + if (cache->num_groups > config->num_partitions) cache->num_groups = 1; + cache->partitions_per_group = config->num_partitions / cache->num_groups; + cache->local_partition_mask = cache->partitions_per_group - 1; + + cache->partitions = + (clock_cache_partition_t *)calloc(config->num_partitions, sizeof(clock_cache_partition_t)); + if (!cache->partitions) + { + free(cache->cpu_to_group); + free(cache); + return NULL; + } + + for (size_t i = 0; i < config->num_partitions; i++) + { + clock_cache_partition_t *partition = &cache->partitions[i]; + partition->num_slots = config->slots_per_partition; + partition->slots_mask = config->slots_per_partition - 1; + partition->evict_threshold = + (config->slots_per_partition * CLOCK_CACHE_PARTITION_FULL_THRESHOLD) / 100; + atomic_store_explicit(&partition->clock_hand, 0, memory_order_relaxed); + atomic_store_explicit(&partition->occupied_count, 0, memory_order_relaxed); + atomic_store_explicit(&partition->bytes_used, 0, memory_order_relaxed); + atomic_store_explicit(&partition->hits, 0, memory_order_relaxed); + atomic_store_explicit(&partition->misses, 0, memory_order_relaxed); + + /* we calculate hash index size (2x slots for low collision rate) */ + partition->hash_index_size = + (config->slots_per_partition * CLOCK_CACHE_HASH_INDEX_MULTIPLIER_NUM) / + CLOCK_CACHE_HASH_INDEX_MULTIPLIER_DEN; + /* we round up to next power of 2 */ + size_t size = 1; + while (size < partition->hash_index_size) size <<= 1; + partition->hash_index_size = size; + partition->hash_mask = size - 1; + + partition->slots = + (clock_cache_entry_t *)calloc(config->slots_per_partition, sizeof(clock_cache_entry_t)); + if (!partition->slots) + { + for (size_t j = 0; j < i; j++) + { + free((void 
*)cache->partitions[j].hash_index); + free(cache->partitions[j].slots); + } + free(cache->partitions); + free(cache->cpu_to_group); + free(cache); + return NULL; + } + + partition->hash_index = + (_Atomic(int32_t) *)calloc(partition->hash_index_size, sizeof(_Atomic(int32_t))); + if (!partition->hash_index) + { + free(partition->slots); + for (size_t j = 0; j < i; j++) + { + free((void *)cache->partitions[j].hash_index); + free(cache->partitions[j].slots); + } + free(cache->partitions); + free(cache->cpu_to_group); + free(cache); + return NULL; + } + + /* we initialize hash index to -1 (which is empty) */ + for (size_t j = 0; j < partition->hash_index_size; j++) + { + atomic_store_explicit(&partition->hash_index[j], -1, memory_order_relaxed); + } + + /* we initialize all entry states to EMPTY */ + for (size_t j = 0; j < partition->num_slots; j++) + { + atomic_store_explicit(&partition->slots[j].state, ENTRY_EMPTY, memory_order_relaxed); + atomic_store_explicit(&partition->slots[j].key, NULL, memory_order_relaxed); + atomic_store_explicit(&partition->slots[j].payload, NULL, memory_order_relaxed); + atomic_store_explicit(&partition->slots[j].key_len, 0, memory_order_relaxed); + atomic_store_explicit(&partition->slots[j].payload_len, 0, memory_order_relaxed); + atomic_store_explicit(&partition->slots[j].ref_bit, 0, memory_order_relaxed); + atomic_store_explicit(&partition->slots[j].cached_hash, 0, memory_order_relaxed); + } + } + + return cache; +} + +void clock_cache_destroy(clock_cache_t *cache) +{ + if (!cache) return; + + atomic_store_explicit(&cache->shutdown, 1, memory_order_release); + + /* mem fence, ensure all threads see shutdown flag */ + atomic_thread_fence(memory_order_seq_cst); + + for (size_t i = 0; i < cache->num_partitions; i++) + { + clock_cache_partition_t *partition = &cache->partitions[i]; + + /* we mark all entries as deleting first to stop new accesses */ + for (size_t j = 0; j < partition->num_slots; j++) + { + uint8_t state = 
atomic_load_explicit(&partition->slots[j].state, memory_order_acquire); + if (state == ENTRY_VALID || state == ENTRY_WRITING) + { + atomic_store_explicit(&partition->slots[j].state, ENTRY_DELETING, + memory_order_release); + } + } + + /* mem fence -- ensure all readers see DELETING state */ + atomic_thread_fence(memory_order_seq_cst); + + for (size_t j = 0; j < partition->num_slots; j++) + { + char *key = atomic_load_explicit(&partition->slots[j].key, memory_order_acquire); + void *payload = + atomic_load_explicit(&partition->slots[j].payload, memory_order_acquire); + const size_t payload_len = + atomic_load_explicit(&partition->slots[j].payload_len, memory_order_acquire); + + if (payload && cache->evict_callback) + { + cache->evict_callback(payload, payload_len); + } + + /* payload is embedded in same allocation as key -- single free */ + if (key) free(key); + } + + free((void *)partition->hash_index); + free(partition->slots); + } + + free(cache->partitions); + free(cache->cpu_to_group); + free(cache); +} + +int clock_cache_put(clock_cache_t *cache, const char *key, size_t key_len, const void *payload, + size_t payload_len, size_t external_bytes) +{ + if (!cache || !key || key_len == 0 || !payload) return -1; + + if (atomic_load_explicit(&cache->shutdown, memory_order_acquire)) return -1; + + const uint64_t hash = compute_hash(key, key_len); + const size_t partition_idx = get_local_partition(cache, hash); + clock_cache_partition_t *partition = &cache->partitions[partition_idx]; + const size_t entry_bytes = entry_size(key_len, payload_len) + external_bytes; + + /* we try to find and invalidate existing entry (best-effort update) */ + clock_cache_entry_t *old_entry = find_entry_with_hash(partition, key, key_len, hash); + if (old_entry) + { + /* we release reader ref before free_entry (which checks for active readers) */ + atomic_fetch_sub_explicit(&old_entry->ref_bit, CLOCK_CACHE_READER_INC, + memory_order_acq_rel); + free_entry(cache, partition, old_entry); + } + 
+ /* we always ensure space to enforce max_bytes limit */ + ensure_space(cache, partition, entry_bytes); + + clock_cache_entry_t *entry = NULL; + size_t slot_idx = 0; + const int max_retries = CLOCK_CACHE_MAX_PUT_RETRIES; + + for (int retry = 0; retry < max_retries; retry++) + { + slot_idx = clock_evict(cache, partition); + entry = &partition->slots[slot_idx]; + PREFETCH_WRITE(entry); + + /* we try to claim slot with CAS, EMPTY --> WRITING */ + uint8_t expected = ENTRY_EMPTY; + if (atomic_compare_exchange_strong(&entry->state, &expected, ENTRY_WRITING)) + { + /* got it */ + break; + } + + /* someone else claimed it, try again */ + entry = NULL; + } + + if (!entry) + { + /* failed to claim slot after retries */ + return -1; + } + + /* we own the slot now, allocate key + payload in single allocation + * payload is aligned to CLOCK_CACHE_PAYLOAD_ALIGN for safe typed access + * this halves malloc calls and improves data locality */ + const size_t aligned_key_len = CLOCK_CACHE_ALIGN_UP(key_len, CLOCK_CACHE_PAYLOAD_ALIGN); + char *new_buf = (char *)malloc(aligned_key_len + payload_len); + if (!new_buf) + { + atomic_store_explicit(&entry->state, ENTRY_EMPTY, memory_order_release); + return -1; + } + + char *new_key = new_buf; + void *new_payload = new_buf + aligned_key_len; + memcpy(new_key, key, key_len); + memcpy(new_payload, payload, payload_len); + + atomic_store_explicit(&entry->key, new_key, memory_order_release); + atomic_store_explicit(&entry->payload, new_payload, memory_order_release); + atomic_store_explicit(&entry->key_len, key_len, memory_order_release); + atomic_store_explicit(&entry->payload_len, payload_len, memory_order_release); + atomic_store_explicit(&entry->ref_bit, CLOCK_CACHE_REF_BIT, memory_order_release); + atomic_store_explicit(&entry->external_bytes, external_bytes, memory_order_release); + + /* we transition to valid, entry is now visible */ + atomic_store_explicit(&entry->state, ENTRY_VALID, memory_order_release); + + 
atomic_fetch_add_explicit(&partition->occupied_count, 1, memory_order_relaxed); + atomic_fetch_add_explicit(&partition->bytes_used, entry_bytes, memory_order_relaxed); + + hash_table_insert(partition, hash, slot_idx); + + return 0; +} + +int clock_cache_put_new(clock_cache_t *cache, const char *key, size_t key_len, const void *payload, + size_t payload_len, size_t external_bytes) +{ + if (!cache || !key || key_len == 0 || !payload) return -1; + + if (atomic_load_explicit(&cache->shutdown, memory_order_acquire)) return -1; + + const uint64_t hash = compute_hash(key, key_len); + const size_t partition_idx = get_local_partition(cache, hash); + clock_cache_partition_t *partition = &cache->partitions[partition_idx]; + const size_t entry_bytes = entry_size(key_len, payload_len) + external_bytes; + + /* skip find_entry_with_hash -- caller guarantees key is not in cache + * this saves a full hash table probe on the cache-miss-then-populate path */ + + ensure_space(cache, partition, entry_bytes); + + clock_cache_entry_t *entry = NULL; + size_t slot_idx = 0; + const int max_retries = CLOCK_CACHE_MAX_PUT_RETRIES; + + for (int retry = 0; retry < max_retries; retry++) + { + slot_idx = clock_evict(cache, partition); + entry = &partition->slots[slot_idx]; + PREFETCH_WRITE(entry); + + uint8_t expected = ENTRY_EMPTY; + if (atomic_compare_exchange_strong(&entry->state, &expected, ENTRY_WRITING)) + { + break; + } + + entry = NULL; + } + + if (!entry) + { + return -1; + } + + const size_t aligned_key_len = CLOCK_CACHE_ALIGN_UP(key_len, CLOCK_CACHE_PAYLOAD_ALIGN); + char *new_buf = (char *)malloc(aligned_key_len + payload_len); + if (!new_buf) + { + atomic_store_explicit(&entry->state, ENTRY_EMPTY, memory_order_release); + return -1; + } + + char *new_key = new_buf; + void *new_payload = new_buf + aligned_key_len; + memcpy(new_key, key, key_len); + memcpy(new_payload, payload, payload_len); + + atomic_store_explicit(&entry->key, new_key, memory_order_release); + 
atomic_store_explicit(&entry->payload, new_payload, memory_order_release); + atomic_store_explicit(&entry->key_len, key_len, memory_order_release); + atomic_store_explicit(&entry->payload_len, payload_len, memory_order_release); + atomic_store_explicit(&entry->ref_bit, CLOCK_CACHE_REF_BIT, memory_order_release); + atomic_store_explicit(&entry->external_bytes, external_bytes, memory_order_release); + + atomic_store_explicit(&entry->state, ENTRY_VALID, memory_order_release); + + atomic_fetch_add_explicit(&partition->occupied_count, 1, memory_order_relaxed); + atomic_fetch_add_explicit(&partition->bytes_used, entry_bytes, memory_order_relaxed); + + hash_table_insert(partition, hash, slot_idx); + + return 0; +} + +uint8_t *clock_cache_get(clock_cache_t *cache, const char *key, const size_t key_len, + size_t *payload_len) +{ + if (!cache || !key || key_len == 0) return NULL; + + if (atomic_load_explicit(&cache->shutdown, memory_order_acquire)) return NULL; + + const uint64_t hash = compute_hash(key, key_len); + const size_t partition_idx = get_local_partition(cache, hash); + clock_cache_partition_t *partition = &cache->partitions[partition_idx]; + + /* find_entry_with_hash returns entry with reader ref HELD (from try_match_entry) */ + clock_cache_entry_t *entry = find_entry_with_hash(partition, key, key_len, hash); + + if (!entry) + { + atomic_fetch_add_explicit(&partition->misses, 1, memory_order_relaxed); + return NULL; + } + + /* reader ref is held -- entry is protected from eviction */ + uint8_t *entry_payload = atomic_load_explicit(&entry->payload, memory_order_acquire); + size_t entry_payload_len = atomic_load_explicit(&entry->payload_len, memory_order_acquire); + + if (!entry_payload || entry_payload_len == 0) + { + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, memory_order_acq_rel); + return NULL; + } + + uint8_t *result = (uint8_t *)malloc(entry_payload_len); + if (!result) + { + atomic_fetch_sub_explicit(&entry->ref_bit, 
CLOCK_CACHE_READER_INC, memory_order_acq_rel); + return NULL; + } + + memcpy(result, entry_payload, entry_payload_len); + + /* we release reader ref and conditionally mark as recently used + * combining two atomic RMWs into one when ref bit is already set (hot entries) */ + uint8_t old_ref = + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, memory_order_acq_rel); + if (!(old_ref & CLOCK_CACHE_REF_BIT)) + { + atomic_fetch_or_explicit(&entry->ref_bit, CLOCK_CACHE_REF_BIT, memory_order_relaxed); + } + + if (payload_len) *payload_len = entry_payload_len; + + atomic_fetch_add_explicit(&partition->hits, 1, memory_order_relaxed); + return result; +} + +const uint8_t *clock_cache_get_zero_copy(clock_cache_t *cache, const char *key, + const size_t key_len, size_t *payload_len, + clock_cache_entry_t **entry_out) +{ + if (!cache || !key || key_len == 0) return NULL; + + if (atomic_load_explicit(&cache->shutdown, memory_order_acquire)) return NULL; + + const uint64_t hash = compute_hash(key, key_len); + const size_t partition_idx = get_local_partition(cache, hash); + clock_cache_partition_t *partition = &cache->partitions[partition_idx]; + + /* find_entry_with_hash returns entry with reader ref HELD (from try_match_entry) */ + clock_cache_entry_t *entry = find_entry_with_hash(partition, key, key_len, hash); + + if (!entry) + { + atomic_fetch_add_explicit(&partition->misses, 1, memory_order_relaxed); + return NULL; + } + + /* reader ref is held -- entry is protected from eviction */ + uint8_t *entry_payload = atomic_load_explicit(&entry->payload, memory_order_acquire); + size_t entry_payload_len = atomic_load_explicit(&entry->payload_len, memory_order_acquire); + + if (!entry_payload || entry_payload_len == 0) + { + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, memory_order_acq_rel); + return NULL; + } + + if (payload_len) *payload_len = entry_payload_len; + if (entry_out) *entry_out = entry; + + /* we conditionally mark as recently used 
-- skip atomic RMW when ref bit is already set + * (hot entries); caller releases ref via clock_cache_release() */ + uint8_t cur_ref = atomic_load_explicit(&entry->ref_bit, memory_order_relaxed); + if (!(cur_ref & CLOCK_CACHE_REF_BIT)) + { + atomic_fetch_or_explicit(&entry->ref_bit, CLOCK_CACHE_REF_BIT, memory_order_relaxed); + } + + atomic_fetch_add_explicit(&partition->hits, 1, memory_order_relaxed); + return entry_payload; +} + +void clock_cache_release(clock_cache_entry_t *entry) +{ + if (!entry) return; + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, memory_order_acq_rel); +} + +int clock_cache_delete(clock_cache_t *cache, const char *key, const size_t key_len) +{ + if (!cache || !key || key_len == 0) return -1; + + if (atomic_load_explicit(&cache->shutdown, memory_order_acquire)) return -1; + + const uint64_t hash = compute_hash(key, key_len); + const size_t partition_idx = get_local_partition(cache, hash); + clock_cache_partition_t *partition = &cache->partitions[partition_idx]; + + clock_cache_entry_t *entry = find_entry_with_hash(partition, key, key_len, hash); + + if (!entry) + { + return -1; + } + + /* we release reader ref before free_entry (which checks for active readers) */ + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, memory_order_acq_rel); + free_entry(cache, partition, entry); + + return 0; +} + +void clock_cache_clear(clock_cache_t *cache) +{ + if (!cache) return; + + for (size_t i = 0; i < cache->num_partitions; i++) + { + clock_cache_partition_t *partition = &cache->partitions[i]; + + for (size_t j = 0; j < partition->num_slots; j++) + { + uint8_t state = atomic_load_explicit(&partition->slots[j].state, memory_order_acquire); + if (state == ENTRY_VALID) + { + free_entry(cache, partition, &partition->slots[j]); + } + } + } + + /* we reset per-partition byte counters (may have residual from reader-held entries) */ + for (size_t i = 0; i < cache->num_partitions; i++) + { + 
atomic_store_explicit(&cache->partitions[i].bytes_used, 0, memory_order_relaxed); + } +} + +void clock_cache_get_stats(clock_cache_t *cache, clock_cache_stats_t *stats) +{ + if (!cache || !stats) return; + + /* we use tracked per-partition counters instead of scanning all slots + * this is O(num_partitions) instead of O(total_slots) */ + size_t total_bytes = 0; + size_t total_entries = 0; + uint64_t total_hits = 0; + uint64_t total_misses = 0; + for (size_t i = 0; i < cache->num_partitions; i++) + { + total_bytes += atomic_load_explicit(&cache->partitions[i].bytes_used, memory_order_relaxed); + total_entries += + atomic_load_explicit(&cache->partitions[i].occupied_count, memory_order_relaxed); + total_hits += atomic_load_explicit(&cache->partitions[i].hits, memory_order_relaxed); + total_misses += atomic_load_explicit(&cache->partitions[i].misses, memory_order_relaxed); + } + + stats->total_bytes = total_bytes; + stats->total_entries = total_entries; + stats->hits = total_hits; + stats->misses = total_misses; + stats->num_partitions = cache->num_partitions; + + const uint64_t total_accesses = stats->hits + stats->misses; + stats->hit_rate = (total_accesses > 0) ? 
((double)stats->hits / (double)total_accesses) : 0.0; +} + +size_t clock_cache_delete_by_prefix(clock_cache_t *cache, const char *prefix, + const size_t prefix_len) +{ + if (!cache || !prefix || prefix_len == 0) return 0; + + size_t count = 0; + + for (size_t p = 0; p < cache->num_partitions; p++) + { + clock_cache_partition_t *partition = &cache->partitions[p]; + + for (size_t i = 0; i < partition->num_slots; i++) + { + clock_cache_entry_t *entry = &partition->slots[i]; + + uint8_t state = atomic_load_explicit(&entry->state, memory_order_acquire); + if (state != ENTRY_VALID) continue; + + if (!cc_try_pin_reader(entry)) continue; + + state = atomic_load_explicit(&entry->state, memory_order_acquire); + if (state != ENTRY_VALID) + { + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, + memory_order_release); + continue; + } + + char *key = atomic_load_explicit(&entry->key, memory_order_acquire); + size_t key_len = atomic_load_explicit(&entry->key_len, memory_order_acquire); + + const int match = + (key && key_len >= prefix_len && memcmp(key, prefix, prefix_len) == 0); + + /** we release reader ref before calling free_entry + * free_entry checks for active readers and aborts if any are held */ + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, + memory_order_acq_rel); + + if (match) + { + free_entry(cache, partition, entry); + count++; + } + } + } + + return count; +} + +size_t clock_cache_foreach_prefix(clock_cache_t *cache, const char *prefix, size_t prefix_len, + const clock_cache_foreach_callback_t callback, void *user_data) +{ + if (!cache || !prefix || prefix_len == 0 || !callback) return 0; + + size_t count = 0; + + for (size_t p = 0; p < cache->num_partitions; p++) + { + clock_cache_partition_t *partition = &cache->partitions[p]; + + for (size_t i = 0; i < partition->num_slots; i++) + { + clock_cache_entry_t *entry = &partition->slots[i]; + + /* we check if entry is valid */ + uint8_t state = 
atomic_load_explicit(&entry->state, memory_order_acquire); + if (state != ENTRY_VALID) continue; + + if (!cc_try_pin_reader(entry)) continue; + + /* we re-verify state after incrementing ref_bit */ + state = atomic_load_explicit(&entry->state, memory_order_acquire); + if (state != ENTRY_VALID) + { + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, + memory_order_release); + continue; + } + + char *key_recheck = atomic_load_explicit(&entry->key, memory_order_acquire); + size_t key_len = atomic_load_explicit(&entry->key_len, memory_order_acquire); + if (!key_recheck || key_len < prefix_len) + { + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, + memory_order_release); + continue; + } + + /* we check prefix match */ + if (memcmp(key_recheck, prefix, prefix_len) == 0) + { + const uint8_t *payload = + atomic_load_explicit(&entry->payload, memory_order_acquire); + const size_t payload_len = + atomic_load_explicit(&entry->payload_len, memory_order_acquire); + + if (payload) + { + atomic_fetch_or_explicit(&entry->ref_bit, CLOCK_CACHE_REF_BIT, + memory_order_relaxed); + int result = callback(key_recheck, key_len, payload, payload_len, user_data); + count++; + + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, + memory_order_release); + + if (result != 0) return count; + } + else + { + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, + memory_order_release); + } + } + else + { + atomic_fetch_sub_explicit(&entry->ref_bit, CLOCK_CACHE_READER_INC, + memory_order_release); + } + } + } + + return count; +} \ No newline at end of file diff --git a/storage/tidesdb/libtidesdb/src/clock_cache.h b/storage/tidesdb/libtidesdb/src/clock_cache.h new file mode 100644 index 0000000000000..78c8e8761fd59 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/clock_cache.h @@ -0,0 +1,353 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 
2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __CLOCK_CACHE_H__ +#define __CLOCK_CACHE_H__ +#include "compat.h" + +/* forward declarations */ +typedef struct clock_cache_t clock_cache_t; +typedef struct clock_cache_partition_t clock_cache_partition_t; + +/** + * clock_cache_evict_fn + * callback function for custom cleanup when cache entry is evicted + * @param payload pointer to the payload being evicted + * @param payload_len length of the payload + */ +typedef void (*clock_cache_evict_fn)(void *payload, size_t payload_len); + +/** + * cache_config_t + * configuration for cache creation + * @param max_bytes maximum total bytes across all partitions + * @param num_partitions number of partitions (power of 2 recommended) + * @param slots_per_partition initial slots per partition + * @param evict_callback optional callback for custom cleanup on eviction (can be NULL) + */ +typedef struct +{ + size_t max_bytes; + size_t num_partitions; + size_t slots_per_partition; + size_t avg_entry_size; /* expected average entry size in bytes (0 = use default 100) */ + clock_cache_evict_fn evict_callback; +} cache_config_t; + +/** + * clock_cache_entry_t + * individual cache entry in a slot + * lock-free design using atomic state machine + * @param key atomic pointer to heap-allocated key + * @param payload atomic pointer to heap-allocated payload + * @param key_len atomic key length + * @param payload_len atomic payload length + * @param ref_bit atomic ref bit (LSB) plus reader count in upper bits + * @param 
state atomic state -- 0=empty, 1=writing, 2=valid, 3=deleting + * @param cached_hash cached hash value for this entry + * @param external_bytes caller-declared memory cost of pointed-to data + */ +typedef struct +{ + _Atomic(char *) key; + _Atomic(void *) payload; + atomic_size_t key_len; + atomic_size_t payload_len; + _Atomic(uint8_t) ref_bit; + _Atomic(uint8_t) state; + atomic_uint64_t cached_hash; + atomic_size_t external_bytes; +} clock_cache_entry_t; + +/** entry states */ +#define ENTRY_EMPTY 0 +#define ENTRY_WRITING 1 +#define ENTRY_VALID 2 +#define ENTRY_DELETING 3 + +/** cache configuration constants */ +#define CLOCK_CACHE_MAX_PUT_RETRIES 100 /* max retries for claiming a slot */ +#define CLOCK_CACHE_MIN_PARTITIONS 4 /* minimum number of partitions */ +#define CLOCK_CACHE_MAX_PARTITIONS 512 /* maximum number of partitions */ +#define CLOCK_CACHE_PARTITIONS_PER_CPU 4 /* partitions per CPU core */ +#define CLOCK_CACHE_MIN_SLOTS_PER_PARTITION 64 /* minimum slots per partition */ +#define CLOCK_CACHE_MAX_SLOTS_PER_PARTITION 8192 /* maximum slots per partition */ +#define CLOCK_CACHE_AVG_ENTRY_SIZE 100 /* estimated average entry size in bytes */ +/* hash index size = slots * 2 (2x, low load factor for fast probing) */ +#define CLOCK_CACHE_HASH_INDEX_MULTIPLIER_NUM 2 +#define CLOCK_CACHE_HASH_INDEX_MULTIPLIER_DEN 1 +#define CLOCK_CACHE_MAX_HASH_PROBE 128 /* max linear probing distance */ + +/** + * clock_cache_partition_t + * single partition + * uses hybrid design -- hash table for O(1) lookup + circular array for CLOCK eviction + * @param slots circular array of slots for CLOCK + * @param hash_index fixed-size hash index, hash --> slot_idx (-1 = empty) + * @param num_slots current number of slots (immutable after init) + * @param hash_index_size hash index size (2x num_slots for low collisions) + * @param hash_mask mask for fast modulo (immutable) + * @param slots_mask mask for fast modulo (num_slots - 1, power of 2) + * @param evict_threshold precomputed -- 
num_slots * 85 / 100 + * @param clock_hand atomic CLOCK hand position + * @param occupied_count atomic count of occupied slots + * @param bytes_used atomic bytes used in this partition + * @param hits per-partition hit counter (avoids false sharing on global counter) + * @param misses per-partition miss counter (avoids false sharing on global counter) + */ +struct clock_cache_partition_t +{ + /* cache line 0 -- cold, read-only after init */ + clock_cache_entry_t *slots; + _Atomic(int32_t) *hash_index; + size_t num_slots; + size_t hash_index_size; + size_t hash_mask; + size_t slots_mask; + size_t evict_threshold; + char _pad_cold[8]; /* pad to 64 bytes (keeps the hot atomics off this line) */ + + /* cache line 1 -- eviction path (writers/evictors only) */ + atomic_size_t clock_hand; + atomic_size_t occupied_count; + atomic_size_t bytes_used; + char _pad_evict[40]; /* pad to 64 bytes */ + + /* cache line 2 -- read-path stats (readers only) */ + atomic_uint64_t hits; + atomic_uint64_t misses; + char _pad_stats[48]; /* pad to 64 bytes */ +}; + +/** + * clock_cache_t + * main cache structure with partitions + * + * * PERFORMANCE NOTES ***** + * -- uses hybrid design -- hash table for O(1) lookup + circular array for CLOCK eviction + * -- hash index provides O(1) average-case lookups (bounded linear probing on collisions) + * -- CLOCK array enables efficient second-chance eviction without reordering + * -- for high-performance workloads + * -- use 128-512 partitions for 16+ threads to minimize cross-thread contention + * -- hash index is sized at 2x the slot count (CLOCK_CACHE_HASH_INDEX_MULTIPLIER_NUM/DEN) + * @param partitions array of partitions + * @param num_partitions number of partitions + * @param partition_mask mask for fast modulo (num_partitions - 1) + * @param max_bytes maximum total bytes + * @param total_bytes total bytes across all partitions + * @param hits cache hits + * @param misses cache misses + * @param shutdown shutdown flag -- prevents new operations + * @param evict_callback 
optional callback for custom cleanup on eviction (can be NULL) + * @param num_groups number of L3/CCX groups (1 on monolithic dies, 4 on Threadripper) + * @param partitions_per_group partitions per L3 group + * @param local_partition_mask partitions_per_group - 1 for fast modulo + * @param cpu_to_group CPU ID to L3 group mapping table + * @param max_cpus size of cpu_to_group table + */ +struct clock_cache_t +{ + clock_cache_partition_t *partitions; + size_t num_partitions; + size_t partition_mask; + size_t max_bytes; + atomic_size_t total_bytes; + atomic_uint64_t hits; + atomic_uint64_t misses; + _Atomic(uint8_t) shutdown; + clock_cache_evict_fn evict_callback; + size_t num_groups; + size_t partitions_per_group; + size_t local_partition_mask; + uint8_t *cpu_to_group; + int max_cpus; +}; + +/** + * clock_cache_stats_t + * cache statistics + * @param total_entries total number of entries + * @param total_bytes total bytes used + * @param hits cache hits + * @param misses cache misses + * @param hit_rate hit rate (hits / (hits + misses)) + * @param num_partitions number of partitions + */ +typedef struct +{ + size_t total_entries; + size_t total_bytes; + uint64_t hits; + uint64_t misses; + double hit_rate; + size_t num_partitions; +} clock_cache_stats_t; + +/** + * clock_cache_compute_config + * compute optimal cache configuration based on max_bytes and CPU count + * uses heuristics -- 1 partition per CPU core (up to 128), ~512 slots per partition + * @param max_bytes maximum total bytes for cache + * @param config output parameter for computed configuration + */ +void clock_cache_compute_config(size_t max_bytes, cache_config_t *config); + +/** + * clock_cache_create + * create a new cache with specified configuration + * @param config cache configuration + * @return pointer to new cache or NULL on failure + */ +clock_cache_t *clock_cache_create(const cache_config_t *config); + +/** + * clock_cache_destroy + * destroy the cache and free all resources + * @param cache 
the cache to destroy + */ +void clock_cache_destroy(clock_cache_t *cache); + +/** + * clock_cache_put + * insert or update a key-value pair + * @param cache the cache + * @param key the key + * @param key_len the key length + * @param payload the payload (can be any pointer type) + * @param payload_len the payload length + * @param external_bytes caller-declared memory cost of data pointed to by payload + * (e.g., heap-allocated block data). included in bytes_used accounting + * and eviction decisions. pass 0 if payload is self-contained. + * @return 0 on success, -1 on failure + */ +int clock_cache_put(clock_cache_t *cache, const char *key, size_t key_len, const void *payload, + size_t payload_len, size_t external_bytes); + +/** + * clock_cache_put_new + * insert a key-value pair that is known to not already exist in the cache + * skips the existing-entry lookup (find_entry_with_hash) for better performance + * on the cache-miss-then-populate path where we just confirmed the key is absent + * @param cache the cache + * @param key the key + * @param key_len the key length + * @param payload the payload + * @param payload_len the payload length + * @param external_bytes caller-declared memory cost of pointed-to data + * @return 0 on success, -1 on failure + */ +int clock_cache_put_new(clock_cache_t *cache, const char *key, size_t key_len, const void *payload, + size_t payload_len, size_t external_bytes); + +/** + * clock_cache_get + * retrieve a value by key (lock-free) + * @param cache the cache + * @param key the key + * @param key_len the key length + * @param payload_len output parameter for payload length + * @return allocated payload copy (caller must free) or NULL if not found + */ +uint8_t *clock_cache_get(clock_cache_t *cache, const char *key, size_t key_len, + size_t *payload_len); + +/** + * clock_cache_get_zero_copy + * retrieve a value by key without copying (zero-copy, lock-free) + * caller must call clock_cache_release() when done to decrement ref_bit 
+ * @param cache the cache + * @param key the key + * @param key_len the key length + * @param payload_len output parameter for payload length + * @param entry_out output parameter for entry pointer (needed for release) + * @return pointer to cached payload (**do not free**) or NULL if not found + */ +const uint8_t *clock_cache_get_zero_copy(clock_cache_t *cache, const char *key, size_t key_len, + size_t *payload_len, clock_cache_entry_t **entry_out); + +/** + * clock_cache_release + * release a zero-copy reference obtained from clock_cache_get_zero_copy + * @param entry the entry pointer from clock_cache_get_zero_copy + */ +void clock_cache_release(clock_cache_entry_t *entry); + +/** + * clock_cache_delete + * remove a key-value pair from cache + * @param cache the cache + * @param key the key + * @param key_len the key length + * @return 0 on success, -1 if not found + */ +int clock_cache_delete(clock_cache_t *cache, const char *key, size_t key_len); + +/** + * clock_cache_clear + * remove all entries from cache + * @param cache the cache + */ +void clock_cache_clear(clock_cache_t *cache); + +/** + * clock_cache_get_stats + * get cache statistics + * @param cache the cache + * @param stats output parameter for statistics + */ +void clock_cache_get_stats(clock_cache_t *cache, clock_cache_stats_t *stats); + +/** + * clock_cache_foreach_callback_t + * callback function for iterating over cache entries + * @param key the entry key + * @param key_len the key length + * @param payload the entry payload + * @param payload_len the payload length + * @param user_data user data passed from caller + * @return 0 to continue iteration, non-zero to stop + */ +typedef int (*clock_cache_foreach_callback_t)(const char *key, size_t key_len, + const uint8_t *payload, size_t payload_len, + void *user_data); + +/** + * clock_cache_foreach_prefix + * iterate over all entries matching a key prefix + * @param cache the cache + * @param prefix the key prefix to match + * @param 
prefix_len the prefix length + * @param callback function to call for each matching entry (return 0 to continue, non-zero to stop) + * @param user_data user data passed to callback + * @return number of entries processed + */ +size_t clock_cache_foreach_prefix(clock_cache_t *cache, const char *prefix, size_t prefix_len, + clock_cache_foreach_callback_t callback, void *user_data); + +/** + * clock_cache_delete_by_prefix + * delete all entries matching a key prefix + * unlike foreach_prefix + delete, this correctly releases reader refs before deletion + * @param cache the cache + * @param prefix the key prefix to match + * @param prefix_len the prefix length + * @return number of entries deleted + */ +size_t clock_cache_delete_by_prefix(clock_cache_t *cache, const char *prefix, size_t prefix_len); + +#endif /* __CLOCK_CACHE_H__ */ \ No newline at end of file diff --git a/storage/tidesdb/libtidesdb/src/compat.h b/storage/tidesdb/libtidesdb/src/compat.h new file mode 100644 index 0000000000000..87681e177ee31 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/compat.h @@ -0,0 +1,3579 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __COMPAT_H__ +#define __COMPAT_H__ + +/* compat header for multi-platform support (Windows, POSIX, posix includes macOS) */ +#include +#include +#include +#include +#include + +/* fallback for SIZE_MAX, just in case */ +#ifndef SIZE_MAX +#define SIZE_MAX ((size_t)-1) +#endif + +#include +#include + +#ifndef _WIN32 +#include +#endif + +#ifdef _WIN32 +/* require Windows Vista+ APIs (SetFileInformationByHandle, FILE_ALLOCATION_INFO, + * FILE_END_OF_FILE_INFO) used by tdb_preallocate_extent. defined before any + * windows.h include below so the right structure declarations are visible. */ +#if !defined(_WIN32_WINNT) || _WIN32_WINNT < 0x0600 +#undef _WIN32_WINNT +#define _WIN32_WINNT 0x0600 +#endif +#if !defined(WINVER) || WINVER < 0x0600 +#undef WINVER +#define WINVER 0x0600 +#endif +#define PATH_SEPARATOR "\\" +#else +#define PATH_SEPARATOR "/" +#endif + +/* cross-platform line buffering -- Windows doesn't support _IOLBF properly with NULL buffer */ +#if defined(_MSC_VER) +#define tdb_setlinebuf(stream) setvbuf((stream), NULL, _IONBF, 0) +#else +#define tdb_setlinebuf(stream) setvbuf((stream), NULL, _IOLBF, 0) +#endif + +/* branch prediction hints for hot paths */ +#if defined(__GNUC__) || defined(__clang__) +#define TDB_LIKELY(x) __builtin_expect(!!(x), 1) +#define TDB_UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +#define TDB_LIKELY(x) (x) +#define TDB_UNLIKELY(x) (x) +#endif + +/* cross-platform fabs abstraction */ +#include +#if defined(_MSC_VER) +#define tdb_fabs(x) fabs(x) +#elif defined(__APPLE__) +/* macOS may require explicit declaration in some contexts */ +#define tdb_fabs(x) fabs(x) +#else +/* POSIX systems */ +#define tdb_fabs(x) fabs(x) +#endif + +/* cross-platform fsync abstraction */ +#if defined(_WIN32) +#include +#define tdb_fsync(fd) _commit(fd) +#else +#include +#define tdb_fsync(fd) fsync(fd) +#endif + +/* file lock error codes */ +#define TDB_LOCK_SUCCESS 0 /* lock acquired successfully */ +#define TDB_LOCK_HELD 1 /* lock is held by 
another process (EWOULDBLOCK/EAGAIN) */ +#define TDB_LOCK_ERROR 2 /* irrecoverable error */ + +/* default retry count for EINTR during lock acquisition */ +#define TDB_LOCK_DEFAULT_RETRIES 3 + +/* cross-platform file locking abstraction for database directory lock */ +#if defined(_WIN32) +#include +#include +#include + +/* + * tdb_open_lock_file + * opens a lock file (windows version -- lock acquired separately) + * @param path the path to the lock file + * @param lock_result output -- TDB_LOCK_SUCCESS on successful open (lock not yet acquired) + * @return file descriptor on success (>= 0), -1 on error + */ +static inline int tdb_open_lock_file(const char *path, int *lock_result) +{ + int fd = _open(path, _O_RDWR | _O_CREAT | _O_BINARY, 0644); + if (fd < 0) + { + *lock_result = TDB_LOCK_ERROR; + return -1; + } + *lock_result = TDB_LOCK_SUCCESS; /* caller will call tdb_file_lock_exclusive */ + return fd; +} + +/* + * tdb_file_lock_exclusive + * acquires an exclusive lock on a file (non-blocking) + * @param fd the file descriptor to lock + * @param max_retries maximum retries for transient errors (i.e., signal interrupts) + * @return TDB_LOCK_SUCCESS on success, + * TDB_LOCK_HELD if lock is held by another process, + * TDB_LOCK_ERROR on irrecoverable error + */ +static inline int tdb_file_lock_exclusive(int fd, int max_retries) +{ + (void)max_retries; /* windows with LOCKFILE_FAIL_IMMEDIATELY has no retryable errs */ + + HANDLE h = (HANDLE)_get_osfhandle(fd); + if (h == INVALID_HANDLE_VALUE) return TDB_LOCK_ERROR; + + OVERLAPPED ov = {0}; + if (LockFileEx(h, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, 1, 0, &ov)) + { + return TDB_LOCK_SUCCESS; + } + + /* with LOCKFILE_FAIL_IMMEDIATELY, ERROR_LOCK_VIOLATION means lock is held + **** https://learn.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-lockfileex */ + DWORD err = GetLastError(); + if (err == ERROR_LOCK_VIOLATION) + { + return TDB_LOCK_HELD; + } + return TDB_LOCK_ERROR; +} + +/* + * 
tdb_file_unlock + * releases a lock on a file + * @param fd the file descriptor to unlock + * @return 0 on success, -1 on error + */ +static inline int tdb_file_unlock(int fd) +{ + HANDLE h = (HANDLE)_get_osfhandle(fd); + if (h == INVALID_HANDLE_VALUE) return -1; + + OVERLAPPED ov = {0}; + if (!UnlockFileEx(h, 0, 1, 0, &ov)) + { + return -1; + } + return 0; +} +#else +#include +#include + +/*** linux 3.15+ supports F_OFD_SETLK (Open File Description locks) which are per-fd + * and have sane semantics. we should utilize these when available, and otherwise fall back to + * fcntl() F_SETLK. https://lwn.net/Articles/640404/ and https://apenwarr.ca/log/20101213 + * + * macOS/BSD -- We use fcntl() F_SETLK which has per-process semantics. Critically, fcntl() locks + * are not inherited across fork(), so child processes will properly fail to acquire the lock. + * flock() was considered but locks persist across fork(), causing the child to inherit the lock + * and then block when trying to acquire a new lock on a different fd. 
+ * https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/flock.2.html + */ +#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__DragonFly__) +#define TDB_USE_FLOCK 0 +#define TDB_USE_FCNTL_SETLK 1 +#include +#elif !defined(F_OFD_SETLK) +#define TDB_USE_FLOCK 1 +#define TDB_USE_FCNTL_SETLK 0 +#include +#else +#define TDB_USE_FLOCK 0 +#define TDB_USE_FCNTL_SETLK 0 +#endif + +/* + * tdb_open_lock_file + * opens a lock file for locking (lock acquired separately via tdb_file_lock_exclusive) + * @param path the path to the lock file + * @param lock_result output -- TDB_LOCK_SUCCESS, TDB_LOCK_HELD, or TDB_LOCK_ERROR + * @return file descriptor on success (>= 0), -1 on error + */ +static inline int tdb_open_lock_file(const char *path, int *lock_result) +{ + /* open the lock file */ + int fd = open(path, O_RDWR | O_CREAT | O_CLOEXEC, 0644); + if (fd < 0) + { + *lock_result = TDB_LOCK_ERROR; + return -1; + } + +#if TDB_USE_FCNTL_SETLK + /* fcntl() F_SETLK allows same-process re-locking, so check PID file first. + * read PID before acquiring lock to detect same-process double-open. 
*/ + char pid_buf[32] = {0}; + ssize_t n = pread(fd, pid_buf, sizeof(pid_buf) - 1, 0); + if (n > 0) + { + pid_t file_pid = (pid_t)atol(pid_buf); + if (file_pid == getpid()) + { + /* same process already holds lock */ + close(fd); + *lock_result = TDB_LOCK_HELD; + return -1; + } + } +#endif + + *lock_result = TDB_LOCK_SUCCESS; + return fd; +} + +#if TDB_USE_FCNTL_SETLK +/* + * tdb_file_lock_write_pid + * writes the current PID to the lock file after acquiring the lock + * @param fd the file descriptor of the lock file + */ +static inline void tdb_file_lock_write_pid(const int fd) +{ + char our_pid[32]; + int len = snprintf(our_pid, sizeof(our_pid), "%d\n", (int)getpid()); + if (ftruncate(fd, 0) == 0) + { + (void)pwrite(fd, our_pid, len, 0); + } +} + +/* + * tdb_file_lock_clear_pid + * clears the PID from the lock file before releasing the lock + * @param fd the file descriptor of the lock file + */ +static inline void tdb_file_lock_clear_pid(const int fd) +{ + (void)ftruncate(fd, 0); +} +#endif + +/* + * tdb_file_lock_exclusive + * acquires an exclusive lock on a file (non-blocking) + * uses fcntl() F_SETLK on macOS/BSD (locks not inherited across fork) + * uses flock() on older systems without F_OFD_SETLK + * uses F_OFD_SETLK on linux 3.15+ for per-fd locking + * @param fd the file descriptor to lock + * @param max_retries maximum retries for EINTR (signal interrupts) + * @return TDB_LOCK_SUCCESS on success, + * TDB_LOCK_HELD if lock is held by another process, + * TDB_LOCK_ERROR on irrecoverable error + */ +static inline int tdb_file_lock_exclusive(const int fd, int max_retries) +{ + int retries = 0; + if (max_retries <= 0) max_retries = TDB_LOCK_DEFAULT_RETRIES; + +#if TDB_USE_FCNTL_SETLK + struct flock fl; + memset(&fl, 0, sizeof(fl)); + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 0; + fl.l_pid = 0; + + while (retries <= max_retries) + { + if (fcntl(fd, F_SETLK, &fl) == 0) + { + /* we write PID to lock file for same-process 
detection */ + tdb_file_lock_write_pid(fd); + return TDB_LOCK_SUCCESS; + } + + int err = errno; + +#if EWOULDBLOCK == EAGAIN + if (err == EWOULDBLOCK || err == EACCES) +#else + if (err == EWOULDBLOCK || err == EAGAIN || err == EACCES) +#endif + { + return TDB_LOCK_HELD; + } + if (err == EINTR) + { + retries++; + continue; + } + return TDB_LOCK_ERROR; + } + return TDB_LOCK_ERROR; +#elif TDB_USE_FLOCK + while (retries <= max_retries) + { + if (flock(fd, LOCK_EX | LOCK_NB) == 0) + { + return TDB_LOCK_SUCCESS; + } + + int err = errno; + +#if EWOULDBLOCK == EAGAIN + if (err == EWOULDBLOCK || err == EACCES) +#else + if (err == EWOULDBLOCK || err == EAGAIN || err == EACCES) +#endif + { + return TDB_LOCK_HELD; + } + if (err == EINTR) + { + retries++; + continue; + } + return TDB_LOCK_ERROR; + } + return TDB_LOCK_ERROR; +#else + struct flock fl; + memset(&fl, 0, sizeof(fl)); + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 0; + fl.l_pid = 0; /* ignored for OFD locks */ + + while (retries <= max_retries) + { + if (fcntl(fd, F_OFD_SETLK, &fl) == 0) + { + return TDB_LOCK_SUCCESS; + } + + int err = errno; + +#if EWOULDBLOCK == EAGAIN + if (err == EWOULDBLOCK || err == EACCES) +#else + if (err == EWOULDBLOCK || err == EAGAIN || err == EACCES) +#endif + { + return TDB_LOCK_HELD; + } + if (err == EINTR) + { + retries++; + continue; + } + return TDB_LOCK_ERROR; + } + return TDB_LOCK_ERROR; +#endif +} + +/* + * tdb_file_unlock + * releases a lock on a file + * @param fd the file descriptor to unlock + * @return 0 on success, -1 on error + */ +static inline int tdb_file_unlock(const int fd) +{ +#if TDB_USE_FCNTL_SETLK + tdb_file_lock_clear_pid(fd); + + struct flock fl; + memset(&fl, 0, sizeof(fl)); + fl.l_type = F_UNLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 0; + fl.l_pid = 0; + + if (fcntl(fd, F_SETLK, &fl) != 0) + { + return -1; + } + return 0; +#elif TDB_USE_FLOCK + if (flock(fd, LOCK_UN) != 0) + { + return -1; + } + return 0; 
+#else + /* linux with F_OFD_SETLK */ + struct flock fl; + memset(&fl, 0, sizeof(fl)); + fl.l_type = F_UNLCK; + fl.l_whence = SEEK_SET; + fl.l_start = 0; + fl.l_len = 0; + fl.l_pid = 0; + + if (fcntl(fd, F_OFD_SETLK, &fl) != 0) + { + return -1; + } + return 0; +#endif +} +#endif + +/* cross-platform localtime abstraction */ +#if defined(_WIN32) +/* (MSVC and MinGW) use localtime_s with reversed parameter order */ +#define tdb_localtime(timer, result) localtime_s((result), (timer)) +#else +/* POSIX uses localtime_r */ +#define tdb_localtime(timer, result) localtime_r((timer), (result)) +#endif + +/* https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/stat-functions?view=msvc-170 + * https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/fstat-fstat32-fstat64-fstati64-fstat32i64-fstat64i32?view=msvc-170 + * to handle the compiler differences + */ +#if defined(_WIN32) +#include +#include + +#if defined(_MSC_VER) +#define STAT_STRUCT _stat64 +#define STAT_FUNC _stat64 +#define FSTAT_FUNC _fstat64 +#else +#define STAT_STRUCT stat +#define STAT_FUNC stat +#define FSTAT_FUNC fstat +#endif + +#else /* posix */ +#include +#include +#define STAT_STRUCT stat +#define STAT_FUNC stat +#define FSTAT_FUNC fstat +#endif + +#if !defined(_MSC_VER) || _MSC_VER >= 1930 +#include +typedef atomic_size_t atomic_size_t; +typedef atomic_uint_fast64_t atomic_uint64_t; +#endif + +#if defined(__MINGW32__) || defined(__MINGW64__) +#define TDB_SIZE_FMT "%llu" +#define TDB_U64_FMT "%llu" +#define TDB_SIZE_CAST(x) ((unsigned long long)(x)) +#define TDB_U64_CAST(x) ((unsigned long long)(x)) +#else +#define TDB_SIZE_FMT "%zu" +#define TDB_U64_FMT "%" PRIu64 +#define TDB_SIZE_CAST(x) ((size_t)(x)) +#define TDB_U64_CAST(x) ((uint64_t)(x)) +#endif + +/* cross-platform atomic alignment */ +#if defined(_MSC_VER) +#define ATOMIC_ALIGN(n) __declspec(align(n)) +#elif defined(__GNUC__) || defined(__clang__) +#define ATOMIC_ALIGN(n) __attribute__((aligned(n))) +#else +#define 
ATOMIC_ALIGN(n) +#endif + +/* cross-platform unused attribute for static functions */ +#if defined(__GNUC__) || defined(__clang__) +#define UNUSED __attribute__((unused)) +#else +#define UNUSED +#endif + +/* cross-platform thread-local storage */ +#if defined(_MSC_VER) +#define THREAD_LOCAL __declspec(thread) +#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +#define THREAD_LOCAL _Thread_local +#elif defined(__GNUC__) || defined(__clang__) +#define THREAD_LOCAL __thread +#else +#define THREAD_LOCAL /* fallback -- no thread-local support */ +#endif + +/* cross-platform prefetch hints for cache optimization */ +#if defined(__GNUC__) || defined(__clang__) +/* __builtin_prefetch(addr, rw, locality) + * rw -- 0 = read, 1 = write + * locality-- 0 = no temporal locality, 3 = high temporal locality */ +#define PREFETCH_READ(addr) __builtin_prefetch((addr), 0, 3) +#define PREFETCH_WRITE(addr) __builtin_prefetch((addr), 1, 3) +#elif defined(_MSC_VER) +#include +#define PREFETCH_READ(addr) _mm_prefetch((const char *)(addr), _MM_HINT_T0) +#define PREFETCH_WRITE(addr) _mm_prefetch((const char *)(addr), _MM_HINT_T0) +#else +/* no prefetch support -- define as no-op */ +#define PREFETCH_READ(addr) ((void)0) +#define PREFETCH_WRITE(addr) ((void)0) +#endif + +/* cross-platform count trailing zeros for 64-bit integers */ +#if defined(__GNUC__) || defined(__clang__) +#define TDB_CTZ64(x) __builtin_ctzll(x) +#elif defined(_MSC_VER) +/* + * tdb_ctz64_msvc + * counts trailing zeros in a 64-bit integer (MSVC version) + * @param x the value to count trailing zeros in + * @return number of trailing zero bits (0-63), or 64 if x is 0 + */ +static inline int tdb_ctz64_msvc(uint64_t x) +{ + unsigned long index; +#if defined(_WIN64) + if (_BitScanForward64(&index, x)) + { + return (int)index; + } +#else + /* 32-bit MSVC-- check low and high 32-bit halves */ + if (_BitScanForward(&index, (unsigned long)x)) + { + return (int)index; + } + if (_BitScanForward(&index, (unsigned long)(x 
>> 32))) + { + return (int)(index + 32); + } +#endif + return 64; /* all zeros */ +} +#define TDB_CTZ64(x) tdb_ctz64_msvc(x) +#else +/* portable fallback using de Bruijn sequence */ +/* + * tdb_ctz64_portable + * counts trailing zeros in a 64-bit integer (portable version) + * @param x the value to count trailing zeros in + * @return number of trailing zero bits (0-63), or 64 if x is 0 + */ +static inline int tdb_ctz64_portable(uint64_t x) +{ + if (x == 0) return 64; + static const int debruijn_table[64] = { + 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, 62, 5, 39, 46, 44, 42, + 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, + 23, 58, 17, 10, 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12}; + return debruijn_table[((x & -x) * 0x022FDD63CC95386DULL) >> 58]; +} +#define TDB_CTZ64(x) tdb_ctz64_portable(x) +#endif + +/* cross-platform thread ID for unique file naming */ +#if defined(_WIN32) +#include +#define TDB_THREAD_ID() ((unsigned long)GetCurrentThreadId()) +#else +#include +#define TDB_THREAD_ID() ((unsigned long)pthread_self()) +#endif + +/* cross-platform process ID */ +#if defined(_WIN32) +#include +#define TDB_GETPID() _getpid() +#else +#include +#include +#define TDB_GETPID() getpid() +#endif + +/** + * tdb_spawn_wait + * spawn a child process running cmd with the given argument vector and block + * until it exits. argv is NULL terminated and argv[0] is the program name. + * cmd is resolved like execvp, a PATH search when it contains no separator, + * and _spawnvp applies the same resolution on Windows. + * @param cmd executable to run + * @param argv NULL-terminated argument vector, argv[0] is the program name + * @return the child exit code on a normal exit, -1 on spawn failure or an + * abnormal exit + */ +static inline int tdb_spawn_wait(const char *cmd, char *const argv[]) +{ +#ifdef _WIN32 + intptr_t rc = _spawnvp(_P_WAIT, cmd, (const char *const *)argv); + return (rc < 0) ? 
-1 : (int)rc; +#else + pid_t pid = fork(); + if (pid < 0) return -1; + if (pid == 0) + { + execvp(cmd, argv); + _exit(127); /* execvp only returns on failure */ + } + int status = 0; + if (waitpid(pid, &status, 0) < 0) return -1; + if (WIFEXITED(status)) return WEXITSTATUS(status); + return -1; +#endif +} + +#ifdef _WIN32 +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable : 4996) /* disable deprecated warning for windows */ +#pragma warning(disable : 4029) /* declared formal parameter list different from definition */ +#pragma warning(disable : 4211) /* nonstandard extension used-- redefined extern to static */ +#endif + +#if defined(__MINGW32__) || defined(__MINGW64__) +/* mingw provides POSIX-like headers */ +#include +#include +#include +#include + +/* mingw mkdir only takes one argument, create a wrapper for POSIX compatibility */ +#define mkdir(path, mode) _mkdir(path) +#else +/* msvc needs pthreads-win32 library */ +#include "pthread.h" +#endif + +#if defined(_MSC_VER) +#ifndef _OFF_T_DEFINED +#define _OFF_T_DEFINED +typedef __int64 off_t; +#endif + +#ifndef _SSIZE_T_DEFINED +#define _SSIZE_T_DEFINED +typedef __int64 ssize_t; +#endif + +#ifndef _MODE_T_DEFINED +#define _MODE_T_DEFINED +typedef int mode_t; +#endif + +/* ftruncate for windows */ +/* + * ftruncate + * @param fd the file descriptor to truncate + * @param length the new length of the file + * @return 0 on success, -1 on failure + */ +static inline int ftruncate(int fd, off_t length) +{ + return _chsize_s(fd, length); +} + +/* open for windows */ +/* + * open + * @param path the path to open + * @param flags the flags to use + * @param mode the mode to use (only used if O_CREAT is set) + * @return the file descriptor on success, -1 on failure + */ +static inline int _tidesdb_open_wrapper_3(const char *path, int flags, mode_t mode) +{ + return _sopen(path, flags | _O_BINARY | _O_SEQUENTIAL, _SH_DENYNO, mode); +} + +/* open for windows */ +/* 
+ * open + * @param path the path to open + * @param flags the flags to use + * @return the file descriptor on success, -1 on failure + */ +static inline int _tidesdb_open_wrapper_2(const char *path, int flags) +{ + return _sopen(path, flags | _O_BINARY, _SH_DENYNO, 0); +} +#define open(...) _tidesdb_open_wrapper_3(__VA_ARGS__) + +/* C11 atomics support */ +#if defined(__MINGW32__) || defined(__GNUC__) +/* mingw and GCC have proper C11 stdatomic.h support */ +#include +#elif _MSC_VER < 1930 +/* MSVC < 2022 doesn't have stdatomic.h -- use Windows Interlocked functions */ +typedef volatile LONG atomic_int; +typedef volatile LONGLONG atomic_size_t; +typedef volatile LONGLONG atomic_uint64_t; +#define _Atomic(T) volatile T + +#ifdef _WIN64 +/* 64-bit atomic store */ +/* + * atomic_store_explicit + * @param ptr the pointer to store the value at + * @param val the value to store + * @param order the memory order (unused) + */ +#define atomic_store_explicit(ptr, val, order) \ + do \ + { \ + if (sizeof(*(ptr)) == sizeof(void *)) \ + { \ + InterlockedExchangePointer((PVOID volatile *)(ptr), (PVOID)(uintptr_t)(val)); \ + } \ + else if (sizeof(*(ptr)) == 8) \ + { \ + InterlockedExchange64((LONGLONG volatile *)(ptr), (LONGLONG)(uintptr_t)(val)); \ + } \ + else if (sizeof(*(ptr)) == 4) \ + { \ + InterlockedExchange((LONG volatile *)(ptr), (LONG)(uintptr_t)(val)); \ + } \ + else \ + { \ + *(ptr) = (val); \ + } \ + } while (0) +#else +/* 32-bit atomic store */ +/* + * atomic_store_explicit + * @param ptr the pointer to store the value at + * @param val the value to store + * @param order the memory order (unused) + */ +#define atomic_store_explicit(ptr, val, order) \ + do \ + { \ + if (sizeof(*(ptr)) == sizeof(void *)) \ + { \ + InterlockedExchangePointer((PVOID volatile *)(ptr), (PVOID)(uintptr_t)(val)); \ + } \ + else if (sizeof(*(ptr)) == 8) \ + { \ + /* 64-bit value on a 32-bit target cast straight to LONGLONG, NOT via \ + * uintptr_t (4 bytes here) which would truncate the 
input */ \ + InterlockedExchange64((LONGLONG volatile *)(ptr), (LONGLONG)(val)); \ + } \ + else if (sizeof(*(ptr)) == 4) \ + { \ + InterlockedExchange((LONG volatile *)(ptr), (LONG)(uintptr_t)(val)); \ + } \ + else \ + { \ + *(ptr) = (val); \ + } \ + } while (0) +#endif + +/* atomic load */ +/* + * _atomic_load_ptr + * @param ptr the pointer to load the value from + * @return the value loaded from the pointer + */ +static inline void *_atomic_load_ptr(volatile void *const *ptr) +{ + return (void *)InterlockedCompareExchangePointer((PVOID volatile *)ptr, NULL, NULL); +} + +/* atomic load -- available on both _WIN64 and 32-bit (InterlockedCompareExchange64 + * is provided on 32-bit Windows too, so 64-bit atomics work on a 32-bit target) */ +/* + * _atomic_load_i64 + * @param ptr the pointer to load the value from + * @return the value loaded from the pointer + */ +static inline LONGLONG _atomic_load_i64(volatile LONGLONG *ptr) +{ + return InterlockedCompareExchange64((LONGLONG volatile *)ptr, 0, 0); +} + +/* atomic load */ +/* + * _atomic_load_i32 + * @param ptr the pointer to load the value from + * @return the value loaded from the pointer + */ +static inline LONG _atomic_load_i32(volatile LONG *ptr) +{ + return InterlockedCompareExchange((LONG volatile *)ptr, 0, 0); +} + +/* atomic load */ +/* + * _atomic_load_u8 + * @param ptr the pointer to load the value from + * @return the value loaded from the pointer + */ +static inline unsigned char _atomic_load_u8(volatile unsigned char *ptr) +{ + return *ptr; /* byte reads are atomic on x86/x64 */ +} + +#ifdef _WIN64 +/* atomic load */ +/* + * atomic_load_explicit + * @param ptr the pointer to load the value from + * @param order the memory order (unused) + * @return the value loaded from the pointer + */ +#define atomic_load_explicit(ptr, order) \ + (sizeof(*(ptr)) == sizeof(void *) ? _atomic_load_ptr((volatile void *const *)(ptr)) \ + : sizeof(*(ptr)) == 8 ? 
(void *)(uintptr_t)_atomic_load_i64((volatile LONGLONG *)(ptr)) \ + : sizeof(*(ptr)) == 4 ? (void *)(uintptr_t)_atomic_load_i32((volatile LONG *)(ptr)) \ + : (void *)(uintptr_t)_atomic_load_u8((volatile unsigned char *)(ptr))) +#else +/* atomic load */ +/* + * atomic_load_explicit + * @param ptr the pointer to load the value from + * @param order the memory order (unused) + * @return the value loaded from the pointer + */ +/* NOTE (32-bit MSVC < 2022) this path returns unsigned long long, not void*, so a + * 64-bit atomic (sizeof==8, e.g. atomic_uint64_t / atomic_size_t) is loaded at full + * width -- routing it through (void*)(uintptr_t) as the _WIN64 path does would truncate + * to 32 bits here. Pointer and 32-bit values widen losslessly. A caller assigning the + * result to a pointer gets an integer->pointer conversion (cast as needed). + * This whole 32-bit MSVC<2022 atomics path MUST be compiled and tested on the target. */ +#define atomic_load_explicit(ptr, order) \ + (sizeof(*(ptr)) == sizeof(void *) \ + ? (unsigned long long)(uintptr_t)_atomic_load_ptr((volatile void *const *)(ptr)) \ + : sizeof(*(ptr)) == 8 ? (unsigned long long)_atomic_load_i64((volatile LONGLONG *)(ptr)) \ + : sizeof(*(ptr)) == 4 \ + ? (unsigned long long)(uintptr_t)_atomic_load_i32((volatile LONG *)(ptr)) \ + : (unsigned long long)_atomic_load_u8((volatile unsigned char *)(ptr))) +#endif + +/* atomic exchange */ +#ifdef _WIN64 +/* atomic exchange */ +/* + * atomic_exchange_explicit + * @param ptr the pointer to exchange the value at + * @param val the value to exchange + * @param order the memory order (unused) + * @return the value exchanged from the pointer + */ +#define atomic_exchange_explicit(ptr, val, order) \ + (sizeof(*(ptr)) == sizeof(void *) \ + ? InterlockedExchangePointer((PVOID volatile *)(ptr), (PVOID)(uintptr_t)(val)) \ + : sizeof(*(ptr)) == 8 \ + ? 
(void *)(uintptr_t)InterlockedExchange64((LONGLONG volatile *)(ptr), \ + (LONGLONG)(uintptr_t)(val)) \ + : (void *)(uintptr_t)InterlockedExchange((LONG volatile *)(ptr), (LONG)(uintptr_t)(val))) +#else +/* atomic exchange */ +/* + * atomic_exchange_explicit + * @param ptr the pointer to exchange the value at + * @param val the value to exchange + * @param order the memory order (unused) + * @return the value exchanged from the pointer + */ +/* NOTE (32-bit MSVC < 2022) returns unsigned long long for the same reason as + * atomic_load_explicit above -- the 8-byte arm must not truncate. Verify on target. */ +#define atomic_exchange_explicit(ptr, val, order) \ + (sizeof(*(ptr)) == sizeof(void *) ? (unsigned long long)(uintptr_t)InterlockedExchangePointer( \ + (PVOID volatile *)(ptr), (PVOID)(uintptr_t)(val)) \ + : sizeof(*(ptr)) == 8 \ + ? (unsigned long long)InterlockedExchange64((LONGLONG volatile *)(ptr), (LONGLONG)(val)) \ + : (unsigned long long)(uintptr_t)InterlockedExchange((LONG volatile *)(ptr), \ + (LONG)(uintptr_t)(val))) +#endif + +#ifdef _WIN64 +/* atomic fetch add */ +/* + * atomic_fetch_add + * @param ptr the pointer to add the value to + * @param val the value to add + * @return the value before the addition + */ +#define atomic_fetch_add(ptr, val) \ + InterlockedExchangeAdd64((LONGLONG volatile *)(ptr), (LONGLONG)(val)) +#else +/* atomic fetch add */ +/* + * atomic_fetch_add + * @param ptr the pointer to add the value to + * @param val the value to add + * @return the value before the addition + */ +/* 32-bit dispatch on width so an 8-byte counter (atomic_uint64_t / atomic_size_t) + * uses the 64-bit intrinsic instead of truncating to LONG. Returns unsigned long long. */ +#define atomic_fetch_add(ptr, val) \ + (sizeof(*(ptr)) == 8 ? 
(unsigned long long)InterlockedExchangeAdd64( \ + (LONGLONG volatile *)(ptr), (LONGLONG)(val)) \ + : (unsigned long long)(unsigned long)InterlockedExchangeAdd( \ + (LONG volatile *)(ptr), (LONG)(val))) +#endif + +/* atomic store */ +/* + * atomic_store + * @param ptr the pointer to store the value at + * @param val the value to store + */ +#define atomic_store(ptr, val) atomic_store_explicit(ptr, val, memory_order_seq_cst) +/* atomic load */ +/* + * atomic_load + * @param ptr the pointer to load the value from + * @return the value loaded from the pointer + */ +#define atomic_load(ptr) atomic_load_explicit(ptr, memory_order_seq_cst) +#define memory_order_relaxed 0 +#define memory_order_acquire 1 +#define memory_order_release 2 +#define memory_order_seq_cst 3 + +/* atomic compare exchange for pointers (MSVC compatibility) */ +/* + * atomic_compare_exchange_strong_ptr + * @param ptr pointer to atomic pointer + * @param expected pointer to expected value + * @param desired new value to store + * @return 1 if successful, 0 if failed + */ +static inline int atomic_compare_exchange_strong_ptr(void *volatile *ptr, void **expected, + void *desired) +{ + void *old = + InterlockedCompareExchangePointer((PVOID volatile *)ptr, (PVOID)desired, (PVOID)*expected); + if (old == *expected) + { + return 1; + } + *expected = old; + return 0; +} + +#endif /* _MSC_VER < 1930 */ + +/* access flags are normally defined in unistd.h, which unavailable under MSVC + * + * instead, define the flags as documented at + * https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/access-waccess */ +#ifndef F_OK +#define F_OK 00 +#endif +#ifndef W_OK +#define W_OK 02 +#endif +#ifndef R_OK +#define R_OK 04 +#endif +#endif + +#ifndef O_RDWR +#define O_RDWR _O_RDWR +#endif +#ifndef O_CREAT +#define O_CREAT _O_CREAT +#endif +#ifndef O_RDONLY +#define O_RDONLY _O_RDONLY +#endif +#ifndef O_WRONLY +#define O_WRONLY _O_WRONLY +#endif +#ifndef O_BINARY +#define O_BINARY _O_BINARY +#endif +#ifndef 
O_SEQUENTIAL +#define O_SEQUENTIAL _O_SEQUENTIAL +#endif + +#ifndef M_LN2 +#define M_LN2 0.69314718055994530942 /* log_e 2 */ +#endif + +#if defined(_MSC_VER) +#define CLOCK_REALTIME 0 +#define CLOCK_MONOTONIC 1 + +struct timezone +{ + int tz_minuteswest; + int tz_dsttime; +}; + +struct dirent +{ + char d_name[MAX_PATH]; +}; + +typedef struct +{ + HANDLE hFind; + WIN32_FIND_DATA findFileData; + struct dirent dirent; +} DIR; + +/* mkdir */ +/* + * mkdir + * @param path the path to create the directory at + * @param mode the mode to create the directory with (unused on windows) + * @return 0 on success, -1 on failure + */ +static inline int mkdir(const char *path, mode_t mode) +{ + (void)mode; /* unused on windows */ + return _mkdir(path); +} + +/* opendir */ +/* + * opendir + * @param name the name of the directory to open + * @return a pointer to the directory stream, or NULL on failure + */ +static inline DIR *opendir(const char *name) +{ + DIR *dir = (DIR *)malloc(sizeof(DIR)); + if (dir == NULL) + { + errno = ENOMEM; + return NULL; + } + char search_path[MAX_PATH]; + snprintf(search_path, MAX_PATH, "%s\\*", name); + dir->hFind = FindFirstFile(search_path, &dir->findFileData); + if (dir->hFind == INVALID_HANDLE_VALUE) + { + free(dir); + return NULL; + } + return dir; +} + +/* readdir */ +/* + * readdir + * @param dir the directory stream to read from + * @return a pointer to the next directory entry, or NULL on failure + */ +static inline struct dirent *readdir(DIR *dir) +{ + if (dir == NULL || dir->hFind == INVALID_HANDLE_VALUE) + { + return NULL; + } + if (dir->findFileData.cFileName[0] == '\0') + { + if (!FindNextFile(dir->hFind, &dir->findFileData)) + { + return NULL; + } + } + strncpy(dir->dirent.d_name, dir->findFileData.cFileName, MAX_PATH); + dir->findFileData.cFileName[0] = '\0'; /* reset */ + return &dir->dirent; +} + +/* closedir */ +/* + * closedir + * @param dir the directory stream to close + * @return 0 on success, -1 on failure + */ +static inline 
int closedir(DIR *dir) +{ + if (dir == NULL) + { + return -1; + } + if (dir->hFind != INVALID_HANDLE_VALUE) + { + FindClose(dir->hFind); + } + free(dir); + return 0; +} + +typedef struct +{ + HANDLE handle; +} sem_t; + +/* sem_init */ +/* + * sem_init + * @param sem the semaphore to initialize + * @param pshared whether the semaphore is shared between processes (unused on windows) + * @param value the initial value of the semaphore + * @return 0 on success, -1 on failure + */ +static inline int sem_init(sem_t *sem, int pshared, unsigned int value) +{ + (void)pshared; + sem->handle = CreateSemaphore(NULL, value, LONG_MAX, NULL); + if (sem->handle == NULL) + { + errno = GetLastError(); + return -1; + } + return 0; +} + +/* sem_destroy */ +/* + * sem_destroy + * @param sem the semaphore to destroy + * @return 0 on success, -1 on failure + */ +static inline int sem_destroy(sem_t *sem) +{ + if (sem->handle != NULL) + { + CloseHandle(sem->handle); + sem->handle = NULL; + } + return 0; +} + +/* sem_wait */ +/* + * sem_wait + * @param sem the semaphore to wait on + * @return 0 on success, -1 on failure + */ +static inline int sem_wait(sem_t *sem) +{ + DWORD result = WaitForSingleObject(sem->handle, INFINITE); + return (result == WAIT_OBJECT_0) ? 0 : -1; +} + +/* sem_post */ +/* + * sem_post + * @param sem the semaphore to post + * @return 0 on success, -1 on failure + */ +static inline int sem_post(sem_t *sem) +{ + return ReleaseSemaphore(sem->handle, 1, NULL) ? 
0 : -1; +} + +/* file operations macros for cross-platform compatibility */ +#ifndef S_ISDIR +#define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR) +#endif +#define sleep(seconds) Sleep((seconds)*1000) +#define usleep(microseconds) Sleep((microseconds) / 1000) /* usleep for Windows */ +#define access _access +#define ftell _ftelli64 +#define fseek _fseeki64 + +/* fopen wrapper for windows */ +/* + * tdb_fopen + * @param filename the filename to open + * @param mode the mode to open the file in + * @return a pointer to the opened file, or NULL on failure + */ +static inline FILE *tdb_fopen(const char *filename, const char *mode) +{ + return _fsopen(filename, mode, _SH_DENYNO); +} +#define fopen tdb_fopen + +/* fsync for windows */ +/* + * fsync + * @param fd the file descriptor to sync + * @return 0 on success, -1 on failure + */ +static inline int fsync(int fd) +{ + HANDLE h = (HANDLE)_get_osfhandle(fd); + if (h == INVALID_HANDLE_VALUE) + { + errno = EBADF; + return -1; + } + if (!FlushFileBuffers(h)) + { + errno = GetLastError(); + return -1; + } + return 0; +} + +/* fdatasync for MSVC, same as fsync (windows doesn't distinguish) */ +/* + * fdatasync + * @param fd the file descriptor to sync + * @return 0 on success, -1 on failure + */ +static inline int fdatasync(int fd) +{ + return fsync(fd); +} + +/* clock_gettime for MSVC */ +/* + * clock_gettime + * @param clk_id the clock ID (unused) + * @param tp the timespec struct to fill + * @return 0 on success, -1 on failure + */ +static inline int clock_gettime(int clk_id, struct timespec *tp) +{ + (void)clk_id; + FILETIME ft; + ULARGE_INTEGER ui; + + GetSystemTimeAsFileTime(&ft); + ui.LowPart = ft.dwLowDateTime; + ui.HighPart = ft.dwHighDateTime; + + /* convert 100-nanosecond intervals to seconds and nanoseconds */ + tp->tv_sec = (long)((ui.QuadPart - 116444736000000000ULL) / 10000000ULL); + tp->tv_nsec = (long)((ui.QuadPart % 10000000ULL) * 100); + + return 0; +} + +/* gettimeofday for MSVC */ +/* + * gettimeofday + * @param 
tp the timeval struct to fill + * @param tzp the timezone struct (unused) + * @return 0 on success, -1 on failure + */ +static inline int gettimeofday(struct timeval *tp, struct timezone *tzp) +{ + (void)tzp; + FILETIME ft; + ULARGE_INTEGER ui; + + GetSystemTimeAsFileTime(&ft); + ui.LowPart = ft.dwLowDateTime; + ui.HighPart = ft.dwHighDateTime; + + /* convert to microseconds */ + tp->tv_sec = (long)((ui.QuadPart - 116444736000000000ULL) / 10000000ULL); + tp->tv_usec = (long)((ui.QuadPart % 10000000ULL) / 10); + + return 0; +} + +/* pread/pwrite for MSVC using OVERLAPPED + */ +/* + * pread + * reads data from a file descriptor at a specific offset + * @param fd the file descriptor to read from + * @param buf the buffer to read into + * @param count the number of bytes to read + * @param offset the offset to read from + * @return the number of bytes read, or -1 on error + */ +static inline ssize_t pread(int fd, void *buf, size_t count, off_t offset) +{ + if (count == 0) + { + return 0; /* reading 0 bytes is valid, returns 0 */ + } + + if (!buf) + { + errno = EINVAL; + return -1; + } + + HANDLE h = (HANDLE)_get_osfhandle(fd); + if (h == INVALID_HANDLE_VALUE) + { + errno = EBADF; + return -1; + } + + OVERLAPPED overlapped; + ZeroMemory(&overlapped, sizeof(OVERLAPPED)); + + LARGE_INTEGER li; + li.QuadPart = offset; + overlapped.Offset = li.LowPart; + overlapped.OffsetHigh = li.HighPart; + + overlapped.hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); + if (overlapped.hEvent == NULL) + { + errno = GetLastError(); + return -1; + } + + DWORD bytes_read = 0; + BOOL result = ReadFile(h, buf, (DWORD)count, &bytes_read, &overlapped); + + if (!result) + { + DWORD err = GetLastError(); + if (err == ERROR_IO_PENDING) + { + if (!GetOverlappedResult(h, &overlapped, &bytes_read, TRUE)) + { + CloseHandle(overlapped.hEvent); + errno = GetLastError(); + return -1; + } + } + else + { + CloseHandle(overlapped.hEvent); + errno = err; + return -1; + } + } + + CloseHandle(overlapped.hEvent); + 
return (ssize_t)bytes_read; +} + +/* + * pwrite + * writes data to a file descriptor at a specific offset + * @param fd the file descriptor to write to + * @param buf the buffer to write from + * @param count the number of bytes to write + * @param offset the offset to write at + * @return the number of bytes written, or -1 on error + */ +static inline ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset) +{ + if (count == 0) + { + return 0; /* writing 0 bytes is valid, returns 0 */ + } + + if (!buf) + { + errno = EINVAL; + return -1; + } + + HANDLE h = (HANDLE)_get_osfhandle(fd); + if (h == INVALID_HANDLE_VALUE) + { + errno = EBADF; + return -1; + } + + OVERLAPPED overlapped; + ZeroMemory(&overlapped, sizeof(OVERLAPPED)); + + LARGE_INTEGER li; + li.QuadPart = offset; + overlapped.Offset = li.LowPart; + overlapped.OffsetHigh = li.HighPart; + + overlapped.hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); + if (overlapped.hEvent == NULL) + { + errno = GetLastError(); + return -1; + } + + DWORD bytes_written = 0; + BOOL result = WriteFile(h, buf, (DWORD)count, &bytes_written, &overlapped); + + if (!result) + { + DWORD err = GetLastError(); + if (err == ERROR_IO_PENDING) + { + if (!GetOverlappedResult(h, &overlapped, &bytes_written, TRUE)) + { + CloseHandle(overlapped.hEvent); + errno = GetLastError(); + return -1; + } + } + else + { + CloseHandle(overlapped.hEvent); + errno = err; + return -1; + } + } + + CloseHandle(overlapped.hEvent); + return (ssize_t)bytes_written; +} +#endif /* _MSC_VER */ + +/* fileno for all Windows (MSVC and MinGW) */ +/* + * tdb_fileno + * portable file descriptor extraction from FILE* + * @param stream the FILE* to get descriptor from + * @return file descriptor, or -1 on failure + */ +static inline int tdb_fileno(FILE *stream) +{ + if (!stream) return -1; + return _fileno(stream); +} + +#if defined(__MINGW32__) || defined(__MINGW64__) +/* fopen for MinGW (uses standard fopen, not fopen_s) */ +/* + * tdb_fopen + * portable file 
opening wrapper + * @param filename the filename to open + * @param mode the mode to open the file in + * @return a pointer to the opened file, or NULL on failure + */ +static inline FILE *tdb_fopen(const char *filename, const char *mode) +{ + return fopen(filename, mode); +} +#endif + +#if defined(__MINGW32__) || defined(__MINGW64__) +/* mingw provides semaphore.h for POSIX semaphores */ +#include + +/* mingw doesn't provide pread/pwrite/fdatasync, so we implement them */ +/* + * pread + * reads data from a file descriptor at a specific offset + * @param fd the file descriptor to read from + * @param buf the buffer to read into + * @param count the number of bytes to read + * @param offset the offset to read from + * @return the number of bytes read, or -1 on error + */ +static inline ssize_t pread(int fd, void *buf, size_t count, off_t offset) +{ + if (count == 0) + { + return 0; /* reading 0 bytes is valid, returns 0 */ + } + + if (!buf) + { + errno = EINVAL; + return -1; + } + + HANDLE h = (HANDLE)_get_osfhandle(fd); + if (h == INVALID_HANDLE_VALUE) + { + errno = EBADF; + return -1; + } + + OVERLAPPED overlapped = {0}; + LARGE_INTEGER li; + li.QuadPart = offset; + overlapped.Offset = li.LowPart; + overlapped.OffsetHigh = li.HighPart; + + overlapped.hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); + if (overlapped.hEvent == NULL) + { + errno = GetLastError(); + return -1; + } + + DWORD bytes_read = 0; + BOOL result = ReadFile(h, buf, (DWORD)count, &bytes_read, &overlapped); + + if (!result) + { + DWORD err = GetLastError(); + if (err == ERROR_IO_PENDING) + { + if (!GetOverlappedResult(h, &overlapped, &bytes_read, TRUE)) + { + CloseHandle(overlapped.hEvent); + errno = GetLastError(); + return -1; + } + } + else + { + CloseHandle(overlapped.hEvent); + errno = err; + return -1; + } + } + + CloseHandle(overlapped.hEvent); + return (ssize_t)bytes_read; +} + +/* + * pwrite + * writes data to a file descriptor at a specific offset + * @param fd the file descriptor to 
write to + * @param buf the buffer to write from + * @param count the number of bytes to write + * @param offset the offset to write at + * @return the number of bytes written, or -1 on error + */ +static inline ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset) +{ + if (count == 0) + { + return 0; /* writing 0 bytes is valid, returns 0 */ + } + + if (!buf) + { + errno = EINVAL; + return -1; + } + + HANDLE h = (HANDLE)_get_osfhandle(fd); + if (h == INVALID_HANDLE_VALUE) + { + errno = EBADF; + return -1; + } + + OVERLAPPED overlapped = {0}; + LARGE_INTEGER li; + li.QuadPart = offset; + overlapped.Offset = li.LowPart; + overlapped.OffsetHigh = li.HighPart; + + overlapped.hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); + if (overlapped.hEvent == NULL) + { + errno = GetLastError(); + return -1; + } + + DWORD bytes_written = 0; + BOOL result = WriteFile(h, buf, (DWORD)count, &bytes_written, &overlapped); + + if (!result) + { + DWORD err = GetLastError(); + if (err == ERROR_IO_PENDING) + { + if (!GetOverlappedResult(h, &overlapped, &bytes_written, TRUE)) + { + CloseHandle(overlapped.hEvent); + errno = GetLastError(); + return -1; + } + } + else + { + CloseHandle(overlapped.hEvent); + errno = err; + return -1; + } + } + + CloseHandle(overlapped.hEvent); + return (ssize_t)bytes_written; +} + +/* + * fsync + * synchronizes file data to disk + * @param fd the file descriptor to synchronize + * @return 0 if successful, -1 otherwise + */ +static inline int fsync(int fd) +{ + HANDLE h = (HANDLE)_get_osfhandle(fd); + if (h == INVALID_HANDLE_VALUE) + { + return -1; + } + return FlushFileBuffers(h) ? 
0 : -1; +} + +/* + * fdatasync + * synchronizes file data to disk + * @param fd the file descriptor to synchronize + * @return 0 if successful, -1 otherwise + */ +static inline int fdatasync(int fd) +{ + return fsync(fd); +} +#endif /* __MINGW32__ || __MINGW64__ */ + +#elif defined(__APPLE__) +#include +#include +#include +#include +#include +#include +#include +#include + +/* Grand Central Dispatch (dispatch/dispatch.h) is only available on macOS 10.6+ + * For older macOS versions (e.g., 10.5 PPC64), use POSIX semaphores instead */ +#include +#if MAC_OS_X_VERSION_MIN_REQUIRED >= 1060 +#define TDB_USE_DISPATCH_SEMAPHORE 1 +#include +#else +#define TDB_USE_DISPATCH_SEMAPHORE 0 +#include +#endif + +/* pread and pwrite are available natively on macOS via unistd.h */ +/* no additional implementation needed using system pread/pwrite */ + +/** + * tdb_fopen + * portable file opening wrapper + * @param filename the filename to open + * @param mode the mode to open the file in + * @return a pointer to the opened file, or NULL on failure + */ +static inline FILE *tdb_fopen(const char *filename, const char *mode) +{ + return fopen(filename, mode); +} + +/** + * tdb_fileno + * portable file descriptor extraction from FILE* + * @param stream the FILE* to get descriptor from + * @return file descriptor, or -1 on failure + */ +static inline int tdb_fileno(FILE *stream) +{ + if (!stream) return -1; + return fileno(stream); +} + +/* + * fdatasync + * synchronizes file data to disk + * @param fd the file descriptor to synchronize + * @return 0 if successful, -1 otherwise + */ +static inline int fdatasync(int fd) +{ +#ifdef F_FULLFSYNC + /* macOS requires F_FULLFSYNC to actually flush to disk */ + if (fcntl(fd, F_FULLFSYNC) == -1) + { + /* fall back to fsync if F_FULLFSYNC fails */ + return fsync(fd); + } + return 0; +#else + /* fall back to fsync if F_FULLFSYNC not available */ + return fsync(fd); +#endif +} + +#if TDB_USE_DISPATCH_SEMAPHORE +/* semaphore compatibility for macOS 
10.6+ using Grand Central Dispatch + * macOS deprecated POSIX semaphores (sem_init, sem_destroy, etc.) + * use dispatch_semaphore instead */ +typedef dispatch_semaphore_t sem_t; + +/* + * sem_init + * initializes a semaphore + * @param sem the semaphore to initialize + * @param pshared whether the semaphore is shared between processes + * @param value the initial value of the semaphore + * @return 0 if successful, -1 otherwise + */ +static inline int sem_init(sem_t *sem, int pshared, unsigned int value) +{ + (void)pshared; /* unused on macOS */ + *sem = dispatch_semaphore_create(value); + return (*sem == NULL) ? -1 : 0; +} + +/* + * sem_destroy + * destroys a semaphore + * @param sem the semaphore to destroy + * @return 0 if successful, -1 otherwise + */ +static inline int sem_destroy(sem_t *sem) +{ + if (*sem) + { + dispatch_release(*sem); + *sem = NULL; + } + return 0; +} + +/* + * sem_wait + * waits on a semaphore + * @param sem the semaphore to wait on + * @return 0 if successful, -1 otherwise + */ +static inline int sem_wait(sem_t *sem) +{ + return (dispatch_semaphore_wait(*sem, DISPATCH_TIME_FOREVER) == 0) ? 
0 : -1; +} + +/* + * sem_post + * posts a semaphore + * @param sem the semaphore to post + * @return 0 if successful, -1 otherwise + */ +static inline int sem_post(sem_t *sem) +{ + dispatch_semaphore_signal(*sem); + return 0; +} +#else +/* for macOS < 10.6 (e.g., 10.5 PPC64), use POSIX semaphores + * note-- POSIX semaphores are deprecated on modern macOS but work on older versions */ +/* sem_t, sem_init, sem_destroy, sem_wait, sem_post are provided by semaphore.h */ +#endif + +#else /* posix systems */ +#include +#include +#include +#include +#include +#include +#include + +/* + * tdb_fopen + * @param filename the filename to open + * @param mode the mode to open the file in + * @return a pointer to the opened file, or NULL on failure + */ +static inline FILE *tdb_fopen(const char *filename, const char *mode) +{ + return fopen(filename, mode); +} + +/** + * tdb_fileno + * portable file descriptor extraction from FILE* + * @param stream the FILE* to get descriptor from + * @return file descriptor, or -1 on failure + */ +static inline int tdb_fileno(FILE *stream) +{ + if (!stream) return -1; + return fileno(stream); +} + +/* sysinfo is Linux-specific, BSD uses sysctl */ +#if defined(__linux__) +#include +#elif defined(__FreeBSD__) || defined(__DragonFly__) +#include +#include +#elif defined(__OpenBSD__) || defined(__NetBSD__) +#include +#include +#include +#endif + +/* pread, pwrite, and fdatasync are available natively on POSIX systems via unistd.h */ +/* no additional implementation needed using system pread/pwrite/fdatasync */ + +typedef pthread_t thread_t; +typedef pthread_mutex_t mutex_t; +typedef pthread_cond_t cond_t; +typedef pthread_mutex_t crit_section_t; +typedef pthread_rwlock_t rwlock_t; +#endif + +/* cross-platform thread naming + * Linux -- prctl(PR_SET_NAME) -- 16 char limit including null + * macOS -- pthread_setname_np(name) -- only current thread, 1 arg + * FreeBSD/DragonFly -- pthread_setname_np(thread, name) -- 2 args + * NetBSD -- 
pthread_setname_np(thread, fmt, arg) -- 3 args, printf-style + * OpenBSD -- pthread_set_name_np(thread, name) + * Windows MSVC -- SetThreadDescription (Win10 1607+) + * Windows MinGW -- no-op fallback */ +#if defined(__linux__) +#include +#endif +static inline void tdb_set_thread_name(const char *name) +{ + if (!name) return; +#if defined(__linux__) + prctl(PR_SET_NAME, (unsigned long)name, 0, 0, 0); +#elif defined(__APPLE__) + pthread_setname_np(name); +#elif defined(__FreeBSD__) || defined(__DragonFly__) + pthread_setname_np(pthread_self(), name); +#elif defined(__NetBSD__) + pthread_setname_np(pthread_self(), "%s", (void *)name); +#elif defined(__OpenBSD__) + pthread_set_name_np(pthread_self(), name); +#elif defined(_MSC_VER) + /* SetThreadDescription requires wide string */ + wchar_t wname[64]; + size_t i; + for (i = 0; i < 63 && name[i]; i++) wname[i] = (wchar_t)name[i]; + wname[i] = L'\0'; + SetThreadDescription(GetCurrentThread(), wname); +#else + (void)name; /* no-op fallback */ +#endif +} + +/* O_DSYNC/O_SYNC for synchronous writes (must be after all platform includes) + * POSIX -- O_DSYNC syncs data only, O_SYNC syncs data + metadata + * windows -- no direct equivalent at open() time, use fdatasync() per-write + * some BSDs (DragonFlyBSD, older FreeBSD) may not define O_DSYNC */ +#ifndef O_DSYNC +#ifdef _WIN32 +#define O_DSYNC 0 /* no O_DSYNC, will use fdatasync() fallback */ +#elif defined(__APPLE__) +#define O_DSYNC 0x400000 /* macOS -- O_DSYNC = 0x400000 */ +#else +#define O_DSYNC 0 /* fallback for BSDs and others without O_DSYNC */ +#endif +#endif + +/* cross-platform pwritev for scatter-gather I/O + * Linux and modern BSDs have native pwritev in + * macOS added pwritev in 10.16/11.0 (Big Sur) + * older macOS and Windows fall back to sequential pwrite calls */ +#ifdef _WIN32 +struct iovec +{ + void *iov_base; + size_t iov_len; +}; +#define TDB_NEED_PWRITEV_FALLBACK 1 +#else +#include +/* macOS < 11.0 does not have pwritev. 
MAC_OS_X_VERSION_10_16 == 101600 == Big Sur. + * check for the availability macro; if it does not exist, assume the platform is old enough + * to lack pwritev. */ +#if defined(__APPLE__) +#include +#if !defined(MAC_OS_X_VERSION_10_16) || MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_16 +#define TDB_NEED_PWRITEV_FALLBACK 1 +#endif +#endif +#endif + +#ifdef TDB_NEED_PWRITEV_FALLBACK +/* + * pwritev + * scatter-gather write at offset (fallback using sequential pwrite) + * @param fd the file descriptor + * @param iov array of iovec buffers + * @param iovcnt number of iovec entries + * @param offset the file offset to write at + * @return total bytes written, or -1 on error + */ +static inline ssize_t tdb_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + ssize_t total = 0; + for (int i = 0; i < iovcnt; i++) + { + ssize_t n = pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); + if (n != (ssize_t)iov[i].iov_len) return (total > 0) ? total : -1; + total += n; + offset += n; + } + return total; +} +#define pwritev tdb_pwritev +#endif + +/** + * tdb_pwritev_safe + * wrapper around pwritev that blocks SIGALRM/SIGVTALRM/SIGPROF for the duration + * of the syscall. prevents EINTR from leaving a zero-filled hole in the file when + * the atomic offset reservation has already been committed. 
+ * @param fd the file descriptor + * @param iov array of iovec buffers + * @param iovcnt number of iovec entries + * @param offset the file offset to write at + * @return total bytes written, or -1 on error + */ +#if defined(__GNUC__) || defined(__clang__) +__attribute__((unused)) +#endif +static ssize_t +tdb_pwritev_safe(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ +#ifndef _WIN32 + sigset_t block_set, old_set; + sigemptyset(&block_set); + sigaddset(&block_set, SIGALRM); + sigaddset(&block_set, SIGVTALRM); + sigaddset(&block_set, SIGPROF); + pthread_sigmask(SIG_BLOCK, &block_set, &old_set); + const ssize_t written = pwritev(fd, iov, iovcnt, offset); + pthread_sigmask(SIG_SETMASK, &old_set, NULL); + return written; +#else + return pwritev(fd, iov, iovcnt, offset); +#endif +} + +/* atomic compare exchange for pointers (all platforms with C11 atomics) */ +#if !defined(_MSC_VER) || _MSC_VER >= 1930 +/* + * atomic_compare_exchange_strong_ptr + * @param ptr pointer to atomic pointer + * @param expected pointer to expected value + * @param desired new value to store + * @return 1 if successful, 0 if failed + */ +static inline int atomic_compare_exchange_strong_ptr(_Atomic(void *) *ptr, void **expected, + void *desired) +{ + return atomic_compare_exchange_strong(ptr, expected, desired); +} +#endif + +/* + * get_available_memory + * gets available system memory in bytes + * @return available memory in bytes, or 0 on failure + */ +static inline size_t get_available_memory(void) +{ +#ifdef _WIN32 + MEMORYSTATUSEX status; + status.dwLength = sizeof(status); + if (GlobalMemoryStatusEx(&status)) + { + return (size_t)status.ullAvailPhys; + } + return 0; +#elif defined(__APPLE__) + vm_size_t page_size; + mach_port_t mach_port; + mach_msg_type_number_t count; + + mach_port = mach_host_self(); + + /* 32-bit vm statistics on PPC regardless of OS version. 
+ * host_statistics64 is not available on 10.5 and for PPC 32-bit even on 10.6 */ +#if defined(__ppc__) || (MAC_OS_X_VERSION_MIN_REQUIRED < 1060) + /* PPC always uses 32-bit vm statistics */ + vm_statistics_data_t vm_stats; + count = HOST_VM_INFO_COUNT; + if (host_page_size(mach_port, &page_size) == KERN_SUCCESS && + host_statistics(mach_port, HOST_VM_INFO, (host_info_t)&vm_stats, &count) == KERN_SUCCESS) + { + return (size_t)((vm_stats.free_count + vm_stats.inactive_count + vm_stats.purgeable_count) * + page_size); + } +#else + /* try 64-bit first (macOS 10.6+ on x86/x86_64/ARM), fall back to 32-bit */ + vm_statistics64_data_t vm_stats64; + count = sizeof(vm_stats64) / sizeof(natural_t); + if (host_page_size(mach_port, &page_size) == KERN_SUCCESS && + host_statistics64(mach_port, HOST_VM_INFO, (host_info64_t)&vm_stats64, &count) == + KERN_SUCCESS) + { + return (size_t)((vm_stats64.free_count + vm_stats64.inactive_count + + vm_stats64.purgeable_count) * + page_size); + } + else + { + /* fallback to 32-bit for older systems or Rosetta edge cases */ + vm_statistics_data_t vm_stats; + count = HOST_VM_INFO_COUNT; + if (host_page_size(mach_port, &page_size) == KERN_SUCCESS && + host_statistics(mach_port, HOST_VM_INFO, (host_info_t)&vm_stats, &count) == + KERN_SUCCESS) + { + return ( + size_t)((vm_stats.free_count + vm_stats.inactive_count + vm_stats.purgeable_count) * + page_size); + } + } +#endif + return 0; +#elif defined(__linux__) + /* prefer /proc/meminfo MemAvailable -- the kernel's own estimate of memory + * available for new allocations without swapping (includes free + reclaimable + * buffers/cache + reclaimable slab). 
sysinfo.freeram only reports truly free + * pages which is typically very low on a busy system and triggers false + * critical memory pressure */ + { + FILE *f = fopen("/proc/meminfo", "r"); + if (f) + { + char line[256]; + while (fgets(line, sizeof(line), f)) + { + unsigned long long val; + if (sscanf(line, "MemAvailable: %llu kB", &val) == 1) + { + fclose(f); + return (size_t)(val * 1024ULL); + } + } + fclose(f); + } + } + /* fallback to sysinfo.freeram if /proc/meminfo is unavailable */ + { + struct sysinfo si; + if (sysinfo(&si) == 0) + { + return (size_t)si.freeram * (size_t)si.mem_unit; + } + } + return 0; +#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) + /* BSD systems use sysctl.. */ + unsigned long free_pages = 0; + unsigned long page_size = 0; + size_t len = sizeof(free_pages); + +#if defined(__FreeBSD__) || defined(__DragonFly__) + if (sysctlbyname("vm.stats.vm.v_free_count", &free_pages, &len, NULL, 0) == 0) + { + len = sizeof(page_size); + if (sysctlbyname("vm.stats.vm.v_page_size", &page_size, &len, NULL, 0) == 0) + { + return (size_t)(free_pages * page_size); + } + } +#elif defined(__OpenBSD__) || defined(__NetBSD__) + int mib[2]; + struct uvmexp uvmexp; + len = sizeof(uvmexp); + + mib[0] = CTL_VM; + mib[1] = VM_UVMEXP; + if (sysctl(mib, 2, &uvmexp, &len, NULL, 0) == 0) + { + return (size_t)((uint64_t)uvmexp.free * (uint64_t)uvmexp.pagesize); + } +#endif + return 0; +#else + /* illumos/solaris and other POSIX systems + * note -- on 32-bit systems, multiplying pages * page_size can overflow + * so we cast to 64-bit before multiplication */ + long pages = sysconf(_SC_AVPHYS_PAGES); + long page_size = sysconf(_SC_PAGESIZE); + if (pages > 0 && page_size > 0) + { + return (size_t)((uint64_t)pages * (uint64_t)page_size); + } + return 0; +#endif +} + +/* + * get_total_memory + * gets total system memory in bytes + * @return total memory in bytes, or 0 on failure + */ +static inline size_t 
get_total_memory(void) +{ +#ifdef _WIN32 + MEMORYSTATUSEX status; + status.dwLength = sizeof(status); + if (GlobalMemoryStatusEx(&status)) + { + return (size_t)status.ullTotalPhys; + } + return 0; +#elif defined(__APPLE__) + int mib[2]; + int64_t physical_memory; + size_t length; + + mib[0] = CTL_HW; + mib[1] = HW_MEMSIZE; + length = sizeof(int64_t); + if (sysctl(mib, 2, &physical_memory, &length, NULL, 0) == 0) + { + return (size_t)physical_memory; + } + return 0; +#elif defined(__linux__) + struct sysinfo si; + if (sysinfo(&si) == 0) + { + return (size_t)si.totalram * (size_t)si.mem_unit; + } + return 0; +#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) + int mib[2]; + size_t physical_memory; + size_t len; + + mib[0] = CTL_HW; +#if defined(__OpenBSD__) || defined(__NetBSD__) + /* OpenBSD and NetBSD support HW_PHYSMEM64 for 64-bit physical memory */ + mib[1] = HW_PHYSMEM64; + int64_t physmem64; + len = sizeof(physmem64); + if (sysctl(mib, 2, &physmem64, &len, NULL, 0) == 0) + { + return (size_t)physmem64; + } +#else + /* FreeBSD and DragonFlyBSD use HW_PHYSMEM which returns size_t */ + mib[1] = HW_PHYSMEM; + len = sizeof(physical_memory); + if (sysctl(mib, 2, &physical_memory, &len, NULL, 0) == 0) + { + return physical_memory; + } +#endif + return 0; +#else + /* illumos/solaris and other POSIX systems + * note -- on 32-bit systems, multiplying pages * page_size can overflow + * so we cast to 64-bit before multiplication */ + long pages = sysconf(_SC_PHYS_PAGES); + long page_size = sysconf(_SC_PAGESIZE); + if (pages > 0 && page_size > 0) + { + return (size_t)((uint64_t)pages * (uint64_t)page_size); + } + return 0; +#endif +} + +/* + * get_file_mod_time + * gets the modified time of a file + * @param path the path of the file + * @return the modified time of the file, or -1 on failure + */ +static inline time_t get_file_mod_time(const char *path) +{ + struct STAT_STRUCT file_stat; + + if (STAT_FUNC(path, &file_stat) 
!= 0) + { + return -1; + } + + return (time_t)file_stat.st_mtime; +} + +/* cross-platform little-endian serialization functions */ + +/* + * encode_uint16_le_compat + * encodes a uint16_t value in little-endian format + * @param buf buffer to store encoded value + * @param val value to encode + */ +static inline void encode_uint16_le_compat(uint8_t *buf, uint16_t val) +{ + buf[0] = (uint8_t)(val & 0xFF); + buf[1] = (uint8_t)((val >> 8) & 0xFF); +} + +/* + * decode_uint16_le_compat + * decodes a uint16_t value in little-endian format + * @param buf buffer containing encoded value + * @return decoded value + */ +static inline uint16_t decode_uint16_le_compat(const uint8_t *buf) +{ + return ((uint16_t)buf[0]) | ((uint16_t)buf[1] << 8); +} + +/* + * encode_uint32_le_compat + * encodes a uint32_t value in little-endian format + * @param buf buffer to store encoded value + * @param val value to encode + */ +static inline void encode_uint32_le_compat(uint8_t *buf, uint32_t val) +{ + buf[0] = (uint8_t)(val & 0xFF); + buf[1] = (uint8_t)((val >> 8) & 0xFF); + buf[2] = (uint8_t)((val >> 16) & 0xFF); + buf[3] = (uint8_t)((val >> 24) & 0xFF); +} + +/* + * decode_uint32_le_compat + * decodes a uint32_t value in little-endian format + * @param buf buffer containing encoded value + * @return decoded value + */ +static inline uint32_t decode_uint32_le_compat(const uint8_t *buf) +{ + return ((uint32_t)buf[0]) | ((uint32_t)buf[1] << 8) | ((uint32_t)buf[2] << 16) | + ((uint32_t)buf[3] << 24); +} + +/* + * encode_uint64_le_compat + * encodes a uint64_t value in little-endian format + * @param buf buffer to store encoded value + * @param val value to encode + */ +static inline void encode_uint64_le_compat(uint8_t *buf, uint64_t val) +{ + buf[0] = (uint8_t)(val & 0xFF); + buf[1] = (uint8_t)((val >> 8) & 0xFF); + buf[2] = (uint8_t)((val >> 16) & 0xFF); + buf[3] = (uint8_t)((val >> 24) & 0xFF); + buf[4] = (uint8_t)((val >> 32) & 0xFF); + buf[5] = (uint8_t)((val >> 40) & 0xFF); + buf[6] = 
(uint8_t)((val >> 48) & 0xFF); + buf[7] = (uint8_t)((val >> 56) & 0xFF); +} + +/* + * encode_uint32_le + * encodes a uint32_t value in little-endian format + * @param buf buffer to store encoded value + * @param val value to encode + */ +static inline void encode_uint32_le(uint8_t *buf, uint32_t val) +{ + buf[0] = (uint8_t)(val & 0xFF); + buf[1] = (uint8_t)((val >> 8) & 0xFF); + buf[2] = (uint8_t)((val >> 16) & 0xFF); + buf[3] = (uint8_t)((val >> 24) & 0xFF); +} + +/* + * decode_uint32_le + * decodes a uint32_t value in little-endian format + * @param buf buffer containing encoded value + * @return decoded value + */ +static inline uint32_t decode_uint32_le(const uint8_t *buf) +{ + return ((uint32_t)buf[0]) | ((uint32_t)buf[1] << 8) | ((uint32_t)buf[2] << 16) | + ((uint32_t)buf[3] << 24); +} + +/* + * encode_int64_le + * encodes an int64_t value in little-endian format + * @param buf buffer to store encoded value + * @param val value to encode + */ +static inline void encode_int64_le(uint8_t *buf, int64_t val) +{ + const uint64_t uval = (uint64_t)val; + buf[0] = (uint8_t)(uval & 0xFF); + buf[1] = (uint8_t)((uval >> 8) & 0xFF); + buf[2] = (uint8_t)((uval >> 16) & 0xFF); + buf[3] = (uint8_t)((uval >> 24) & 0xFF); + buf[4] = (uint8_t)((uval >> 32) & 0xFF); + buf[5] = (uint8_t)((uval >> 40) & 0xFF); + buf[6] = (uint8_t)((uval >> 48) & 0xFF); + buf[7] = (uint8_t)((uval >> 56) & 0xFF); +} + +/* + * decode_int64_le + * decodes an int64_t value in little-endian format + * @param buf buffer containing encoded value + * @return decoded value + */ +static inline int64_t decode_int64_le(const uint8_t *buf) +{ + const uint64_t uval = ((uint64_t)buf[0]) | ((uint64_t)buf[1] << 8) | ((uint64_t)buf[2] << 16) | + ((uint64_t)buf[3] << 24) | ((uint64_t)buf[4] << 32) | + ((uint64_t)buf[5] << 40) | ((uint64_t)buf[6] << 48) | + ((uint64_t)buf[7] << 56); + return (int64_t)uval; +} + +/* + * encode_uint64_le + * encodes a uint64_t value in little-endian format + * @param buf buffer to 
store encoded value + * @param val value to encode + */ +static inline void encode_uint64_le(uint8_t *buf, uint64_t val) +{ + buf[0] = (uint8_t)(val & 0xFF); + buf[1] = (uint8_t)((val >> 8) & 0xFF); + buf[2] = (uint8_t)((val >> 16) & 0xFF); + buf[3] = (uint8_t)((val >> 24) & 0xFF); + buf[4] = (uint8_t)((val >> 32) & 0xFF); + buf[5] = (uint8_t)((val >> 40) & 0xFF); + buf[6] = (uint8_t)((val >> 48) & 0xFF); + buf[7] = (uint8_t)((val >> 56) & 0xFF); +} + +/* + * decode_uint64_le + * decodes a uint64_t value in little-endian format + * @param buf buffer containing encoded value + * @return decoded value + */ +static inline uint64_t decode_uint64_le(const uint8_t *buf) +{ + return ((uint64_t)buf[0]) | ((uint64_t)buf[1] << 8) | ((uint64_t)buf[2] << 16) | + ((uint64_t)buf[3] << 24) | ((uint64_t)buf[4] << 32) | ((uint64_t)buf[5] << 40) | + ((uint64_t)buf[6] << 48) | ((uint64_t)buf[7] << 56); +} + +/* + * decode_fixed_32 + * decodes a uint32_t value in little-endian format + * @param data buffer containing encoded value + * @return decoded value + */ +static inline uint32_t decode_fixed_32(const char *data) +{ + return ((uint32_t)(uint8_t)data[0]) | ((uint32_t)(uint8_t)data[1] << 8) | + ((uint32_t)(uint8_t)data[2] << 16) | ((uint32_t)(uint8_t)data[3] << 24); +} + +/* + * decode_uint64_le_compat + * decodes a uint64_t value in little-endian format + * @param buf buffer containing encoded value + * @return decoded value + */ +static inline uint64_t decode_uint64_le_compat(const uint8_t *buf) +{ + return ((uint64_t)buf[0]) | ((uint64_t)buf[1] << 8) | ((uint64_t)buf[2] << 16) | + ((uint64_t)buf[3] << 24) | ((uint64_t)buf[4] << 32) | ((uint64_t)buf[5] << 40) | + ((uint64_t)buf[6] << 48) | ((uint64_t)buf[7] << 56); +} + +/** + * encode_int64_le_compat + * encodes a int64_t value in little-endian format + * @param buf output buffer (must be at least 8 bytes) + * @param val value to encode + */ +static inline void encode_int64_le_compat(uint8_t *buf, int64_t val) +{ + uint64_t uval 
= (uint64_t)val; + buf[0] = (uint8_t)(uval); + buf[1] = (uint8_t)(uval >> 8); + buf[2] = (uint8_t)(uval >> 16); + buf[3] = (uint8_t)(uval >> 24); + buf[4] = (uint8_t)(uval >> 32); + buf[5] = (uint8_t)(uval >> 40); + buf[6] = (uint8_t)(uval >> 48); + buf[7] = (uint8_t)(uval >> 56); +} + +/** + * decode_int64_le_compat + * decodes a int64_t value in little-endian format + * @param buf buffer containing encoded value + * @return decoded value + */ +static inline int64_t decode_int64_le_compat(const uint8_t *buf) +{ + uint64_t uval = ((uint64_t)buf[0]) | ((uint64_t)buf[1] << 8) | ((uint64_t)buf[2] << 16) | + ((uint64_t)buf[3] << 24) | ((uint64_t)buf[4] << 32) | ((uint64_t)buf[5] << 40) | + ((uint64_t)buf[6] << 48) | ((uint64_t)buf[7] << 56); + return (int64_t)uval; +} + +/* varint encoding/decoding for compact serialization */ +static inline uint8_t *encode_varint32(uint8_t *ptr, uint32_t value) +{ + while (value >= 0x80) + { + *ptr++ = (uint8_t)(value | 0x80); + value >>= 7; + } + *ptr++ = (uint8_t)value; + return ptr; +} + +static inline uint8_t *encode_varint64(uint8_t *ptr, uint64_t value) +{ + while (value >= 0x80) + { + *ptr++ = (uint8_t)(value | 0x80); + value >>= 7; + } + *ptr++ = (uint8_t)value; + return ptr; +} + +static inline const uint8_t *decode_varint32(const uint8_t *ptr, uint32_t *value) +{ + uint32_t result = 0; + int shift = 0; + while (*ptr & 0x80) + { + /* prevent shift overflow on corrupted data */ + if (shift >= 32) + { + *value = 0; + return ptr; + } + result |= (uint32_t)(*ptr & 0x7F) << shift; + shift += 7; + ptr++; + } + /* final byte check */ + if (shift >= 32) + { + *value = 0; + return ptr; + } + result |= (uint32_t)(*ptr) << shift; + *value = result; + return ptr + 1; +} + +static inline const uint8_t *decode_varint64(const uint8_t *ptr, uint64_t *value) +{ + uint64_t result = 0; + int shift = 0; + while (*ptr & 0x80) + { + /* prevent shift overflow on corrupted data */ + if (shift >= 64) + { + *value = 0; + return ptr; + } + result |= 
(uint64_t)(*ptr & 0x7F) << shift; + shift += 7; + ptr++; + } + /* final byte check */ + if (shift >= 64) + { + *value = 0; + return ptr; + } + result |= (uint64_t)(*ptr) << shift; + *value = result; + return ptr + 1; +} + +/* length-prefixed KV serialization helpers */ + +/* + * serialize_kv_varint + * serialize key-value pair with varint length prefixes + * format-- varint(key_size) + key + varint(value_size) + value + * @param ptr output buffer (must have enough space) + * @param key key data + * @param key_size key size + * @param value value data (can be NULL if value_size is 0) + * @param value_size value size + * @return pointer to end of written data + */ +static inline uint8_t *serialize_kv_varint(uint8_t *ptr, const uint8_t *key, uint32_t key_size, + const uint8_t *value, uint32_t value_size) +{ + /* write key size and key */ + ptr = encode_varint32(ptr, key_size); + memcpy(ptr, key, key_size); + ptr += key_size; + + /* write value size and value */ + ptr = encode_varint32(ptr, value_size); + if (value_size > 0 && value) + { + memcpy(ptr, value, value_size); + ptr += value_size; + } + + return ptr; +} + +/* + * serialize_kv_varint_ex + * serialize key-value pair with flags and varint length prefixes (for sstables) + * format is flags(1) + varint(key_size) + key + varint(value_size) + value + varint(ttl) + * @param ptr output buffer (must have enough space) + * @param flags flags byte (e.g., tombstone marker) + * @param key key data + * @param key_size key size + * @param value value data (can be NULL if value_size is 0) + * @param value_size value size + * @param ttl time-to-live (0 = no expiration) + * @return pointer to end of written data + */ +static inline uint8_t *serialize_kv_varint_ex(uint8_t *ptr, uint8_t flags, const uint8_t *key, + uint32_t key_size, const uint8_t *value, + uint32_t value_size, int64_t ttl) +{ + /* write flags */ + *ptr++ = flags; + + /* write key size and key */ + ptr = encode_varint32(ptr, key_size); + memcpy(ptr, key, 
key_size); + ptr += key_size; + + /* write value size and value */ + ptr = encode_varint32(ptr, value_size); + if (value_size > 0 && value) + { + memcpy(ptr, value, value_size); + ptr += value_size; + } + + /* write ttl */ + ptr = encode_varint64(ptr, (uint64_t)ttl); + + return ptr; +} + +/* + * serialize_kv_varint_full + * serialize key-value pair with all metadata (for WAL) + * format-- flags(1) + varint(key_size) + key + varint(value_size) + value + varint(ttl) + + * varint(seq) + * @param ptr output buffer (must have enough space) + * @param flags flags byte + * @param key key data + * @param key_size key size + * @param value value data (can be NULL if value_size is 0) + * @param value_size value size + * @param ttl time-to-live + * @param seq sequence number + * @return pointer to end of written data + */ +static inline uint8_t *serialize_kv_varint_full(uint8_t *ptr, uint8_t flags, const uint8_t *key, + uint32_t key_size, const uint8_t *value, + uint32_t value_size, int64_t ttl, uint64_t seq) +{ + /* write flags */ + *ptr++ = flags; + + /* write key size and key */ + ptr = encode_varint32(ptr, key_size); + memcpy(ptr, key, key_size); + ptr += key_size; + + /* write value size and value */ + ptr = encode_varint32(ptr, value_size); + if (value_size > 0 && value) + { + memcpy(ptr, value, value_size); + ptr += value_size; + } + + /* write ttl and seq */ + ptr = encode_varint64(ptr, (uint64_t)ttl); + ptr = encode_varint64(ptr, seq); + + return ptr; +} + +/* + * deserialize_kv_varint + * deserialize key-value pair with varint length prefixes + * @param ptr input buffer + * @param end end of input buffer (for bounds checking) + * @param key_size output key size + * @param value_size output value size + * @param key_out output pointer to key data (points into input buffer) + * @param value_out output pointer to value data (points into input buffer) + * @return pointer to next entry, or NULL on error + */ +static inline const uint8_t *deserialize_kv_varint(const 
uint8_t *ptr, const uint8_t *end, + uint32_t *key_size, uint32_t *value_size, + const uint8_t **key_out, + const uint8_t **value_out) +{ + /* read key size */ + if (ptr >= end) return NULL; + ptr = decode_varint32(ptr, key_size); + if (ptr + *key_size > end) return NULL; + + /* read key */ + *key_out = ptr; + ptr += *key_size; + + /* read value size */ + if (ptr >= end) return NULL; + ptr = decode_varint32(ptr, value_size); + if (ptr + *value_size > end) return NULL; + + /* read value */ + *value_out = ptr; + ptr += *value_size; + + return ptr; +} + +/* + * deserialize_kv_varint_ex + * deserialize key-value pair with flags and varint length prefixes (for sstables) + * @param ptr input buffer + * @param end end of input buffer (for bounds checking) + * @param flags output flags byte + * @param key_size output key size + * @param value_size output value size + * @param key_out output pointer to key data (points into input buffer) + * @param value_out output pointer to value data (points into input buffer) + * @param ttl output time-to-live + * @return pointer to next entry, or NULL on error + */ +static inline const uint8_t *deserialize_kv_varint_ex(const uint8_t *ptr, const uint8_t *end, + uint8_t *flags, uint32_t *key_size, + uint32_t *value_size, const uint8_t **key_out, + const uint8_t **value_out, int64_t *ttl) +{ + /* read flags */ + if (ptr >= end) return NULL; + *flags = *ptr++; + + /* read key size */ + if (ptr >= end) return NULL; + ptr = decode_varint32(ptr, key_size); + if (ptr + *key_size > end) return NULL; + + /* read key */ + *key_out = ptr; + ptr += *key_size; + + /* read value size */ + if (ptr >= end) return NULL; + ptr = decode_varint32(ptr, value_size); + if (ptr + *value_size > end) return NULL; + + /* read value */ + *value_out = ptr; + ptr += *value_size; + + /* read ttl */ + if (ptr >= end) return NULL; + uint64_t ttl_u64; + ptr = decode_varint64(ptr, &ttl_u64); + *ttl = (int64_t)ttl_u64; + + return ptr; +} + +/* + * 
deserialize_kv_varint_full + * deserialize key-value pair with all metadata (for WAL) + * @param ptr input buffer + * @param end end of input buffer (for bounds checking) + * @param flags output flags byte + * @param key_size output key size + * @param value_size output value size + * @param key_out output pointer to key data (points into input buffer) + * @param value_out output pointer to value data (points into input buffer) + * @param ttl output time-to-live + * @param seq output sequence number + * @return pointer to next entry, or NULL on error + */ +static inline const uint8_t *deserialize_kv_varint_full(const uint8_t *ptr, const uint8_t *end, + uint8_t *flags, uint32_t *key_size, + uint32_t *value_size, + const uint8_t **key_out, + const uint8_t **value_out, int64_t *ttl, + uint64_t *seq) +{ + /* read flags */ + if (ptr >= end) return NULL; + *flags = *ptr++; + + /* read key size */ + if (ptr >= end) return NULL; + ptr = decode_varint32(ptr, key_size); + if (ptr + *key_size > end) return NULL; + + /* read key */ + *key_out = ptr; + ptr += *key_size; + + /* read value size */ + if (ptr >= end) return NULL; + ptr = decode_varint32(ptr, value_size); + if (ptr + *value_size > end) return NULL; + + /* read value */ + *value_out = ptr; + ptr += *value_size; + + /* read ttl and seq */ + if (ptr >= end) return NULL; + uint64_t ttl_u64; + ptr = decode_varint64(ptr, &ttl_u64); + *ttl = (int64_t)ttl_u64; + + if (ptr >= end) return NULL; + ptr = decode_varint64(ptr, seq); + + return ptr; +} + +/* + * tdb_preallocate_extent + * extends the logical file size and reserves on-disk blocks for the new region + * ahead of writes, so that subsequent pwrites within the preallocated extent do + * not take the kernel's "write extends file" fast path. on Linux ext4 this + * avoids the per-inode i_rwsem write lock; equivalent locks exist on macOS APFS + * (vnode write lock) and Windows NTFS (file-extension lock). 
+ * + * critical detail the logical EOF (i_size) MUST advance, not just the on-disk + * extent allocation. on Linux, fallocate(KEEP_SIZE) reserves blocks but leaves + * i_size unchanged, and the kernel still treats writes past i_size as extending + * writes -- delivering no speedup. mode 0 advances i_size and initializes the + * extents so subsequent pwrites are fully in-place. + * + * the trailing region is zero-filled. the caller must ftruncate back to the + * actual data extent on clean close so next-open validation isn't confused by + * trailing zeros. crash recovery should tolerate trailing zeros as preallocation + * tail (size_field == 0 marks the boundary between data and preallocated region). + * + * platform behavior: + * linux fallocate(fd, 0, off, len) -- advances i_size, initializes extents + * macos fcntl(F_PREALLOCATE) reserves, then ftruncate advances logical EOF + * windows SetFileInformationByHandle(FileAllocationInfo) reserves, then + * FileEndOfFileInfo advances EOF + * other posix posix_fallocate -- already advances EOF + * fallback returns -1, caller falls back to extending writes + * + * @param fd the file descriptor + * @param offset start of the region to preallocate (typically current EOF) + * @param len number of bytes to preallocate + * @return 0 on success, -1 on failure (non-fatal -- caller can continue) + */ +static inline int tdb_preallocate_extent(int fd, off_t offset, off_t len) +{ +#if defined(__linux__) + return fallocate(fd, 0, offset, len); +#elif defined(__APPLE__) + /* reserve blocks past current EOF (offset param is implicit on macOS) */ + (void)offset; + fstore_t fst; + fst.fst_flags = F_ALLOCATECONTIG | F_ALLOCATEALL; + fst.fst_posmode = F_PEOFPOSMODE; + fst.fst_offset = 0; + fst.fst_length = len; + fst.fst_bytesalloc = 0; + if (fcntl(fd, F_PREALLOCATE, &fst) == -1) + { + /* contiguous request failed, retry allowing fragmentation */ + fst.fst_flags = F_ALLOCATEALL; + if (fcntl(fd, F_PREALLOCATE, &fst) == -1) return -1; + } 
+ /* advance logical EOF so writes within the new region don't take the + * extending-write lock */ + return ftruncate(fd, offset + len); +#elif defined(_WIN32) + HANDLE h = (HANDLE)_get_osfhandle(fd); + if (h == INVALID_HANDLE_VALUE) return -1; + FILE_ALLOCATION_INFO fai; + fai.AllocationSize.QuadPart = (LONGLONG)(offset + len); + if (!SetFileInformationByHandle(h, FileAllocationInfo, &fai, sizeof(fai))) return -1; + /* advance logical EOF -- otherwise NTFS still treats writes past EOF as extending */ + FILE_END_OF_FILE_INFO eofi; + eofi.EndOfFile.QuadPart = (LONGLONG)(offset + len); + return SetFileInformationByHandle(h, FileEndOfFileInfo, &eofi, sizeof(eofi)) ? 0 : -1; +#elif defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L + int rc = posix_fallocate(fd, offset, len); + return rc == 0 ? 0 : -1; +#else + (void)fd; + (void)offset; + (void)len; + return -1; +#endif +} + +/* + * set_file_sequential_hint + * hints to the OS that file access will be sequential for read-ahead optimization + * @param fd the file descriptor + * @return 0 on success, -1 on failure (non-critical, can be ignored) + */ +static inline int set_file_sequential_hint(int fd) +{ +#ifdef __linux__ + return posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL); +#elif defined(__APPLE__) + return fcntl(fd, F_RDAHEAD, 1); +#elif defined(_WIN32) + /* _O_SEQUENTIAL flag set at open time via compat.h wrapper */ + (void)fd; /* unused on Windows */ + return 0; +#else + (void)fd; /* unused on other platforms */ + return 0; +#endif +} + +/* + * set_file_random_hint + * hints to the OS that file access will be random (disables read-ahead) + * useful for point lookups where sequential read-ahead wastes I/O + * @param fd the file descriptor + * @return 0 on success, -1 on failure (non-critical, can be ignored) + */ +static inline int set_file_random_hint(int fd) +{ +#ifdef __linux__ + return posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM); +#elif defined(__APPLE__) + return fcntl(fd, F_RDAHEAD, 0); +#elif 
defined(_WIN32) + /* _O_RANDOM flag would need to be set at open time + * for existing fd, we cant change this, so no-op */ + (void)fd; + return 0; +#else + (void)fd; + return 0; +#endif +} + +/* + * prefetch_file_region + * initiates non-blocking read of specified region into page cache + * useful when you know you'll need data soon (e.g., before decompression) + * @param fd the file descriptor + * @param offset starting offset to prefetch + * @param len number of bytes to prefetch (0 = until end of file) + * @return 0 on success, -1 on failure (non-critical, can be ignored) + */ +static inline int prefetch_file_region(int fd, off_t offset, off_t len) +{ +#ifdef __linux__ + return posix_fadvise(fd, offset, len, POSIX_FADV_WILLNEED); +#elif defined(__APPLE__) + /* on macos we utilize F_RDADVISE for read-ahead hint */ + struct radvisory ra; + ra.ra_offset = offset; + ra.ra_count = (int)(len > 0 ? len : (1024 * 1024)); /* default 1MB if len=0 */ + return fcntl(fd, F_RDADVISE, &ra); +#elif defined(_WIN32) + /* windows PrefetchVirtualMemory requires mapped memory + * for file-based prefetch, we do a small read to trigger caching on the system */ + (void)fd; + (void)offset; + (void)len; + return 0; +#else + (void)fd; + (void)offset; + (void)len; + return 0; +#endif +} + +/* + * evict_file_region + * hints to OS that specified region is no longer needed and can be evicted from cache + * useful after streaming reads (e.g., compaction) to prevent cache pollution + * call fsync/fdatasync first if dirty pages need to be written + * @param fd the file descriptor + * @param offset starting offset to evict + * @param len number of bytes to evict (0 = until end of file) + * @return 0 on success, -1 on failure (non-critical, can be ignored) + */ +static inline int evict_file_region(int fd, off_t offset, off_t len) +{ +#ifdef __linux__ + return posix_fadvise(fd, offset, len, POSIX_FADV_DONTNEED); +#elif defined(__APPLE__) + /* on macos F_NOCACHE disables caching for future I/O but 
doesn't evict + * theres no direct equivalent to POSIX_FADV_DONTNEED + * msync with MS_INVALIDATE on mmap'd regions is closest but requires mmap */ + (void)fd; + (void)offset; + (void)len; + return 0; +#elif defined(_WIN32) + /* no direct equivalent without memory mapping + * FILE_FLAG_NO_BUFFERING at open time is closest but requires alignment */ + (void)fd; + (void)offset; + (void)len; + return 0; +#else + (void)fd; + (void)offset; + (void)len; + return 0; +#endif +} + +/* + * set_file_noreuse_hint + * hints that specified region will be accessed only once (streaming) + * kernel page replacement can deprioritize these pages + * effective on Linux 6.3+ (was no-op from 2.6.18 to 6.2) + * @param fd the file descriptor + * @param offset starting offset + * @param len number of bytes (0 = until end of file) + * @return 0 on success, -1 on failure (non-critical, can be ignored) + */ +static inline int set_file_noreuse_hint(int fd, off_t offset, off_t len) +{ +#ifdef __linux__ + return posix_fadvise(fd, offset, len, POSIX_FADV_NOREUSE); +#elif defined(__APPLE__) + /* F_NOCACHE is similar -- tells system not to cache I/O + * this affects all future I/O on this fd, not just a region */ + (void)offset; + (void)len; + return fcntl(fd, F_NOCACHE, 1); +#elif defined(_WIN32) + /** FILE_FLAG_SEQUENTIAL_SCAN at open time is closest + * for existing fd, no equivalent */ + (void)fd; + (void)offset; + (void)len; + return 0; +#else + (void)fd; + (void)offset; + (void)len; + return 0; +#endif +} + +/** + * tdb_get_available_disk_space + * get available disk space for a given path + * @param path the path to check + * @param available pointer to store available bytes + * @return 0 on success, -1 on failure + */ +static inline int tdb_get_available_disk_space(const char *path, uint64_t *available) +{ + if (!path || !available) return -1; + +#if defined(_WIN32) + ULARGE_INTEGER free_bytes; + if (GetDiskFreeSpaceExA(path, &free_bytes, NULL, NULL)) + { + *available = 
(uint64_t)free_bytes.QuadPart; + return 0; + } + return -1; +#else + struct statvfs stat; + if (statvfs(path, &stat) == 0) + { + *available = (uint64_t)stat.f_bavail * (uint64_t)stat.f_frsize; + return 0; + } + return -1; +#endif +} + +/* cpu pause for spin-wait loops */ +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) +#ifdef _MSC_VER +#include +#define cpu_pause() _mm_pause() +#else +#define cpu_pause() __builtin_ia32_pause() +#endif +#elif defined(__aarch64__) || defined(_M_ARM64) +#ifdef _MSC_VER +#include +#define cpu_pause() __yield() +#else +#define cpu_pause() __asm__ __volatile__("yield" ::: "memory") +#endif +#elif defined(__arm__) || defined(_M_ARM) +#ifdef _MSC_VER +#include +#define cpu_pause() __yield() +#else +#define cpu_pause() __asm__ __volatile__("yield" ::: "memory") +#endif +#else +#define cpu_pause() ((void)0) +#endif + +/* cpu yield for longer waits -- gives up time slice to scheduler */ +#ifdef _WIN32 +#include +#define cpu_yield() SwitchToThread() +#else +#include +#define cpu_yield() sched_yield() +#endif + +/* + * tdb_hardlink + * portable hard link creation + * @param src existing file path + * @param dst new hard link path + * @return 0 on success, -1 on failure + */ +static inline int tdb_hardlink(const char *src, const char *dst) +{ + if (!src || !dst) return -1; +#ifdef _WIN32 + return CreateHardLinkA(dst, src, NULL) ? 0 : -1; +#else + return link(src, dst); +#endif +} + +/* + * tdb_unlink + * portable file deletion + * @param path the file path to delete + * @return 0 on success, -1 on failure + */ +static inline int tdb_unlink(const char *path) +{ + if (!path) return -1; +#ifdef _WIN32 + /* clear read-only attribute that might prevent deletion */ + SetFileAttributesA(path, FILE_ATTRIBUTE_NORMAL); + return _unlink(path); +#else + return unlink(path); +#endif +} + +/** + * is_directory_empty + * checks if a directory is empty (contains only . and ..) 
+ * @param path the directory path to check + * @return 1 if empty, 0 if not empty or error + */ +static inline int is_directory_empty(const char *path) +{ + DIR *dir = opendir(path); + if (!dir) return 0; + + struct dirent *entry; + int count = 0; + + while ((entry = readdir(dir)) != NULL) + { + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) continue; + count++; + break; /* found at least one entry */ + } + + closedir(dir); + return count == 0; +} + +/** + * remove_directory_once + * single pass of recursive directory removal + * @param path the directory path to remove + * @return 0 on success, -1 on failure + */ +static inline int remove_directory_once(const char *path) +{ + DIR *dir = opendir(path); + if (!dir) return -1; + + struct dirent *entry; + int result = 0; + + while ((entry = readdir(dir)) != NULL) + { + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) continue; + + size_t len = strlen(path) + strlen(PATH_SEPARATOR) + strlen(entry->d_name) + 1; + char *full_path = malloc(len); + if (!full_path) + { + result = -1; + continue; + } + + snprintf(full_path, len, "%s%s%s", path, PATH_SEPARATOR, entry->d_name); + + struct STAT_STRUCT st; + if (STAT_FUNC(full_path, &st) == 0) + { + if (S_ISDIR(st.st_mode)) + { + /* recursive call for subdirectory */ + if (remove_directory_once(full_path) != 0) result = -1; + } + else + { +#ifdef _WIN32 + /* clear read-only and other attributes that might prevent deletion */ + SetFileAttributesA(full_path, FILE_ATTRIBUTE_NORMAL); + if (_unlink(full_path) != 0) result = -1; +#else + if (unlink(full_path) != 0) result = -1; +#endif + } + } + + free(full_path); + } + + closedir(dir); + + /* we try to remove the directory itself */ +#ifdef _WIN32 + if (_rmdir(path) != 0) result = -1; +#else + if (rmdir(path) != 0) result = -1; +#endif + + return result; +} + +/** + * remove_directory + * recursively removes a directory and all its contents with retry logic + * retries if directory 
is not empty after deletion attempt (handles file locking) + * @param path the directory path to remove + * @return 0 on success, -1 on failure + */ +static inline int remove_directory(const char *path) +{ + DIR *dir = opendir(path); + if (!dir) return 0; /* already gone, success */ + closedir(dir); + + /* try up to 16 times with fixed 128ms delay */ + for (int attempt = 0; attempt < 16; attempt++) + { + /* attempt removal */ + (void)remove_directory_once(path); + + /* check if directory is gone or empty */ + dir = opendir(path); + if (!dir) + { + /* directory successfully removed */ + return 0; + } + + /* directory still exists, check if empty */ + if (is_directory_empty(path)) + { + closedir(dir); + /* empty but not removed, try rmdir directly */ +#ifdef _WIN32 + if (_rmdir(path) == 0) return 0; +#else + if (rmdir(path) == 0) return 0; +#endif + } + else + { + closedir(dir); + } + + /* directory not empty or removal failed, wait and retry */ + if (attempt < 15) + { +#ifdef _WIN32 + Sleep(128); +#else + usleep(128000); +#endif + } + } + + dir = opendir(path); + if (!dir) return 0; /* success */ + closedir(dir); + return -1; /* failed after all retries */ +} + +/** + * tdb_sync_directory + * syncs a directory to ensure directory entries (new files/subdirs) are persisted + * on POSIX systems, directory entries must be explicitly synced after mkdir/file creation + * on Windows, directory entries are immediately durable, so this is a no-op + * @param dir_path path to the directory to sync + * @return 0 on success, -1 on error (errors are non-fatal, just logged) + */ +static inline int tdb_sync_directory(const char *dir_path) +{ +#ifdef _WIN32 + /* Windows -- directory entries are immediately durable, no sync needed */ + (void)dir_path; + return 0; +#else + /* POSIX -- must fsync directory to persist directory entries */ + const int fd = open(dir_path, O_RDONLY); + if (fd < 0) + { + /* non-fatal -- directory might not support fsync (e.g., some network filesystems) */ + 
return -1; + } + const int result = fsync(fd); + close(fd); + return result; +#endif +} + +/** + * atomic_rename_file + * atomically renames a file from old_path to new_path + * on POSIX systems, rename() is atomic and replaces existing files + * on windows, rename() fails if target exists, so we remove it first + * @param old_path the current path of the file + * @param new_path the new path for the file + * @return 0 on success, -1 on failure + */ +static inline int atomic_rename_file(const char *old_path, const char *new_path) +{ + if (!old_path || !new_path) return -1; + +#ifdef _WIN32 + /* MoveFileEx with MOVEFILE_REPLACE_EXISTING for atomic rename on Windows + * this is truly atomic and replaces the target file if it exists */ + if (!MoveFileEx(old_path, new_path, MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH)) + { + errno = GetLastError(); + return -1; + } + + /* flush parent directory to ensure rename is durable + * extract directory from new_path */ + char dir_path[4096]; + const char *last_sep = strrchr(new_path, '\\'); + if (!last_sep) last_sep = strrchr(new_path, '/'); + if (last_sep && (size_t)(last_sep - new_path) < sizeof(dir_path) - 1) + { + size_t dir_len = last_sep - new_path; + memcpy(dir_path, new_path, dir_len); + dir_path[dir_len] = '\0'; + + /* open directory and flush */ + HANDLE dir_handle = CreateFile(dir_path, GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); + if (dir_handle != INVALID_HANDLE_VALUE) + { + FlushFileBuffers(dir_handle); + CloseHandle(dir_handle); + } + } + + return 0; +#else + /* POSIX rename() is atomic and replaces existing files */ + if (rename(old_path, new_path) != 0) + { + return -1; + } + + /* we sync parent directory to ensure rename metadata is durable + * this is critical for crash safety on non-journaling filesystems + * https://groups.google.com/g/comp.unix.programmer/c/AM2V83RCOVE?pli=1 + * https://man7.org/linux/man-pages/man2/rename.2.html + */ 
+ char dir_path[4096]; + const char *last_sep = strrchr(new_path, '/'); + if (last_sep && (size_t)(last_sep - new_path) < sizeof(dir_path) - 1) + { + size_t dir_len = last_sep - new_path; + memcpy(dir_path, new_path, dir_len); + dir_path[dir_len] = '\0'; + + const int dir_fd = open(dir_path, O_RDONLY); + if (dir_fd >= 0) + { + fsync(dir_fd); + close(dir_fd); + } + } + + return 0; +#endif +} + +/** + * atomic_rename_dir + * renames a directory from old_path to new_path + * on POSIX systems, rename() works for directories + * on Windows, rename() fails if target exists, so we use MoveFileEx + * NOTE: This does not replace existing directories -- caller must ensure target doesn't exist + * @param old_path the current path of the directory + * @param new_path the new path for the directory + * @return 0 on success, -1 on failure + */ +static inline int atomic_rename_dir(const char *old_path, const char *new_path) +{ + if (!old_path || !new_path) return -1; + +#ifdef _WIN32 + /* MoveFileEx works for directories on Windows + * Note -- MOVEFILE_REPLACE_EXISTING does not work for non-empty directories, + * so we don't use it here. Caller must ensure target doesn't exist. 
*/ + if (!MoveFileEx(old_path, new_path, MOVEFILE_WRITE_THROUGH)) + { + errno = GetLastError(); + return -1; + } + + return 0; +#else + /* POSIX rename() works for directories */ + if (rename(old_path, new_path) != 0) + { + return -1; + } + + /* sync parent directory for durability */ + char dir_path[4096]; + const char *last_sep = strrchr(new_path, '/'); + if (last_sep && (size_t)(last_sep - new_path) < sizeof(dir_path) - 1) + { + size_t dir_len = last_sep - new_path; + memcpy(dir_path, new_path, dir_len); + dir_path[dir_len] = '\0'; + + const int dir_fd = open(dir_path, O_RDONLY); + if (dir_fd >= 0) + { + fsync(dir_fd); + close(dir_fd); + } + } + + return 0; +#endif +} + +/** + * tdb_get_cpu_count + * gets the number of available CPU cores + * @return number of CPU cores, or 4 as fallback + */ +static inline int tdb_get_cpu_count(void) +{ +#ifdef _WIN32 + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + return (int)sysinfo.dwNumberOfProcessors; +#elif defined(__APPLE__) + int count; + size_t count_len = sizeof(count); + if (sysctlbyname("hw.logicalcpu", &count, &count_len, NULL, 0) == 0) + { + return count; + } + return 4; /* fallback */ +#else + /* POSIX systems (Linux, BSD, etc.) */ + long count = sysconf(_SC_NPROCESSORS_ONLN); + if (count > 0) + { + return (int)count; + } + return 4; /* fallback */ +#endif +} + +/** + * tdb_get_cpu_id + * gets the current CPU core ID the calling thread is running on + * used for NUMA-aware partition routing + * @return current CPU ID, or 0 as fallback + */ +static inline int tdb_get_cpu_id(void) +{ +#if defined(__linux__) && (defined(__GLIBC__) || defined(__GNU_LIBRARY__)) + /* sched_getcpu() is a fast vDSO call (~5ns) on modern Linux */ + extern int sched_getcpu(void); + int cpu = sched_getcpu(); + return cpu >= 0 ? 
cpu : 0; +#elif defined(_WIN32) + return (int)GetCurrentProcessorNumber(); +#else + return 0; /* fallback -- no CPU detection */ +#endif +} + +/* + * tdb_get_current_time + * cross-platform function to get current Unix timestamp in seconds + * @return current Unix timestamp in seconds + */ +static inline time_t tdb_get_current_time(void) +{ +#if defined(_WIN32) + SYSTEMTIME st; + FILETIME ft; + GetSystemTime(&st); + SystemTimeToFileTime(&st, &ft); + ULARGE_INTEGER ui; + ui.LowPart = ft.dwLowDateTime; + ui.HighPart = ft.dwHighDateTime; + return (time_t)((ui.QuadPart - 116444736000000000ULL) / 10000000ULL); +#else + return time(NULL); +#endif +} + +/** + * tdb_gmtime_r + * cross-platform thread-safe gmtime + * @param timep pointer to time_t value + * @param result pointer to struct tm to fill + * @return pointer to result on success, NULL on failure + */ +static inline struct tm *tdb_gmtime_r(const time_t *timep, struct tm *result) +{ +#if defined(_WIN32) + return (gmtime_s(result, timep) == 0) ? result : NULL; +#else + return gmtime_r(timep, result); +#endif +} + +/** + * tdb_fmemopen + * cross-platform fmemopen + * opens a memory buffer as a FILE stream for reading + * @param buf pointer to memory buffer + * @param size size of buffer in bytes + * @param mode fopen mode string (e.g. 
"rb") + * @return FILE pointer or NULL on failure + */ +static inline FILE *tdb_fmemopen(void *buf, size_t size, const char *mode) +{ +#if defined(_WIN32) + /* windows has no fmemopen -- we write to a temp file and reopen */ + (void)mode; + char temp_path[MAX_PATH]; + char temp_file[MAX_PATH]; + if (GetTempPathA(MAX_PATH, temp_path) == 0) return NULL; + if (GetTempFileNameA(temp_path, "tdb", 0, temp_file) == 0) return NULL; + + FILE *fp = fopen(temp_file, "wb"); + if (!fp) return NULL; + + if (size > 0 && buf) + { + if (fwrite(buf, 1, size, fp) != size) + { + fclose(fp); + DeleteFileA(temp_file); + return NULL; + } + } + fclose(fp); + + fp = fopen(temp_file, "rb"); + DeleteFileA(temp_file); /* the file stays open until fclose */ + return fp; +#else + return fmemopen(buf, size, mode); +#endif +} + +#ifndef _WIN32 +#include /* getrlimit / RLIMIT_NOFILE for tdb_max_open_files */ +#endif + +/* fallback open-file ceilings used when the OS limit cannot be queried */ +#define TDB_FALLBACK_MAX_OPEN_FILES_POSIX 1024 /* POSIX-typical default RLIMIT_NOFILE soft cap */ +#define TDB_FALLBACK_MAX_OPEN_FILES_WIN \ + 2048 /* conservative floor for the Windows CRT low-IO layer */ + +/** + * tdb_max_open_files + * report the process's maximum number of simultaneously open file descriptors, so callers can + * size their fd budgets (e.g. max_open_sstables) to fit the OS limit. returns a conservative + * fallback when the limit cannot be determined or is unlimited. + * @return the open-file ceiling as a long + */ +static inline long tdb_max_open_files(void) +{ +#if defined(_WIN32) + /* windows has no RLIMIT_NOFILE. the CRT low-IO layer permits a large but not directly + * queryable number of _open handles; _getmaxstdio reports the (smaller) stdio stream cap. + * use the larger of that and a conservative floor so we neither over- nor under-budget. */ + const int stdio_cap = _getmaxstdio(); + const long win_floor = TDB_FALLBACK_MAX_OPEN_FILES_WIN; + return (stdio_cap > win_floor) ? 
(long)stdio_cap : win_floor; +#else + struct rlimit rl; + if (getrlimit(RLIMIT_NOFILE, &rl) == 0 && rl.rlim_cur != RLIM_INFINITY && rl.rlim_cur > 0) + return (long)rl.rlim_cur; + return TDB_FALLBACK_MAX_OPEN_FILES_POSIX; +#endif +} + +/** + * tdb_raise_max_open_files + * raise THIS process's open-file ceiling toward `desired` descriptors and return the ceiling in + * effect afterwards. POSIX raises the RLIMIT_NOFILE soft limit toward the hard limit (never + * lowering it, clamped to the hard limit); Windows raises the CRT stdio cap via _setmaxstdio + * (clamped to its 8192 maximum). an explicit, opt-in action -- tidesdb never raises the limit on + * its own. a partial or failed raise is non-fatal: the prior ceiling simply stands. + * @param desired target descriptor count; <= 0 just reports the current ceiling without raising + * @return the open-file ceiling after the attempt + */ +static inline long tdb_raise_max_open_files(long desired) +{ + if (desired <= 0) return tdb_max_open_files(); +#if defined(_WIN32) + if (desired > 8192) desired = 8192; /* _setmaxstdio hard maximum */ + if (desired > _getmaxstdio()) _setmaxstdio((int)desired); +#else + struct rlimit rl; + if (getrlimit(RLIMIT_NOFILE, &rl) == 0) + { + rlim_t target = (rlim_t)desired; + if (rl.rlim_max != RLIM_INFINITY && target > rl.rlim_max) target = rl.rlim_max; + /* macOS (and some BSDs) reject a soft limit above a kernel per-process cap even when the + * hard limit reads higher/unlimited, so back off and retry rather than giving up -- this + * lands the soft limit near the real ceiling instead of leaving it at the low default. 
*/ + const rlim_t floor = rl.rlim_cur; + while (target > rl.rlim_cur) + { + struct rlimit attempt = rl; + attempt.rlim_cur = target; + if (setrlimit(RLIMIT_NOFILE, &attempt) == 0) + { + rl.rlim_cur = target; + break; + } + if (target <= floor + 1) break; /* even the smallest raise was refused */ + target = floor + (target - floor) / 2; + } + } +#endif + return tdb_max_open_files(); +} + +#endif /* __COMPAT_H__ */ diff --git a/storage/tidesdb/libtidesdb/src/compress.c b/storage/tidesdb/libtidesdb/src/compress.c new file mode 100644 index 0000000000000..8b0e9930aa198 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/compress.c @@ -0,0 +1,252 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compress.h" + +/* the compression_algorithm enum values are an on-disk + ABI contract, they are written into + * sstable/vlog metadata, so they must never change, and the duplicate enum in db.h (the + * standalone FFI header, which cannot include this header) MUST hold identical values. pin them + * at compile time so any drift in compress.h fails the build; db.h carries the matching contract + * comment. guarded on C11 so older/non-conforming C front-ends still compile. 
*/ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +_Static_assert(TDB_COMPRESS_NONE == 0, "compression_algorithm wire drift: NONE must be 0"); +#ifndef __sun +_Static_assert(TDB_COMPRESS_SNAPPY == 1, "compression_algorithm wire drift: SNAPPY must be 1"); +#endif +_Static_assert(TDB_COMPRESS_LZ4 == 2, "compression_algorithm wire drift: LZ4 must be 2"); +_Static_assert(TDB_COMPRESS_ZSTD == 3, "compression_algorithm wire drift: ZSTD must be 3"); +_Static_assert(TDB_COMPRESS_LZ4_FAST == 4, "compression_algorithm wire drift: LZ4_FAST must be 4"); +#endif + +uint8_t *compress_data(const uint8_t *data, const size_t data_size, size_t *compressed_size, + const compression_algorithm type) +{ + uint8_t *compressed_data = NULL; + + if (TDB_UNLIKELY(!data)) + { + return NULL; + } + + switch (type) + { +#ifndef __sun + case TDB_COMPRESS_SNAPPY: + { + *compressed_size = snappy_max_compressed_length(data_size); + const size_t total_size = *compressed_size + sizeof(uint64_t); + compressed_data = malloc(total_size); + if (TDB_UNLIKELY(!compressed_data)) return NULL; + + encode_uint64_le_compat(compressed_data, data_size); + + size_t actual_size = *compressed_size; + if (TDB_UNLIKELY(snappy_compress((const char *)data, data_size, + (char *)(compressed_data + sizeof(uint64_t)), + &actual_size) != SNAPPY_OK)) + { + free(compressed_data); + return NULL; + } + + *compressed_size = actual_size + sizeof(uint64_t); + break; + } +#endif + + case TDB_COMPRESS_LZ4: + case TDB_COMPRESS_LZ4_FAST: + { + *compressed_size = (size_t)LZ4_compressBound((int)data_size); + const size_t total_size = *compressed_size + sizeof(uint64_t); + compressed_data = malloc(total_size); + if (TDB_UNLIKELY(!compressed_data)) return NULL; + + encode_uint64_le_compat(compressed_data, data_size); + + /* unified LZ4 path-- acceleration=1 for default, acceleration=2 for fast */ + const int acceleration = (type == TDB_COMPRESS_LZ4_FAST) ? 
2 : 1; + const int lz4_result = + LZ4_compress_fast((const char *)data, (char *)(compressed_data + sizeof(uint64_t)), + (int)data_size, (int)*compressed_size, acceleration); + if (TDB_UNLIKELY(lz4_result <= 0)) + { + free(compressed_data); + return NULL; + } + + *compressed_size = (size_t)lz4_result + sizeof(uint64_t); + break; + } + + case TDB_COMPRESS_ZSTD: + { + *compressed_size = ZSTD_compressBound(data_size); + const size_t total_size = *compressed_size + sizeof(uint64_t); + compressed_data = malloc(total_size); + if (TDB_UNLIKELY(!compressed_data)) return NULL; + + encode_uint64_le_compat(compressed_data, data_size); + + const size_t actual_size = ZSTD_compress(compressed_data + sizeof(uint64_t), + *compressed_size, data, data_size, 1); + if (TDB_UNLIKELY(ZSTD_isError(actual_size))) + { + free(compressed_data); + return NULL; + } + + *compressed_size = actual_size + sizeof(uint64_t); + break; + } + + default: + return NULL; + } + + /* shrink buffer to actual compressed size to save memory and improve cache + * when the compressed data is stored or transmitted */ + if (TDB_LIKELY(compressed_data != NULL)) + { + uint8_t *shrunk = realloc(compressed_data, *compressed_size); + if (TDB_LIKELY(shrunk != NULL)) + { + compressed_data = shrunk; + } + } + + return compressed_data; +} + +uint8_t *decompress_data(const uint8_t *data, const size_t data_size, size_t *decompressed_size, + const compression_algorithm type) +{ + uint8_t *decompressed_data = NULL; + + if (TDB_UNLIKELY(!data)) return NULL; + + switch (type) + { +#ifndef __sun + case TDB_COMPRESS_SNAPPY: + { + if (TDB_UNLIKELY(data_size < sizeof(uint64_t))) + { + return NULL; + } + + const uint64_t original_size = decode_uint64_le_compat(data); + + if (TDB_UNLIKELY(original_size > UINT32_MAX)) + { + return NULL; + } + + *decompressed_size = (size_t)original_size; + + decompressed_data = malloc(*decompressed_size); + if (TDB_UNLIKELY(!decompressed_data)) return NULL; + + if (TDB_UNLIKELY(snappy_uncompress((const 
char *)(data + sizeof(uint64_t)), + data_size - sizeof(uint64_t), + (char *)decompressed_data, + decompressed_size) != SNAPPY_OK)) + { + free(decompressed_data); + return NULL; + } + /* verify produced length matches the size prefix, mirroring the LZ4/ZSTD branches. + * snappy_uncompress can succeed with a shorter output that still fits the buffer, + * which would otherwise pass silently. */ + if (TDB_UNLIKELY(*decompressed_size != (size_t)original_size)) + { + free(decompressed_data); + return NULL; + } + break; + } +#endif + + case TDB_COMPRESS_LZ4: + case TDB_COMPRESS_LZ4_FAST: + { + if (TDB_UNLIKELY(data_size < sizeof(uint64_t))) + { + return NULL; + } + + const uint64_t original_size = decode_uint64_le_compat(data); + + if (TDB_UNLIKELY(original_size > UINT32_MAX)) + { + return NULL; + } + + *decompressed_size = (size_t)original_size; + + decompressed_data = malloc(*decompressed_size); + if (TDB_UNLIKELY(!decompressed_data)) return NULL; + + const int lz4_result = LZ4_decompress_safe( + (const char *)(data + sizeof(uint64_t)), (char *)decompressed_data, + (int)(data_size - sizeof(uint64_t)), (int)*decompressed_size); + if (TDB_UNLIKELY(lz4_result < 0 || lz4_result != (int)*decompressed_size)) + { + free(decompressed_data); + return NULL; + } + break; + } + + case TDB_COMPRESS_ZSTD: + { + if (TDB_UNLIKELY(data_size < sizeof(uint64_t))) + { + return NULL; + } + + const uint64_t original_size = decode_uint64_le_compat(data); + + if (TDB_UNLIKELY(original_size > UINT32_MAX)) + { + return NULL; + } + + *decompressed_size = (size_t)original_size; + + decompressed_data = malloc(*decompressed_size); + if (TDB_UNLIKELY(!decompressed_data)) return NULL; + + const size_t zstd_result = + ZSTD_decompress(decompressed_data, *decompressed_size, data + sizeof(uint64_t), + data_size - sizeof(uint64_t)); + if (TDB_UNLIKELY(ZSTD_isError(zstd_result) || zstd_result != *decompressed_size)) + { + free(decompressed_data); + return NULL; + } + break; + } + + default: + return NULL; + 
} + + return decompressed_data; +} \ No newline at end of file diff --git a/storage/tidesdb/libtidesdb/src/compress.h b/storage/tidesdb/libtidesdb/src/compress.h new file mode 100644 index 0000000000000..91a8f4666ee56 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/compress.h @@ -0,0 +1,69 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __COMPRESS_H__ +#define __COMPRESS_H__ +#include +#ifndef __sun +#include +#endif +#include + +#include "compat.h" + +/* snappy, lz4, zstd supported to use for compression purposes */ +/* snappy is not available on SunOS/OmniOS/Illumos */ +/* ABI/on-disk contract, these numeric values are persisted in sstable/vlog metadata and are + * duplicated in db.h (the standalone FFI header). the two copies MUST stay identical; compress.c + * pins these values with _Static_assert to catch drift at build time. 
*/ +typedef enum +{ + TDB_COMPRESS_NONE = 0, +#ifndef __sun + TDB_COMPRESS_SNAPPY = 1, +#endif + TDB_COMPRESS_LZ4 = 2, + TDB_COMPRESS_ZSTD = 3, + TDB_COMPRESS_LZ4_FAST = 4, +} compression_algorithm; + +/** + * compress_data + * compresses data using the specified compression algorithm + * @param data the data to compress + * @param data_size the size of the data + * @param compressed_size the size of the compressed data + * @param type the compression algorithm to use + * @return the compressed data + */ +uint8_t *compress_data(const uint8_t *data, size_t data_size, size_t *compressed_size, + compression_algorithm type); + +/** + * decompress_data + * decompresses data using the specified compression algorithm + * @param data the data to decompress + * @param data_size the size of the data + * @param decompressed_size the size of the decompressed data + * @param type the compression algorithm to use + * @return the decompressed data + */ +uint8_t *decompress_data(const uint8_t *data, size_t data_size, size_t *decompressed_size, + compression_algorithm type); + +#endif /* __COMPRESS_H__ */ diff --git a/storage/tidesdb/libtidesdb/src/db.h b/storage/tidesdb/libtidesdb/src/db.h new file mode 100644 index 0000000000000..9b252c525337a --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/db.h @@ -0,0 +1,838 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __TIDESDB_DB_H__ +#define __TIDESDB_DB_H__ + +#include +#include +#include + +/** + * tidesdb_objstore_backend_t + * identifies the object store backend in use + */ +typedef enum +{ + TDB_BACKEND_FS = 0, + TDB_BACKEND_S3 = 1, + TDB_BACKEND_UNKNOWN = 99 +} tidesdb_objstore_backend_t; + +/** opaque types for FFI bindings (Java, etc.) */ +struct tidesdb_t +{ + int _opaque; +}; +struct tidesdb_column_family_t +{ + int _opaque; +}; +struct tidesdb_txn_t +{ + int _opaque; +}; +struct tidesdb_iter_t +{ + int _opaque; +}; +struct tidesdb_objstore_t +{ + int _opaque; +}; + +typedef struct tidesdb_t tidesdb_t; +typedef struct tidesdb_column_family_t tidesdb_column_family_t; +typedef struct tidesdb_txn_t tidesdb_txn_t; +typedef struct tidesdb_iter_t tidesdb_iter_t; +typedef struct tidesdb_objstore_t tidesdb_objstore_t; + +/** + * tidesdb_objstore_config_t + * configuration for object store mode behavior + * @param local_cache_path local directory for cached sstable files (NULL = use db_path) + * @param local_cache_max_bytes max local cache size in bytes (0 = unlimited) + * @param cache_on_read cache downloaded files locally (default 1) + * @param cache_on_write keep local copy after upload (default 1) + * @param max_concurrent_uploads parallel upload threads (default 4) + * @param max_concurrent_downloads parallel download threads (default 8) + * @param multipart_threshold use multipart upload above this size (default 64MB) + * @param multipart_part_size multipart chunk size (default 8MB) + * @param sync_manifest_to_object upload MANIFEST after each compaction (default 1) + * @param replicate_wal upload closed WAL segments for node-failure recovery (default 1) + * @param wal_upload_sync 0 = background WAL upload (default), 1 = block flush until uploaded + * @param wal_sync_threshold_bytes sync active WAL when it grows by this many bytes (default 1MB, 0 + * = off) + * @param wal_sync_on_commit upload WAL after every txn commit for RPO=0 replication (default 0) + 
* @param replica_mode enable read-only replica mode (default 0) + * @param replica_sync_interval_us MANIFEST poll interval in microseconds (default 5000000) + * @param replica_replay_wal replay WAL for near-real-time reads on replicas (default 1) + */ +typedef struct +{ + const char *local_cache_path; + size_t local_cache_max_bytes; + int cache_on_read; + int cache_on_write; + int max_concurrent_uploads; + int max_concurrent_downloads; + size_t multipart_threshold; + size_t multipart_part_size; + int sync_manifest_to_object; + int replicate_wal; + int wal_upload_sync; + size_t wal_sync_threshold_bytes; + int wal_sync_on_commit; + int replica_mode; + uint64_t replica_sync_interval_us; + int replica_replay_wal; +} tidesdb_objstore_config_t; + +tidesdb_objstore_config_t tidesdb_objstore_default_config(void); + +/** debug logging levels */ +typedef enum +{ + TDB_LOG_DEBUG = 0, + TDB_LOG_INFO = 1, + TDB_LOG_WARN = 2, + TDB_LOG_ERROR = 3, + TDB_LOG_FATAL = 4, + TDB_LOG_NONE = 99 +} tidesdb_log_level_t; + +/** txn isolation levels */ +typedef enum +{ + TDB_ISOLATION_READ_UNCOMMITTED = 0, + TDB_ISOLATION_READ_COMMITTED = 1, + TDB_ISOLATION_REPEATABLE_READ = 2, + TDB_ISOLATION_SNAPSHOT = 3, + TDB_ISOLATION_SERIALIZABLE = 4 +} tidesdb_isolation_level_t; + +/** compression algorithms */ +/* ABI/on-disk contract, these numeric values are persisted in sstable/vlog metadata and are + * duplicated in compress.h. the two copies MUST stay identical -- compress.c _Static_asserts the + * compress.h copy; keep this copy in lockstep. 
*/ +typedef enum +{ + TDB_COMPRESS_NONE = 0, +#ifndef __sun + TDB_COMPRESS_SNAPPY = 1, +#endif + TDB_COMPRESS_LZ4 = 2, + TDB_COMPRESS_ZSTD = 3, + TDB_COMPRESS_LZ4_FAST = 4 +} compression_algorithm; + +/** column family sync modes */ +typedef enum +{ + TDB_SYNC_NONE = 0, + TDB_SYNC_FULL = 1, + TDB_SYNC_INTERVAL = 2 +} tidesdb_sync_mode_t; + +/** system error codes */ +#define TDB_SUCCESS 0 +#define TDB_ERR_MEMORY -1 +#define TDB_ERR_INVALID_ARGS -2 +#define TDB_ERR_NOT_FOUND -3 +#define TDB_ERR_IO -4 +#define TDB_ERR_CORRUPTION -5 +#define TDB_ERR_EXISTS -6 +#define TDB_ERR_CONFLICT -7 +#define TDB_ERR_TOO_LARGE -8 +#define TDB_ERR_MEMORY_LIMIT -9 +#define TDB_ERR_INVALID_DB -10 +#define TDB_ERR_UNKNOWN -11 +#define TDB_ERR_LOCKED -12 +#define TDB_ERR_READONLY -13 +#define TDB_ERR_BUSY -14 + +/** configuration limits */ +#define TDB_MAX_COMPARATOR_NAME 64 +#define TDB_MAX_COMPARATOR_CTX 256 +#define TDB_MAX_CF_NAME_LEN 128 + +/** comparator function type */ +typedef int (*tidesdb_comparator_fn)(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/** + * tidesdb_commit_op_t + * represents a single operation in a committed transaction batch + * passed to the commit hook callback + * @param key pointer to key data (valid only during callback invocation) + * @param key_size size of key in bytes + * @param value pointer to value data (NULL for deletes, valid only during callback invocation) + * @param value_size size of value in bytes (0 for deletes) + * @param ttl time-to-live for the key-value pair (0 = no expiry) + * @param is_delete 1 if this is a delete operation, 0 for put + */ +typedef struct tidesdb_commit_op_t +{ + const uint8_t *key; + size_t key_size; + const uint8_t *value; + size_t value_size; + time_t ttl; + int is_delete; +} tidesdb_commit_op_t; + +/** + * tidesdb_commit_hook_fn + * callback function invoked synchronously after a transaction commits to a column family + * the callback receives the full batch of 
operations for that CF atomically + * the hook fires after WAL write, memtable apply, and commit status marking are complete + * hook failure is logged but does not roll back the commit (data is already durable) + * + * @param ops array of committed operations (valid only during callback invocation) + * @param num_ops number of operations in the array + * @param commit_seq monotonic commit sequence number + * @param ctx user-provided context pointer + * @return 0 on success, non-zero on failure (logged as warning) + */ +typedef int (*tidesdb_commit_hook_fn)(const tidesdb_commit_op_t *ops, int num_ops, + uint64_t commit_seq, void *ctx); + +/** + * tidesdb_column_family_config_t + * configuration for a column family + * @param name name of column family + * @param write_buffer_size size of write buffer + * @param level_size_ratio ratio of level sizes + * @param min_levels minimum number of levels + * @param dividing_level_offset offset for dividing level + * @param klog_value_threshold threshold for klog value + * @param compression_algorithm compression algorithm + * @param enable_bloom_filter enable bloom filter + * @param bloom_fpr bloom filter false positive rate + * @param enable_block_indexes enable block indexes + * @param index_sample_ratio index sample ratio + * @param block_index_prefix_len block index prefix length + * @param sync_mode sync mode + * @param sync_interval_us sync interval in microseconds + * @param comparator_name name of comparator + * @param comparator_ctx_str comparator context string + * @param comparator_fn_cached cached comparator function + * @param comparator_ctx_cached cached comparator context + * @param skip_list_max_level skip list max level + * @param skip_list_probability skip list probability + * @param default_isolation_level default isolation level + * @param min_disk_space minimum free disk space required (bytes) + * @param l1_file_count_trigger trigger for L1 file count, utilized for compaction triggering + * @param 
l0_queue_stall_threshold threshold for L0 queue stall, utilized for backpressure + * @param tombstone_density_trigger ratio in [0.0, 1.0] above which any single sstable's + * tombstone density (tombstone_count / num_entries) escalates + * compaction priority; 0.0 disables the check (default). + * sstables with fewer than tombstone_density_min_entries are + * ignored to prevent tiny-sstable noise. + * @param tombstone_density_min_entries minimum entry count for an sstable to be considered by + * the density trigger; 0 falls back to the default + * @param use_btree whether btree is used + * @param commit_hook_fn optional commit hook callback (NULL = disabled, runtime-only) + * @param commit_hook_ctx optional user context passed to commit hook (runtime-only) + * @param object_target_file_size reserved for API compatibility, not used.. will be retired + * completely + * @param object_lazy_compaction 1 = compact less aggressively in object store mode (default 0) + * @param object_prefetch_compaction 1 = download all inputs before merge (default 1) + */ +typedef struct tidesdb_column_family_config_t +{ + char name[TDB_MAX_CF_NAME_LEN]; + size_t write_buffer_size; + size_t level_size_ratio; + int min_levels; + int dividing_level_offset; + size_t klog_value_threshold; + compression_algorithm compression_algorithm; + int enable_bloom_filter; + double bloom_fpr; + int enable_block_indexes; + int index_sample_ratio; + int block_index_prefix_len; + int sync_mode; + uint64_t sync_interval_us; + char comparator_name[TDB_MAX_COMPARATOR_NAME]; + char comparator_ctx_str[TDB_MAX_COMPARATOR_CTX]; + void *comparator_fn_cached; + void *comparator_ctx_cached; + int skip_list_max_level; + float skip_list_probability; + tidesdb_isolation_level_t default_isolation_level; + uint64_t min_disk_space; + int l1_file_count_trigger; + int l0_queue_stall_threshold; + double tombstone_density_trigger; + uint64_t tombstone_density_min_entries; + int use_btree; + tidesdb_commit_hook_fn 
commit_hook_fn; + void *commit_hook_ctx; + size_t object_target_file_size; /* reserved, not used */ + int object_lazy_compaction; + int object_prefetch_compaction; +} tidesdb_column_family_config_t; + +/** + * tidesdb_config_t + * configuration for the database + * @param db_path path to the database + * @param num_flush_threads number of flush threads + * @param num_compaction_threads number of compaction threads + * @param log_level minimum log level to display (TDB_LOG_DEBUG, TDB_LOG_INFO, TDB_LOG_WARN, + * TDB_LOG_ERROR, TDB_LOG_FATAL, TDB_LOG_NONE) + * @param block_cache_size size of clock cache for hot sstable blocks + * @param max_open_sstables maximum number of open sstables + * @param log_to_file flag to determine if debug logging should be written to a file + * @param log_truncation_at size in bytes at which to truncate the log file, 0 = no truncation + * @param max_memory_usage maximum memory usage for the database + * @param unified_memtable flag to determine if unified memtable should be used + * @param unified_memtable_write_buffer_size write buffer size for unified memtable (0 = auto) + * @param unified_memtable_skip_list_max_level skip list max level for unified memtable (0 = default + * 12) + * @param unified_memtable_skip_list_probability skip list probability (0 = default 0.25) + * @param unified_memtable_sync_mode sync mode for unified WAL (default TDB_SYNC_NONE) + * @param unified_memtable_sync_interval_us sync interval for unified WAL (0 = default) + * @param object_store pluggable object store connector (NULL = local only, default) + * @param object_store_config object store behavior configuration (NULL = use defaults) + * @param max_concurrent_flushes global semaphore on the number of in-flight memtable flushes + * across all column families. bounds total transient memory and + * work-queue depth when many column families flush at once. + * 0 falls back to TDB_DEFAULT_MAX_CONCURRENT_FLUSHES. 
+ */ +typedef struct tidesdb_config_t +{ + char *db_path; + int num_flush_threads; + int num_compaction_threads; + tidesdb_log_level_t log_level; + size_t block_cache_size; + size_t max_open_sstables; + int log_to_file; + size_t log_truncation_at; + size_t max_memory_usage; + int unified_memtable; + size_t unified_memtable_write_buffer_size; + int unified_memtable_skip_list_max_level; + float unified_memtable_skip_list_probability; + int unified_memtable_sync_mode; + uint64_t unified_memtable_sync_interval_us; + tidesdb_objstore_t *object_store; + tidesdb_objstore_config_t *object_store_config; + int max_concurrent_flushes; +} tidesdb_config_t; + +/** + * tidesdb_stats_t + * statistics for database column family + * @param num_levels number of levels + * @param memtable_size size of memtable + * @param level_sizes sizes of each level + * @param level_num_sstables number of sstables in each level + * @param config column family configuration + * @param total_keys total number of keys across memtable and all sstables + * @param total_data_size total data size (klog + vlog) across all sstables + * @param avg_key_size average key size in bytes + * @param avg_value_size average value size in bytes + * @param level_key_counts number of keys per level + * @param read_amp read amplification (point lookup cost multiplier) + * @param hit_rate cache hit rate (0.0 if cache disabled) + * @param use_btree whether btree is used + * @param btree_total_nodes total number of nodes in btree + * @param btree_max_height maximum height of btree + * @param btree_avg_height average height of btree + * @param total_tombstones sum of tombstone_count across every sstable in the cf + * @param tombstone_ratio total_tombstones / total_keys (0.0 if total_keys is 0) + * @param level_tombstone_counts tombstone count per level (parallels level_key_counts) + * @param max_sst_density worst per-sstable tombstone density observed in the cf + * @param max_sst_density_level 1-based level where 
max_sst_density was observed (0 if none) + */ +typedef struct tidesdb_stats_t +{ + int num_levels; + size_t memtable_size; + size_t *level_sizes; + int *level_num_sstables; + tidesdb_column_family_config_t *config; + uint64_t total_keys; + uint64_t total_data_size; + double avg_key_size; + double avg_value_size; + uint64_t *level_key_counts; + double read_amp; + double hit_rate; + int use_btree; + uint64_t btree_total_nodes; + uint32_t btree_max_height; + double btree_avg_height; + uint64_t total_tombstones; + double tombstone_ratio; + uint64_t *level_tombstone_counts; + double max_sst_density; + int max_sst_density_level; +} tidesdb_stats_t; + +/** + * tidesdb_cache_stats_t + * statistics for database block cache + * @param enabled whether block cache is enabled + * @param total_entries total number of cached entries + * @param total_bytes total bytes used by cache + * @param hits cache hits + * @param misses cache misses + * @param hit_rate hit rate (hits / (hits + misses)) + * @param num_partitions number of cache partitions + */ +typedef struct tidesdb_cache_stats_t +{ + int enabled; + size_t total_entries; + size_t total_bytes; + uint64_t hits; + uint64_t misses; + double hit_rate; + size_t num_partitions; +} tidesdb_cache_stats_t; + +/** + * tidesdb_db_stats_t + * database-level statistics + * @param num_column_families number of column families + * @param total_memory system total memory + * @param available_memory system available memory at open + * @param resolved_memory_limit resolved memory limit + * @param memory_pressure_level current memory pressure level (0=normal, 1=elevated, 2=high, + * 3=critical) + * @param flush_pending_count number of pending flush operations (queued + in-flight) + * @param total_memtable_bytes total bytes in active memtables across all CFs + * @param total_immutable_count total immutable memtables across all CFs + * @param total_sstable_count total sstables across all CFs and levels + * @param total_data_size_bytes total data 
size across all CFs + * @param num_open_sstables number of currently open sstable file handles + * @param global_seq current global sequence number + * @param txn_memory_bytes bytes held by in-flight transactions + * @param compaction_queue_size number of pending compaction tasks + * @param flush_queue_size number of pending flush tasks in queue + * @param unified_memtable_enabled whether unified memtable mode is active + * @param unified_memtable_bytes bytes in unified active memtable + * @param unified_immutable_count number of unified immutable memtables + * @param unified_is_flushing whether unified memtable is currently flushing/rotating + * @param unified_next_cf_index next CF index to be assigned in unified mode + * @param unified_wal_generation current unified WAL generation counter + * @param object_store_enabled whether object store mode is active + * @param object_store_connector connector name ("s3", "gcs", "fs", etc.) + * @param local_cache_bytes_used current local file cache usage in bytes + * @param local_cache_bytes_max configured maximum local cache size in bytes + * @param local_cache_num_files number of files tracked in local cache + * @param last_uploaded_generation highest WAL generation confirmed uploaded + * @param upload_queue_depth number of pending upload jobs in the queue + * @param total_uploads lifetime count of objects uploaded to object store + * @param total_upload_failures lifetime count of permanently failed uploads (after all retries) + * @param replica_mode whether running in read-only replica mode + */ +typedef struct tidesdb_db_stats_t +{ + int num_column_families; + uint64_t total_memory; + uint64_t available_memory; + size_t resolved_memory_limit; + int memory_pressure_level; + int flush_pending_count; + int64_t total_memtable_bytes; + int total_immutable_count; + int total_sstable_count; + uint64_t total_data_size_bytes; + int num_open_sstables; + uint64_t global_seq; + int64_t txn_memory_bytes; + size_t 
compaction_queue_size; + size_t flush_queue_size; + int unified_memtable_enabled; + int64_t unified_memtable_bytes; + int unified_immutable_count; + int unified_is_flushing; + uint32_t unified_next_cf_index; + uint64_t unified_wal_generation; + int object_store_enabled; + const char *object_store_connector; + size_t local_cache_bytes_used; + size_t local_cache_bytes_max; + int local_cache_num_files; + uint64_t last_uploaded_generation; + size_t upload_queue_depth; + uint64_t total_uploads; + uint64_t total_upload_failures; + int replica_mode; +} tidesdb_db_stats_t; + +/**** system default configuration functions */ +tidesdb_column_family_config_t tidesdb_default_column_family_config(void); +tidesdb_config_t tidesdb_default_config(void); + +/** + * tidesdb_raise_open_file_limit + * raise this process's open-file ceiling toward `desired` descriptors so a database can keep more + * sstables open -- the engine sizes max_open_sstables to fit this at open time, so call it BEFORE + * tidesdb_open. an explicit, opt-in operator action: tidesdb never raises the limit itself. POSIX + * (Linux, macOS, the BSDs, illumos) raises the RLIMIT_NOFILE soft limit toward the hard limit; + * Windows raises the CRT stdio cap (max 8192). a failed or partial raise is non-fatal. 
+ * @param desired target descriptor count; <= 0 just reports the current ceiling + * @return the open-file ceiling in effect after the attempt + */ +long tidesdb_raise_open_file_limit(long desired); + +/**** initialization and custom allocator support */ + +/** + * tidesdb_malloc_fn + * function pointer type for malloc-like allocation + * @param size number of bytes to allocate + * @return pointer to allocated memory or NULL on failure + */ +typedef void *(*tidesdb_malloc_fn)(size_t size); + +/** + * tidesdb_calloc_fn + * function pointer type for calloc-like allocation + * @param count number of elements to allocate + * @param size size of each element in bytes + * @return pointer to zero-initialized memory or NULL on failure + */ +typedef void *(*tidesdb_calloc_fn)(size_t count, size_t size); + +/** + * tidesdb_realloc_fn + * function pointer type for realloc-like reallocation + * @param ptr pointer to previously allocated memory (or NULL) + * @param size new size in bytes + * @return pointer to reallocated memory or NULL on failure + */ +typedef void *(*tidesdb_realloc_fn)(void *ptr, size_t size); + +/** + * tidesdb_free_fn + * function pointer type for free-like deallocation + * @param ptr pointer to memory to free (may be NULL) + */ +typedef void (*tidesdb_free_fn)(void *ptr); + +/** + * tidesdb_init + * initializes TidesDB with optional custom memory allocation functions + * MUST be called exactly once before any other TidesDB function + * pass NULL for any function to use the default system allocator + * + * Example (Redis module): + * tidesdb_init(RedisModule_Alloc, RedisModule_Calloc, + * RedisModule_Realloc, RedisModule_Free); + * + * Example (system allocator): + * tidesdb_init(NULL, NULL, NULL, NULL); + * + * @param malloc_fn custom malloc function (or NULL for system malloc) + * @param calloc_fn custom calloc function (or NULL for system calloc) + * @param realloc_fn custom realloc function (or NULL for system realloc) + * @param free_fn custom free 
function (or NULL for system free) + * @return 0 on success, -1 if already initialized + */ +int tidesdb_init(tidesdb_malloc_fn malloc_fn, tidesdb_calloc_fn calloc_fn, + tidesdb_realloc_fn realloc_fn, tidesdb_free_fn free_fn); + +/** + * tidesdb_finalize + * finalizes TidesDB and resets the allocator + * should be called after all TidesDB operations are complete + * after calling this, tidesdb_init() can be called again + */ +void tidesdb_finalize(void); + +/**** database operations */ +int tidesdb_open(const tidesdb_config_t *config, tidesdb_t **db); +int tidesdb_close(tidesdb_t *db); + +/**** comparator operations */ +int tidesdb_register_comparator(tidesdb_t *db, const char *name, tidesdb_comparator_fn fn, + const char *ctx_str, void *ctx); +int tidesdb_get_comparator(tidesdb_t *db, const char *name, tidesdb_comparator_fn *fn, void **ctx); + +/**** column family operations */ +int tidesdb_create_column_family(tidesdb_t *db, const char *name, + const tidesdb_column_family_config_t *config); +int tidesdb_drop_column_family(tidesdb_t *db, const char *name); +int tidesdb_delete_column_family(tidesdb_t *db, tidesdb_column_family_t *cf); + +/** + * tidesdb_rename_column_family + * atomically renames a column family and its underlying directory + * waits for any in-progress flush/compaction to complete before renaming + * @param db database handle + * @param old_name current name of the column family + * @param new_name new name for the column family + * @return TDB_SUCCESS, TDB_ERR_NOT_FOUND, TDB_ERR_EXISTS, or TDB_ERR_IO + */ +int tidesdb_rename_column_family(tidesdb_t *db, const char *old_name, const char *new_name); +tidesdb_column_family_t *tidesdb_get_column_family(tidesdb_t *db, const char *name); +int tidesdb_list_column_families(tidesdb_t *db, char ***names, int *count); + +/**** transaction operations */ +int tidesdb_txn_begin(tidesdb_t *db, tidesdb_txn_t **txn); +int tidesdb_txn_begin_with_isolation(tidesdb_t *db, tidesdb_isolation_level_t isolation, + 
tidesdb_txn_t **txn); +int tidesdb_txn_put(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, const uint8_t *key, + size_t key_size, const uint8_t *value, size_t value_size, time_t ttl); +int tidesdb_txn_get(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, const uint8_t *key, + size_t key_size, uint8_t **value, size_t *value_size); +int tidesdb_txn_delete(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, const uint8_t *key, + size_t key_size); +int tidesdb_txn_single_delete(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, const uint8_t *key, + size_t key_size); +int tidesdb_txn_commit(tidesdb_txn_t *txn); +int tidesdb_txn_rollback(tidesdb_txn_t *txn); +int tidesdb_txn_reset(tidesdb_txn_t *txn, tidesdb_isolation_level_t isolation); +void tidesdb_txn_free(tidesdb_txn_t *txn); + +/**** savepoint operations */ +int tidesdb_txn_savepoint(tidesdb_txn_t *txn, const char *name); +int tidesdb_txn_rollback_to_savepoint(tidesdb_txn_t *txn, const char *name); +int tidesdb_txn_release_savepoint(tidesdb_txn_t *txn, const char *name); + +/**** iterator operations */ +int tidesdb_iter_new(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, tidesdb_iter_t **iter); +int tidesdb_iter_seek(tidesdb_iter_t *iter, const uint8_t *key, size_t key_size); +int tidesdb_iter_seek_for_prev(tidesdb_iter_t *iter, const uint8_t *key, size_t key_size); +int tidesdb_iter_seek_to_first(tidesdb_iter_t *iter); +int tidesdb_iter_seek_to_last(tidesdb_iter_t *iter); +int tidesdb_iter_next(tidesdb_iter_t *iter); +int tidesdb_iter_prev(tidesdb_iter_t *iter); +int tidesdb_iter_valid(tidesdb_iter_t *iter); +int tidesdb_iter_key(tidesdb_iter_t *iter, uint8_t **key, size_t *key_size); +int tidesdb_iter_value(tidesdb_iter_t *iter, uint8_t **value, size_t *value_size); +void tidesdb_iter_free(tidesdb_iter_t *iter); + +/**** comparator functions */ +int tidesdb_comparator_memcmp(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); +int tidesdb_comparator_lexicographic(const uint8_t 
*key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); +int tidesdb_comparator_uint64(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); +int tidesdb_comparator_int64(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); +int tidesdb_comparator_reverse_memcmp(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); +int tidesdb_comparator_case_insensitive(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/**** commit hook operations */ + +/** + * tidesdb_cf_set_commit_hook + * sets or clears the commit hook for a column family at runtime + * pass NULL for fn to disable the hook + * @param cf column family handle + * @param fn commit hook callback (or NULL to disable) + * @param ctx user-provided context passed to the callback + * @return TDB_SUCCESS on success, TDB_ERR_INVALID_ARGS if cf is NULL + */ +int tidesdb_cf_set_commit_hook(tidesdb_column_family_t *cf, tidesdb_commit_hook_fn fn, void *ctx); + +/**** maintenance operations */ +int tidesdb_compact(tidesdb_column_family_t *cf); + +/** + * tidesdb_compact_range + * synchronously compacts every sstable whose key range overlaps [start_key, end_key). + * output is merged toward the largest level affected. NULL endpoints are unbounded. + * both NULL is rejected so callers go through tidesdb_compact for full cf compaction. 
+ * @return TDB_SUCCESS, TDB_ERR_INVALID_ARGS for bad args, TDB_ERR_LOCKED if another + * compaction is running, or other error codes from the underlying merge + */ +int tidesdb_compact_range(tidesdb_column_family_t *cf, const uint8_t *start_key, + size_t start_key_size, const uint8_t *end_key, size_t end_key_size); + +int tidesdb_flush_memtable(tidesdb_column_family_t *cf); + +/** + * tidesdb_is_flushing + * check if a column family has a flush operation in progress + * @param cf column family handle + * @return 1 if flushing, 0 otherwise + */ +int tidesdb_is_flushing(tidesdb_column_family_t *cf); + +/** + * tidesdb_is_compacting + * check if a column family has a compaction operation in progress + * @param cf column family handle + * @return 1 if compacting, 0 otherwise + */ +int tidesdb_is_compacting(tidesdb_column_family_t *cf); +int tidesdb_backup(tidesdb_t *db, char *dir); +int tidesdb_checkpoint(tidesdb_t *db, const char *checkpoint_dir); + +/** + * tidesdb_clone_column_family + * clones an existing column family to a new column family with a different name + * @param db database handle + * @param src_name name of the source column family to clone + * @param dst_name name for the new cloned column family + * @return TDB_SUCCESS, TDB_ERR_NOT_FOUND, TDB_ERR_EXISTS, or other error codes + */ +int tidesdb_clone_column_family(tidesdb_t *db, const char *src_name, const char *dst_name); + +/** + * tidesdb_purge_cf + * forces a full flush of the active memtable and triggers aggressive compaction for a column + * family. waits for all flush and compaction I/O to complete before returning. + * @param cf column family handle + * @return 0 on success, -n on failure + */ +int tidesdb_purge_cf(tidesdb_column_family_t *cf); + +/** + * tidesdb_purge + * forces a full flush and aggressive compaction for all column families. + * waits for all flush and compaction queues to fully drain before returning. 
+ * @param db database handle + * @return 0 on success, first non-zero error code on failure + */ +int tidesdb_purge(tidesdb_t *db); + +/** + * tidesdb_cancel_background_work + * cancels background compaction db-wide (in-flight merges bail safely, queued + * compaction is skipped); flushes are unaffected so durability is preserved. blocks + * (bounded) until compaction is idle. sticky for the session, reset on next open -- + * intended to be called right before tidesdb_close for a fast shutdown. + * @param db database handle + * @return TDB_SUCCESS, or TDB_ERR_INVALID_ARGS if db is NULL + */ +int tidesdb_cancel_background_work(tidesdb_t *db); + +/**** configuration operations */ +int tidesdb_cf_config_load_from_ini(const char *ini_file, const char *section_name, + tidesdb_column_family_config_t *config); +int tidesdb_cf_config_save_to_ini(const char *ini_file, const char *section_name, + const tidesdb_column_family_config_t *config); +int tidesdb_cf_update_runtime_config(tidesdb_column_family_t *cf, + const tidesdb_column_family_config_t *new_config, + int persist_to_disk); + +/**** statistics operations */ +int tidesdb_get_stats(tidesdb_column_family_t *cf, tidesdb_stats_t **stats); +void tidesdb_free_stats(tidesdb_stats_t *stats); +int tidesdb_get_db_stats(tidesdb_t *db, tidesdb_db_stats_t *stats); +int tidesdb_get_cache_stats(tidesdb_t *db, tidesdb_cache_stats_t *stats); + +int tidesdb_range_cost(tidesdb_column_family_t *cf, const uint8_t *key_a, size_t key_a_size, + const uint8_t *key_b, size_t key_b_size, double *cost); + +void tidesdb_free(void *ptr); + +int tidesdb_sync_wal(tidesdb_column_family_t *cf); + +/**** object store connector factories */ + +/** + * tidesdb_objstore_fs_create + * create a filesystem-backed object store connector for testing and local replication + * stores objects as files under root_dir mirroring the key path structure + * @param root_dir directory to store objects in + * @return connector handle or NULL on error + */ 
+tidesdb_objstore_t *tidesdb_objstore_fs_create(const char *root_dir); + +/** + * tidesdb_objstore_s3_create + * create an S3-compatible object store connector (AWS S3, MinIO, etc.). + * the library must have been built with TIDESDB_WITH_S3=ON; otherwise the + * symbol is unresolved at link time. + * @param endpoint S3 endpoint (e.g. "s3.amazonaws.com" or "minio.local:9000") + * @param bucket bucket name + * @param prefix key prefix (e.g. "production/db1/"), may be NULL + * @param access_key AWS access key ID + * @param secret_key AWS secret access key + * @param region AWS region (e.g. "us-east-1"), NULL for MinIO + * @param use_ssl 1 for HTTPS, 0 for HTTP + * @param use_path_style 1 for path-style URLs (MinIO), 0 for virtual-hosted (AWS) + * @return connector handle, or NULL on error + */ +tidesdb_objstore_t *tidesdb_objstore_s3_create(const char *endpoint, const char *bucket, + const char *prefix, const char *access_key, + const char *secret_key, const char *region, + int use_ssl, int use_path_style); + +/** + * tidesdb_objstore_s3_config_t + * full S3 connector configuration, including TLS and multipart tuning the positional + * tidesdb_objstore_s3_create cannot express. zero-initialize and set what you need; the + * all-zero defaults are secure (TLS verify on, no custom CA) and use the built-in multipart + * sizes. 
+ * @param endpoint S3 endpoint (required) + * @param bucket bucket name (required) + * @param prefix key prefix, or NULL + * @param access_key AWS access key ID (required) + * @param secret_key AWS secret access key (required) + * @param region AWS region, or NULL for the default + * @param use_ssl 1 for HTTPS, 0 for HTTP + * @param use_path_style 1 for path-style URLs (MinIO), 0 for virtual-hosted (AWS) + * @param tls_ca_path custom CA bundle file path, or NULL for the system bundle + * @param tls_insecure_skip_verify 1 disables TLS peer+host verification (test only, insecure); + * 0 keeps verification on (default) + * @param multipart_threshold object size at/above which multipart upload is used; 0 = default + * @param multipart_part_size multipart chunk size in bytes; 0 = default + */ +typedef struct +{ + const char *endpoint; + const char *bucket; + const char *prefix; + const char *access_key; + const char *secret_key; + const char *region; + int use_ssl; + int use_path_style; + const char *tls_ca_path; + int tls_insecure_skip_verify; + size_t multipart_threshold; + size_t multipart_part_size; +} tidesdb_objstore_s3_config_t; + +/** + * tidesdb_objstore_s3_create_config + * create an S3-compatible connector from a full configuration struct (TLS + multipart). + * tidesdb_objstore_s3_create is a thin wrapper over this with secure/default settings. 
+ * @param config connector configuration (fields are copied; need not outlive the call) + * @return connector handle, or NULL on error + */ +tidesdb_objstore_t *tidesdb_objstore_s3_create_config(const tidesdb_objstore_s3_config_t *config); + +/** + * tidesdb_promote_to_primary + * switch a read-only replica to primary mode + * @param db database handle in replica mode + * @return TDB_SUCCESS on success, TDB_ERR_INVALID_ARGS if not a replica + */ +int tidesdb_promote_to_primary(tidesdb_t *db); + +int tidesdb_iter_key_value(tidesdb_iter_t *iter, uint8_t **key, size_t *key_size, uint8_t **value, + size_t *value_size); + +#endif /* __TIDESDB_DB_H__ */ diff --git a/storage/tidesdb/libtidesdb/src/local_cache.c b/storage/tidesdb/libtidesdb/src/local_cache.c new file mode 100644 index 0000000000000..b4dacbbff5497 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/local_cache.c @@ -0,0 +1,371 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "local_cache.h" + +#include +#include + +#include "xxhash.h" + +#define TDB_LOCAL_CACHE_KLOG_EXT ".klog" +#define TDB_LOCAL_CACHE_VLOG_EXT ".vlog" +/* both partner extensions must be the same length for the swap-trick in + * cache_evict_partner to produce a valid path */ +#define TDB_LOCAL_CACHE_EXT_LEN (sizeof(TDB_LOCAL_CACHE_KLOG_EXT) - 1) + +/** + * cache_hash + * XXH32 hash of a file path for bucket lookup + * @param path file path to hash + * @return hash value + */ +static uint32_t cache_hash(const char *path) +{ + return XXH32(path, strlen(path), 0); +} + +/** + * cache_bucket + * return the bucket index for a hash value + * @param h hash value + * @return bucket index + */ +static inline uint32_t cache_bucket(uint32_t h) +{ + return h & (TDB_LOCAL_CACHE_HASH_BUCKETS - 1); +} + +int tdb_local_cache_init(tdb_local_cache_t *cache, const char *cache_dir, size_t max_bytes) +{ + if (!cache || !cache_dir) return -1; + + memset(cache, 0, sizeof(*cache)); + snprintf(cache->cache_dir, sizeof(cache->cache_dir), "%s", cache_dir); + cache->max_bytes = max_bytes; + atomic_init(&cache->current_bytes, 0); + pthread_mutex_init(&cache->lock, NULL); + cache->lru_head = NULL; + cache->lru_tail = NULL; + atomic_init(&cache->num_entries, 0); + memset(cache->buckets, 0, sizeof(cache->buckets)); + + return 0; +} + +void tdb_local_cache_destroy(tdb_local_cache_t *cache) +{ + if (!cache) return; + + pthread_mutex_lock(&cache->lock); + + tdb_cache_entry_t *cur = cache->lru_head; + while (cur) + { + tdb_cache_entry_t *next = cur->next; + free(cur); + cur = next; + } + cache->lru_head = NULL; + cache->lru_tail = NULL; + atomic_store(&cache->num_entries, 0); + atomic_store(&cache->current_bytes, 0); + memset(cache->buckets, 0, sizeof(cache->buckets)); + + pthread_mutex_unlock(&cache->lock); + pthread_mutex_destroy(&cache->lock); +} + +/** + * lru_unlink + * unlink an entry from the doubly-linked LRU list + * @param cache the cache manager + * @param entry entry to unlink 
(must be in the list) + * caller must hold cache->lock + */ +static void lru_unlink(tdb_local_cache_t *cache, tdb_cache_entry_t *entry) +{ + if (entry->prev) + entry->prev->next = entry->next; + else + cache->lru_head = entry->next; + + if (entry->next) + entry->next->prev = entry->prev; + else + cache->lru_tail = entry->prev; + + entry->prev = NULL; + entry->next = NULL; +} + +/** + * lru_push_head + * insert an entry at the head (most recently used) of the LRU list + * @param cache the cache manager + * @param entry entry to insert + * caller must hold cache->lock + */ +static void lru_push_head(tdb_local_cache_t *cache, tdb_cache_entry_t *entry) +{ + entry->prev = NULL; + entry->next = cache->lru_head; + if (cache->lru_head) + cache->lru_head->prev = entry; + else + cache->lru_tail = entry; + cache->lru_head = entry; +} + +/** + * hash_insert + * insert an entry into the hash table + * @param cache the cache manager + * @param entry entry to insert + * caller must hold cache->lock + */ +static void hash_insert(tdb_local_cache_t *cache, tdb_cache_entry_t *entry) +{ + uint32_t idx = cache_bucket(entry->hash); + entry->hash_next = cache->buckets[idx]; + cache->buckets[idx] = entry; +} + +/** + * hash_remove + * remove an entry from the hash table + * @param cache the cache manager + * @param entry entry to remove + * caller must hold cache->lock + */ +static void hash_remove(tdb_local_cache_t *cache, tdb_cache_entry_t *entry) +{ + uint32_t idx = cache_bucket(entry->hash); + tdb_cache_entry_t **pp = &cache->buckets[idx]; + while (*pp) + { + if (*pp == entry) + { + *pp = entry->hash_next; + entry->hash_next = NULL; + return; + } + pp = &(*pp)->hash_next; + } +} + +/** + * hash_find + * find an entry by file path in the hash table (O(1) average) + * @param cache the cache manager + * @param path file path to search for + * @param h precomputed hash of path + * @return the entry if found, NULL otherwise + * caller must hold cache->lock + */ +static tdb_cache_entry_t 
*hash_find(tdb_local_cache_t *cache, const char *path, uint32_t h) +{ + uint32_t idx = cache_bucket(h); + tdb_cache_entry_t *cur = cache->buckets[idx]; + while (cur) + { + if (cur->hash == h && strcmp(cur->path, path) == 0) return cur; + cur = cur->hash_next; + } + return NULL; +} + +/** + * cache_remove_entry + * fully remove an entry from both hash table and LRU list, update accounting, + * and optionally delete the file from disk + * @param cache the cache manager + * @param entry entry to remove + * @param current pointer to running byte counter + * @param delete_file 1 to unlink file from disk, 0 to just untrack + * caller must hold cache->lock + */ +static void cache_remove_entry(tdb_local_cache_t *cache, tdb_cache_entry_t *entry, size_t *current, + int delete_file) +{ + lru_unlink(cache, entry); + hash_remove(cache, entry); + + if (delete_file) + { + /* tdb_unlink clears the Windows read-only attribute that can otherwise block + * deletion. surface a failure, a swallowed unlink error leaks the file on disk + * while the byte counter below is decremented as if reclaimed. this leaf module has + * no db log, so stderr is the available channel. the counter is still decremented + * because the entry is being untracked regardless -- the leak is an OS-level issue + * the operator must clear, not a tracker-accounting one. 
*/ + if (tdb_unlink(entry->path) != 0) + { + fprintf(stderr, "tidesdb local_cache: failed to unlink %s; file leaked on disk\n", + entry->path); + } + } + + *current -= entry->size; + atomic_store_explicit(&cache->current_bytes, *current, memory_order_relaxed); + atomic_fetch_sub_explicit(&cache->num_entries, 1, memory_order_relaxed); +} + +/** + * cache_evict_partner + * if the victim is a TDB_LOCAL_CACHE_KLOG_EXT or TDB_LOCAL_CACHE_VLOG_EXT file, find and evict its + * partner so sstable file pairs are always evicted together + * @param cache the cache manager + * @param victim the entry being evicted + * @param current pointer to the running byte counter + * caller must hold cache->lock + */ +static void cache_evict_partner(tdb_local_cache_t *cache, const tdb_cache_entry_t *victim, + size_t *current) +{ + size_t vlen = strlen(victim->path); + if (vlen < TDB_LOCAL_CACHE_EXT_LEN) return; + + const char *ext = victim->path + vlen - TDB_LOCAL_CACHE_EXT_LEN; + const char *partner_ext = NULL; + + if (strcmp(ext, TDB_LOCAL_CACHE_KLOG_EXT) == 0) + partner_ext = TDB_LOCAL_CACHE_VLOG_EXT; + else if (strcmp(ext, TDB_LOCAL_CACHE_VLOG_EXT) == 0) + partner_ext = TDB_LOCAL_CACHE_KLOG_EXT; + + if (!partner_ext) return; + + char partner_path[TDB_LOCAL_CACHE_MAX_PATH]; + memcpy(partner_path, victim->path, vlen - TDB_LOCAL_CACHE_EXT_LEN); + memcpy(partner_path + vlen - TDB_LOCAL_CACHE_EXT_LEN, partner_ext, TDB_LOCAL_CACHE_EXT_LEN); + partner_path[vlen] = '\0'; + + uint32_t ph = cache_hash(partner_path); + tdb_cache_entry_t *partner = hash_find(cache, partner_path, ph); + if (!partner) return; + + cache_remove_entry(cache, partner, current, 1); + free(partner); +} + +/** + * cache_evict + * evict LRU entries (from tail) until enough space is available + * @param cache the cache manager + * @param bytes_needed number of bytes needed for the new entry + * caller must hold cache->lock + */ +static void cache_evict(tdb_local_cache_t *cache, size_t bytes_needed) +{ + if (cache->max_bytes 
== 0) return; /* unlimited */ + + size_t current = atomic_load_explicit(&cache->current_bytes, memory_order_relaxed); + while (current + bytes_needed > cache->max_bytes && cache->lru_tail) + { + tdb_cache_entry_t *victim = cache->lru_tail; + cache_remove_entry(cache, victim, ¤t, 1); + + /* we evict the klog/vlog partner so sstable pairs stay together */ + cache_evict_partner(cache, victim, ¤t); + + free(victim); + } +} + +int tdb_local_cache_track(tdb_local_cache_t *cache, const char *local_path) +{ + if (!cache || !local_path) return -1; + + struct stat st; + if (stat(local_path, &st) != 0) return -1; + + size_t file_size = (size_t)st.st_size; + uint32_t h = cache_hash(local_path); + + pthread_mutex_lock(&cache->lock); + + /* we check if already tracked via hash lookup (O(1)) */ + tdb_cache_entry_t *existing = hash_find(cache, local_path, h); + if (existing) + { + /* we move to head (touch) */ + lru_unlink(cache, existing); + lru_push_head(cache, existing); + pthread_mutex_unlock(&cache->lock); + return 0; + } + + /* we evict if needed */ + cache_evict(cache, file_size); + + tdb_cache_entry_t *entry = calloc(1, sizeof(tdb_cache_entry_t)); + if (!entry) + { + pthread_mutex_unlock(&cache->lock); + return -1; + } + + snprintf(entry->path, sizeof(entry->path), "%s", local_path); + entry->size = file_size; + entry->hash = h; + lru_push_head(cache, entry); + hash_insert(cache, entry); + atomic_fetch_add_explicit(&cache->num_entries, 1, memory_order_relaxed); + atomic_fetch_add_explicit(&cache->current_bytes, file_size, memory_order_relaxed); + + pthread_mutex_unlock(&cache->lock); + return 0; +} + +void tdb_local_cache_touch(tdb_local_cache_t *cache, const char *local_path) +{ + if (!cache || !local_path) return; + + uint32_t h = cache_hash(local_path); + + pthread_mutex_lock(&cache->lock); + + tdb_cache_entry_t *entry = hash_find(cache, local_path, h); + if (entry) + { + lru_unlink(cache, entry); + lru_push_head(cache, entry); + } + + 
pthread_mutex_unlock(&cache->lock); +} + +void tdb_local_cache_remove(tdb_local_cache_t *cache, const char *local_path) +{ + if (!cache || !local_path) return; + + uint32_t h = cache_hash(local_path); + + pthread_mutex_lock(&cache->lock); + + tdb_cache_entry_t *entry = hash_find(cache, local_path, h); + if (entry) + { + size_t current = atomic_load_explicit(&cache->current_bytes, memory_order_relaxed); + cache_remove_entry(cache, entry, ¤t, 0); + free(entry); + } + + pthread_mutex_unlock(&cache->lock); +} diff --git a/storage/tidesdb/libtidesdb/src/local_cache.h b/storage/tidesdb/libtidesdb/src/local_cache.h new file mode 100644 index 0000000000000..8b2fcbc2bad00 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/local_cache.h @@ -0,0 +1,119 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __LOCAL_CACHE_H__ +#define __LOCAL_CACHE_H__ + +#include "compat.h" + +#define TDB_LOCAL_CACHE_MAX_PATH 4096 +#define TDB_LOCAL_CACHE_HASH_BUCKETS 256 /* power of 2 for bitmask lookup */ + +/** + * tdb_cache_entry_t + * doubly-linked LRU list node tracking a cached file, also chained in a hash bucket + * @param path file path of the cached file + * @param size size of the cached file in bytes + * @param prev pointer to the previous entry in the LRU list + * @param next pointer to the next entry in the LRU list + * @param hash_next pointer to the next entry in the same hash bucket + * @param hash value of the path hash (cached to avoid recomputation on remove) + */ +typedef struct tdb_cache_entry +{ + char path[TDB_LOCAL_CACHE_MAX_PATH]; + size_t size; + struct tdb_cache_entry *prev; + struct tdb_cache_entry *next; + struct tdb_cache_entry *hash_next; + uint32_t hash; +} tdb_cache_entry_t; + +/** + * tdb_local_cache_t + * local file cache manager with hash-indexed LRU eviction for object store mode. + * tracks which sstable files are cached locally and evicts cold files + * when the cache exceeds max_bytes. uses a hash table for O(1) lookups + * and a doubly-linked LRU list for eviction ordering. 
+ * @param cache_dir directory path for cached files + * @param max_bytes maximum cache size in bytes (0 = unlimited) + * @param current_bytes atomic counter of current cache size in bytes + * @param lock mutex protecting the LRU list and hash table + * @param lru_head pointer to the most recently used entry + * @param lru_tail pointer to the least recently used entry (eviction candidate) + * @param num_entries atomic counter of entries currently in the cache + * @param buckets hash table buckets for O(1) path lookups + */ +typedef struct +{ + char cache_dir[TDB_LOCAL_CACHE_MAX_PATH]; + size_t max_bytes; /* 0 = unlimited */ + _Atomic(size_t) current_bytes; + pthread_mutex_t lock; + tdb_cache_entry_t *lru_head; + tdb_cache_entry_t *lru_tail; + _Atomic(int) num_entries; + tdb_cache_entry_t *buckets[TDB_LOCAL_CACHE_HASH_BUCKETS]; +} tdb_local_cache_t; + +/** + * tdb_local_cache_init + * initialize the local file cache manager + * @param cache cache struct to initialize + * @param cache_dir local directory for cached files + * @param max_bytes maximum cache size in bytes (0 = unlimited) + * @return 0 on success, -1 on error + */ +int tdb_local_cache_init(tdb_local_cache_t *cache, const char *cache_dir, size_t max_bytes); + +/** + * tdb_local_cache_destroy + * free all tracking entries and destroy mutex. + * does not delete cached files from disk (they persist for next startup). + * @param cache cache to destroy + */ +void tdb_local_cache_destroy(tdb_local_cache_t *cache); + +/** + * tdb_local_cache_track + * register a file in the cache. stats the file for size, adds to LRU head, + * and triggers eviction if the cache is over its size limit. + * @param cache cache manager + * @param local_path path to the cached file + * @return 0 on success, -1 on error + */ +int tdb_local_cache_track(tdb_local_cache_t *cache, const char *local_path); + +/** + * tdb_local_cache_touch + * move an existing cached file to the head of the LRU list (mark as recently used). 
+ * no-op if the file is not tracked. + * @param cache cache manager + * @param local_path path to the cached file + */ +void tdb_local_cache_touch(tdb_local_cache_t *cache, const char *local_path); + +/** + * tdb_local_cache_remove + * remove a file from cache tracking. does not delete the file from disk. + * @param cache cache manager + * @param local_path path to remove + */ +void tdb_local_cache_remove(tdb_local_cache_t *cache, const char *local_path); + +#endif /* __LOCAL_CACHE_H__ */ diff --git a/storage/tidesdb/libtidesdb/src/manifest.c b/storage/tidesdb/libtidesdb/src/manifest.c new file mode 100644 index 0000000000000..8025bbdeedcad --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/manifest.c @@ -0,0 +1,498 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "manifest.h" + +#include +#include +#include +#include +#include + +#define MANIFEST_TMP_EXT ".tmp." 
+#define MANIFEST_TMP_EXT_LEN (sizeof(MANIFEST_TMP_EXT) - 1) + +/** + * tidesdb_manifest_add_sstable_unlocked + * adds an sstable to the manifest + * @param manifest manifest to add sstable to + * @param level level of sstable + * @param id id of sstable + * @param num_entries number of entries in sstable + * @param size_bytes size of sstable in bytes + * @return 0 on success, -1 on error + */ +static int tidesdb_manifest_add_sstable_unlocked(tidesdb_manifest_t *manifest, int level, + uint64_t id, uint64_t num_entries, + uint64_t size_bytes); + +tidesdb_manifest_t *tidesdb_manifest_open(const char *path) +{ + if (!path) return NULL; + + tidesdb_manifest_t *manifest = malloc(sizeof(tidesdb_manifest_t)); + if (!manifest) return NULL; + + manifest->entries = malloc(sizeof(tidesdb_manifest_entry_t) * MANIFEST_INITIAL_CAPACITY); + if (!manifest->entries) + { + free(manifest); + return NULL; + } + + manifest->num_entries = 0; + manifest->capacity = MANIFEST_INITIAL_CAPACITY; + atomic_init(&manifest->sequence, 0); + manifest->fp = NULL; + atomic_init(&manifest->active_ops, 0); + strncpy(manifest->path, path, MANIFEST_PATH_LEN - 1); + manifest->path[MANIFEST_PATH_LEN - 1] = '\0'; + + if (pthread_rwlock_init(&manifest->lock, NULL) != 0) + { + free(manifest->entries); + free(manifest); + return NULL; + } + + /* we clean up orphaned temp files from incomplete commits + * temp files are named -- MANIFEST_TMP_EXT. + * if main manifest exists, temp files are stale and can be removed */ + char dir_path[MANIFEST_PATH_LEN]; + const char *last_sep = strrchr(path, PATH_SEPARATOR[0]); + if (last_sep) + { + const size_t dir_len = last_sep - path; + if (dir_len < sizeof(dir_path)) + { + memcpy(dir_path, path, dir_len); + dir_path[dir_len] = '\0'; + } + else + { + strcpy(dir_path, "."); + } + } + else + { + strcpy(dir_path, "."); + } + + /* base filename for pattern matching */ + const char *base_name = last_sep ? 
last_sep + 1 : path; + const size_t base_len = strlen(base_name); + + /* we scan directory looking for orphaned temp files */ + DIR *dir = opendir(dir_path); + if (dir) + { + const size_t dir_path_len = strlen(dir_path); + const size_t sep_len = strlen(PATH_SEPARATOR); + struct dirent *entry; + while ((entry = readdir(dir)) != NULL) + { + /* we check if filename matches pattern -- MANIFEST_TMP_EXT* */ + const size_t entry_len = strlen(entry->d_name); + if (entry_len > base_len + MANIFEST_TMP_EXT_LEN && + strncmp(entry->d_name, base_name, base_len) == 0 && + strncmp(entry->d_name + base_len, MANIFEST_TMP_EXT, MANIFEST_TMP_EXT_LEN) == 0) + { + /* found orphaned temp file, we remove it */ + char temp_full_path[MANIFEST_PATH_LEN]; + /* we check if combined path fits in buffer (dir + separator + entry + null) */ + if (dir_path_len + sep_len + entry_len + 1 <= MANIFEST_PATH_LEN) + { + size_t offset = 0; + memcpy(temp_full_path + offset, dir_path, dir_path_len); + offset += dir_path_len; + memcpy(temp_full_path + offset, PATH_SEPARATOR, sep_len); + offset += sep_len; + memcpy(temp_full_path + offset, entry->d_name, entry_len); + offset += entry_len; + temp_full_path[offset] = '\0'; + remove(temp_full_path); + } + } + } + closedir(dir); + } + + FILE *fp = tdb_fopen(path, "r"); + if (!fp) + { + /* the file doesnt exist, return empty manifest */ + if (errno == ENOENT) return manifest; + /* other error */ + pthread_rwlock_destroy(&manifest->lock); + free(manifest->entries); + free(manifest); + return NULL; + } + + char line[MANIFEST_MAX_LINE_LEN]; + + if (fgets(line, sizeof(line), fp)) + { + char *endptr; + const long version = strtol(line, &endptr, 10); + if (endptr == line || version != MANIFEST_VERSION) + { + fclose(fp); + pthread_rwlock_destroy(&manifest->lock); + free(manifest->entries); + free(manifest); + return NULL; + } + } + else + { + /* empty file, keep it open */ + manifest->fp = fp; + return manifest; + } + + if (fgets(line, sizeof(line), fp)) + { + char 
*seq_endptr; + const unsigned long long seq = strtoull(line, &seq_endptr, 10); + /* the sequence line must be a number terminated by end-of-line. reject junk + * (e.g. "123abc") rather than silently truncating it -- an under-parsed + * sequence under-seeds next_sstable_id on recovery and risks id collisions */ + if (seq_endptr == line || + (*seq_endptr != '\0' && *seq_endptr != '\n' && *seq_endptr != '\r')) + { + fclose(fp); + pthread_rwlock_destroy(&manifest->lock); + free(manifest->entries); + free(manifest); + return NULL; + } + atomic_store(&manifest->sequence, seq); + } + + int skipped_lines = 0; + while (fgets(line, sizeof(line), fp)) + { + const char *ptr = line; + char *endptr; + + /* parse level */ + const long level_val = strtol(ptr, &endptr, 10); + if (endptr == ptr || *endptr != ',') + { + skipped_lines++; + continue; + } + const int level = (int)level_val; + ptr = endptr + 1; + + /* parse id */ + const uint64_t id = strtoull(ptr, &endptr, 10); + if (endptr == ptr || *endptr != ',') + { + skipped_lines++; + continue; + } + ptr = endptr + 1; + + /* parse num_entries */ + const uint64_t num_entries = strtoull(ptr, &endptr, 10); + if (endptr == ptr || *endptr != ',') + { + skipped_lines++; + continue; + } + ptr = endptr + 1; + + /* parse size_bytes */ + const uint64_t size_bytes = strtoull(ptr, &endptr, 10); + if (endptr == ptr) + { + skipped_lines++; + continue; + } + + tidesdb_manifest_add_sstable_unlocked(manifest, level, id, num_entries, size_bytes); + } + + /* surface silent data loss, malformed entry lines were dropped. this leaf module has + * no access to the db log, so a single stderr line is the best signal available. */ + if (skipped_lines > 0) + { + fprintf(stderr, "tidesdb manifest: skipped %d malformed entry line(s) while loading %s\n", + skipped_lines, manifest->path[0] ? 
manifest->path : "(unknown)"); + } + + /* we keep file open for future use */ + manifest->fp = fp; + + return manifest; +} + +/** + * tidesdb_manifest_add_sstable_unlocked + * adds an sstable to the manifest + * @param manifest manifest to add sstable to + * @param level level of sstable + * @param id id of sstable + * @param num_entries number of entries in sstable + * @param size_bytes size of sstable in bytes + * @return 0 on success, -1 on error + */ +static int tidesdb_manifest_add_sstable_unlocked(tidesdb_manifest_t *manifest, const int level, + const uint64_t id, const uint64_t num_entries, + const uint64_t size_bytes) +{ + for (int i = 0; i < manifest->num_entries; i++) + { + if (manifest->entries[i].level == level && manifest->entries[i].id == id) + { + manifest->entries[i].num_entries = num_entries; + manifest->entries[i].size_bytes = size_bytes; + return 0; + } + } + + if (manifest->num_entries >= manifest->capacity) + { + const int new_capacity = manifest->capacity * 2; + tidesdb_manifest_entry_t *new_entries = + realloc(manifest->entries, sizeof(tidesdb_manifest_entry_t) * new_capacity); + if (!new_entries) + { + return -1; + } + + manifest->entries = new_entries; + manifest->capacity = new_capacity; + } + + manifest->entries[manifest->num_entries].level = level; + manifest->entries[manifest->num_entries].id = id; + manifest->entries[manifest->num_entries].num_entries = num_entries; + manifest->entries[manifest->num_entries].size_bytes = size_bytes; + manifest->num_entries++; + + return 0; +} + +int tidesdb_manifest_add_sstable(tidesdb_manifest_t *manifest, const int level, const uint64_t id, + const uint64_t num_entries, const uint64_t size_bytes) +{ + if (!manifest) return -1; + + atomic_fetch_add(&manifest->active_ops, 1); + pthread_rwlock_wrlock(&manifest->lock); + const int result = + tidesdb_manifest_add_sstable_unlocked(manifest, level, id, num_entries, size_bytes); + pthread_rwlock_unlock(&manifest->lock); + 
atomic_fetch_sub(&manifest->active_ops, 1); + return result; +} + +int tidesdb_manifest_remove_sstable(tidesdb_manifest_t *manifest, const int level, + const uint64_t id) +{ + if (!manifest) return -1; + + atomic_fetch_add(&manifest->active_ops, 1); + pthread_rwlock_wrlock(&manifest->lock); + + for (int i = 0; i < manifest->num_entries; i++) + { + if (manifest->entries[i].level == level && manifest->entries[i].id == id) + { + /* we swap with last element for O(1) removal (order not required) */ + manifest->entries[i] = manifest->entries[manifest->num_entries - 1]; + manifest->num_entries--; + pthread_rwlock_unlock(&manifest->lock); + atomic_fetch_sub(&manifest->active_ops, 1); + return 0; + } + } + + pthread_rwlock_unlock(&manifest->lock); + atomic_fetch_sub(&manifest->active_ops, 1); + return -1; +} + +int tidesdb_manifest_has_sstable(tidesdb_manifest_t *manifest, const int level, const uint64_t id) +{ + if (!manifest) return 0; + + atomic_fetch_add(&manifest->active_ops, 1); + pthread_rwlock_rdlock(&manifest->lock); + + for (int i = 0; i < manifest->num_entries; i++) + { + if (manifest->entries[i].level == level && manifest->entries[i].id == id) + { + pthread_rwlock_unlock(&manifest->lock); + atomic_fetch_sub(&manifest->active_ops, 1); + return 1; + } + } + + pthread_rwlock_unlock(&manifest->lock); + atomic_fetch_sub(&manifest->active_ops, 1); + return 0; +} + +void tidesdb_manifest_update_sequence(tidesdb_manifest_t *manifest, uint64_t sequence) +{ + if (!manifest) return; + + /* monotonic guard, the sequence seeds next_sstable_id on recovery, so it must never + * regress or recovery would re-hand-out live sstable ids and collide. cas loop so a + * concurrent larger store is never clobbered by a smaller one. 
*/ + uint64_t cur = atomic_load(&manifest->sequence); + while (sequence > cur && !atomic_compare_exchange_weak(&manifest->sequence, &cur, sequence)) + { + /* cur reloaded with the live value on failure; loop re-checks sequence > cur */ + } +} + +int tidesdb_manifest_commit(tidesdb_manifest_t *manifest, const char *path) +{ + if (!manifest || !path) return -1; + + atomic_fetch_add(&manifest->active_ops, 1); + pthread_rwlock_wrlock(&manifest->lock); + + /* we update stored path if it changed */ + if (strcmp(manifest->path, path) != 0) + { + strncpy(manifest->path, path, MANIFEST_PATH_LEN - 1); + manifest->path[MANIFEST_PATH_LEN - 1] = '\0'; + } + + if (manifest->fp) + { + fclose(manifest->fp); + manifest->fp = NULL; + } + + char temp_path[MANIFEST_PATH_LEN]; + snprintf(temp_path, sizeof(temp_path), "%s" MANIFEST_TMP_EXT "%lu.%d", path, + (unsigned long)TDB_THREAD_ID(), TDB_GETPID()); + + FILE *fp = tdb_fopen(temp_path, "w"); + if (!fp) + { + pthread_rwlock_unlock(&manifest->lock); + atomic_fetch_sub(&manifest->active_ops, 1); + return -1; + } + + fprintf(fp, "%d\n", MANIFEST_VERSION); + fprintf(fp, "%" PRIu64 "\n", atomic_load(&manifest->sequence)); + + for (int i = 0; i < manifest->num_entries; i++) + { + fprintf(fp, "%d,%" PRIu64 ",%" PRIu64 ",%" PRIu64 "\n", manifest->entries[i].level, + manifest->entries[i].id, manifest->entries[i].num_entries, + manifest->entries[i].size_bytes); + } + + if (fflush(fp) != 0) + { + fclose(fp); + remove(temp_path); + pthread_rwlock_unlock(&manifest->lock); + atomic_fetch_sub(&manifest->active_ops, 1); + return -1; + } + + const int fd = tdb_fileno(fp); + if (fd >= 0) + { + if (tdb_fsync(fd) != 0) + { + fclose(fp); + remove(temp_path); + pthread_rwlock_unlock(&manifest->lock); + atomic_fetch_sub(&manifest->active_ops, 1); + return -1; + } + } + + fclose(fp); + + /* atomic rename -- this is the commit point */ + if (atomic_rename_file(temp_path, path) != 0) + { + remove(temp_path); + pthread_rwlock_unlock(&manifest->lock); + 
atomic_fetch_sub(&manifest->active_ops, 1); + return -1; + } + + /* we sync the parent directory to ensure the rename is durable. + * without this, a crash after rename could lose the directory entry + * on POSIX systems that don't flush directory metadata automatically. */ + { + /* sized to the full manifest path length -- a 1024-byte buffer silently truncated + * paths > 1023 chars and synced the wrong directory */ + char dir_buf[MANIFEST_PATH_LEN]; + strncpy(dir_buf, path, sizeof(dir_buf) - 1); + dir_buf[sizeof(dir_buf) - 1] = '\0'; + char *last_sep = strrchr(dir_buf, '/'); +#ifdef _WIN32 + if (!last_sep) last_sep = strrchr(dir_buf, '\\'); +#endif + if (last_sep) + { + *last_sep = '\0'; + tdb_sync_directory(dir_buf); + } + } + + /* we reopen for reading */ + manifest->fp = tdb_fopen(path, "r"); + + pthread_rwlock_unlock(&manifest->lock); + atomic_fetch_sub(&manifest->active_ops, 1); + return 0; +} + +void tidesdb_manifest_close(tidesdb_manifest_t *manifest) +{ + if (!manifest) return; + + /* wait for all active operations to complete before destroying */ + int wait_count = 0; + while (atomic_load(&manifest->active_ops) > 0 && wait_count < MANIFEST_CLOSE_MAX_WAITS) + { + usleep(MANIFEST_CLOSE_WAIT_US); + wait_count++; + } + + pthread_rwlock_wrlock(&manifest->lock); + + if (manifest->fp) + { + fclose(manifest->fp); + manifest->fp = NULL; + } + + pthread_rwlock_unlock(&manifest->lock); + pthread_rwlock_destroy(&manifest->lock); + free(manifest->entries); + free(manifest); +} \ No newline at end of file diff --git a/storage/tidesdb/libtidesdb/src/manifest.h b/storage/tidesdb/libtidesdb/src/manifest.h new file mode 100644 index 0000000000000..63c195fcd64d9 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/manifest.h @@ -0,0 +1,138 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __MANIFEST_H__ +#define __MANIFEST_H__ + +#define MANIFEST_INITIAL_CAPACITY 64 +#define MANIFEST_VERSION 7 +#define MANIFEST_PATH_LEN 4096 +#define MANIFEST_MAX_LINE_LEN 256 +/* microseconds to wait between checks */ +#define MANIFEST_CLOSE_WAIT_US 100 +/* max iterations (10000 × 100μs = 1 second) */ +#define MANIFEST_CLOSE_MAX_WAITS 10000 + +#include "compat.h" + +/** + * tidesdb_manifest_entry_t + * represents a single sstable entry in the manifest + * @param level level number (1-based) + * @param id sstable ID + * @param num_entries number of entries in sstable + * @param size_bytes total size in bytes + */ +typedef struct +{ + int level; + uint64_t id; + uint64_t num_entries; + uint64_t size_bytes; +} tidesdb_manifest_entry_t; + +/** + * tidesdb_manifest_t + * in-memory representation of manifest file + * @param entries array of sstable entries + * @param num_entries number of entries + * @param capacity capacity of entries array + * @param sequence current global sequence number + * @param path path to manifest file + * @param fp file pointer (kept open for efficient commits) + * @param lock reader-writer lock for thread safety + * @param active_ops count of active operations (for safe shutdown) + */ +typedef struct +{ + tidesdb_manifest_entry_t *entries; + int num_entries; + int capacity; + _Atomic(uint64_t) sequence; + char path[MANIFEST_PATH_LEN]; + FILE *fp; + pthread_rwlock_t lock; + _Atomic(int) active_ops; +} tidesdb_manifest_t; + +/** + * tidesdb_manifest_open + * opens manifest from file, creating new if it doesn't exist + 
* @param path path to manifest file + * @return opened manifest or NULL on error + */ +tidesdb_manifest_t *tidesdb_manifest_open(const char *path); + +/** + * tidesdb_manifest_add_sstable + * adds an sstable entry to the manifest + * @param manifest manifest to modify + * @param level level number + * @param id sstable ID + * @param num_entries number of entries + * @param size_bytes size in bytes + * @return 0 on success, -1 on error + */ +int tidesdb_manifest_add_sstable(tidesdb_manifest_t *manifest, int level, uint64_t id, + uint64_t num_entries, uint64_t size_bytes); + +/** + * tidesdb_manifest_remove_sstable + * removes an sstable entry from the manifest + * @param manifest manifest to modify + * @param level level number + * @param id sstable ID + * @return 0 on success, -1 on error + */ +int tidesdb_manifest_remove_sstable(tidesdb_manifest_t *manifest, int level, uint64_t id); + +/** + * tidesdb_manifest_has_sstable + * checks if manifest contains an sstable + * @param manifest manifest to check + * @param level level number + * @param id sstable ID + * @return 1 if exists, 0 if not + */ +int tidesdb_manifest_has_sstable(tidesdb_manifest_t *manifest, int level, uint64_t id); + +/** + * tidesdb_manifest_update_sequence + * updates the global sequence number + * @param manifest manifest to modify + * @param sequence new sequence number + */ +void tidesdb_manifest_update_sequence(tidesdb_manifest_t *manifest, uint64_t sequence); + +/** + * tidesdb_manifest_commit + * updates manifest on disk + * @param manifest manifest to write + * @param path path to manifest file + * @return 0 on success, -1 on error + */ +int tidesdb_manifest_commit(tidesdb_manifest_t *manifest, const char *path); + +/** + * tidesdb_manifest_close + * closes manifest and frees memory + * @param manifest manifest to close + */ +void tidesdb_manifest_close(tidesdb_manifest_t *manifest); + +#endif /* __MANIFEST_H__ */ \ No newline at end of file diff --git 
a/storage/tidesdb/libtidesdb/src/objstore.h b/storage/tidesdb/libtidesdb/src/objstore.h new file mode 100644 index 0000000000000..b21bf058a2b67 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/objstore.h @@ -0,0 +1,212 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __OBJSTORE_H__ +#define __OBJSTORE_H__ + +#include "compat.h" + +/** + * tidesdb_objstore_backend_t + * identifies the object store backend in use. + * prevents misuse by restricting to known, supported backends. + */ +typedef enum +{ + TDB_BACKEND_FS = 0, /* filesystem connector (local/NFS, always available) */ + TDB_BACKEND_S3 = 1, /* S3-compatible (AWS S3, MinIO, requires TIDESDB_WITH_S3) */ + TDB_BACKEND_UNKNOWN = 99 +} tidesdb_objstore_backend_t; + +/** + * tidesdb_objstore_backend_name + * return a human-readable string for a backend enum value + * @param backend backend enum value + * @return static string (e.g. "fs", "s3", "unknown") + */ +static inline const char *tidesdb_objstore_backend_name(tidesdb_objstore_backend_t backend) +{ + switch (backend) + { + case TDB_BACKEND_FS: + return "fs"; + case TDB_BACKEND_S3: + return "s3"; + default: + return "unknown"; + } +} + +/** + * tidesdb_objstore_t + * pluggable object store connector interface. + * each function receives the opaque ctx pointer set at registration. + * object keys are path-like strings (e.g. "cf_name/L1_100.klog"). 
+ * connectors must be thread-safe -- multiple threads may call concurrently. + * @param backend identifies the object store backend + * @param put function pointer to upload an object from a local file + * @param get function pointer to download an object to a local file + * @param range_get function pointer to download a byte range into a buffer + * @param delete_object function pointer to delete an object + * @param exists function pointer to check if an object exists + * @param list function pointer to enumerate objects under a prefix + * @param destroy function pointer to free connector resources + * @param ctx opaque connector context (client handle, credentials, etc.) + */ +typedef struct +{ + tidesdb_objstore_backend_t backend; /* identifies the object store backend */ + + /** + * put -- upload an object from a local file path. + * the connector reads the file and uploads it as an atomic object. + * @param ctx opaque connector context + * @param key object key (path-like, e.g. "cf/L1_5.klog") + * @param local_path path to the local file to upload + * @return 0 on success, -1 on error + */ + int (*put)(void *ctx, const char *key, const char *local_path); + + /** + * get -- download an object to a local file path. + * the connector creates intermediate directories as needed. + * @param ctx opaque connector context + * @param key object key + * @param local_path path to write the downloaded file + * @return 0 on success, -1 on error (including not found) + */ + int (*get)(void *ctx, const char *key, const char *local_path); + + /** + * range_get -- download a byte range of an object into a buffer. + * used for fetching individual blocks without downloading the full file. 
+ * @param ctx opaque connector context + * @param key object key + * @param offset byte offset to start reading + * @param buf output buffer (caller allocated) + * @param size number of bytes to read + * @return bytes read on success, -1 on error + */ + ssize_t (*range_get)(void *ctx, const char *key, uint64_t offset, void *buf, size_t size); + + /** + * delete_object -- delete an object. + * not-found is not an error. + * @param ctx opaque connector context + * @param key object key + * @return 0 on success, -1 on error + */ + int (*delete_object)(void *ctx, const char *key); + + /** + * exists -- check if an object exists and optionally return its size. + * @param ctx opaque connector context + * @param key object key + * @param size_out if non-NULL, receives the object size in bytes + * @return 1 if exists, 0 if not, -1 on error + */ + int (*exists)(void *ctx, const char *key, size_t *size_out); + + /** + * list -- enumerate objects under a key prefix. + * calls the callback for each object found. + * @param ctx opaque connector context + * @param prefix key prefix to list (e.g. "cf/") + * @param cb callback invoked for each object (key, size, cb_ctx) + * @param cb_ctx opaque context passed to callback + * @return number of objects listed, -1 on error + */ + int (*list)(void *ctx, const char *prefix, + void (*cb)(const char *key, size_t size, void *cb_ctx), void *cb_ctx); + + /** + * destroy -- free connector resources. + * called during tidesdb_close. + * @param ctx opaque connector context + */ + void (*destroy)(void *ctx); + + void *ctx; /* opaque connector context (client handle, credentials, etc.) */ +} tidesdb_objstore_t; + +/** + * tidesdb_objstore_config_t + * configuration for object store mode behavior. + * passed to tidesdb_config_t.object_store_config. + * NULL means use defaults. 
+ * @param local_cache_path local directory for cached sstable files (NULL = use db_path) + * @param local_cache_max_bytes maximum cache size in bytes (0 = unlimited) + * @param cache_on_read whether to cache downloaded files locally (default 1) + * @param cache_on_write whether to keep local copy after upload (default 1) + * @param max_concurrent_uploads number of parallel upload threads (default 4) + * @param max_concurrent_downloads number of parallel download threads (default 8) + * @param multipart_threshold byte threshold above which multipart upload is used (default 64MB) + * @param multipart_part_size chunk size for multipart uploads (default 8MB) + * @param sync_manifest_to_object whether to upload MANIFEST after each compaction (default 1) + * @param replicate_wal whether to upload closed WAL segments (default 1) + * @param wal_upload_sync 0 for background WAL upload (default), 1 to block flush + * @param wal_sync_threshold_bytes sync active WAL to object store when it grows by this many bytes + * since the last sync (default 1MB, 0 = disable periodic WAL sync). uses the block manager + * atomic file size for lock-free detection. 
the reaper thread checks every cycle (~100ms) + * and uploads when the threshold is exceeded, bounding the data loss window to the + * write volume rather than wall clock time + * @param wal_sync_on_commit upload WAL after every txn commit for RPO=0 replication (default 0) + * @param replica_mode enable read-only replica mode (default 0) + * @param replica_sync_interval_us MANIFEST poll interval in microseconds (default 5000000) + * @param replica_replay_wal replay WAL for near-real-time reads on replicas (default 1) + */ +typedef struct +{ + const char *local_cache_path; + size_t local_cache_max_bytes; + int cache_on_read; + int cache_on_write; + int max_concurrent_uploads; + int max_concurrent_downloads; + size_t multipart_threshold; + size_t multipart_part_size; + int sync_manifest_to_object; + int replicate_wal; + int wal_upload_sync; + size_t wal_sync_threshold_bytes; + int wal_sync_on_commit; + int replica_mode; + uint64_t replica_sync_interval_us; + int replica_replay_wal; +} tidesdb_objstore_config_t; + +/** + * tidesdb_objstore_default_config + * @return default object store configuration + */ +tidesdb_objstore_config_t tidesdb_objstore_default_config(void); + +/** + * tidesdb_objstore_fs_create + * create a filesystem-backed connector (for testing and local replication). + * stores objects as files under root_dir, mirroring the key path structure. + * @param root_dir directory to store objects in + * @return connector handle, or NULL on error. caller must eventually call destroy. 
+ */ +tidesdb_objstore_t *tidesdb_objstore_fs_create(const char *root_dir); + +#ifdef TIDESDB_WITH_S3 +#include "objstore_s3.h" +#endif + +#endif /* __OBJSTORE_H__ */ diff --git a/storage/tidesdb/libtidesdb/src/objstore_fs.c b/storage/tidesdb/libtidesdb/src/objstore_fs.c new file mode 100644 index 0000000000000..2be53c047322f --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/objstore_fs.c @@ -0,0 +1,541 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include + +#include "objstore.h" + +#ifndef _WIN32 +#include +#include +#else +#include +#include +#endif + +#define TDB_FS_MAX_PATH 4096 +#define TDB_FS_COPY_BUF 65536 +#define TDB_FS_DIR_MODE 0755 +/* extra bytes reserved for the ".tmp.." 
suffix on the atomic-put temp path */ +#define TDB_FS_TMP_SUFFIX_MAX 64 + +/* default object store config values */ +#define TDB_OBJSTORE_DEFAULT_CACHE_ON_READ 1 +#define TDB_OBJSTORE_DEFAULT_CACHE_ON_WRITE 1 +#define TDB_OBJSTORE_DEFAULT_MAX_UPLOADS 4 +#define TDB_OBJSTORE_DEFAULT_MAX_DOWNLOADS 8 +#define TDB_OBJSTORE_DEFAULT_MULTIPART_THRESHOLD (64 * 1024 * 1024) +#define TDB_OBJSTORE_DEFAULT_MULTIPART_PART_SIZE (8 * 1024 * 1024) +#define TDB_OBJSTORE_DEFAULT_SYNC_MANIFEST 1 +#define TDB_OBJSTORE_DEFAULT_REPLICATE_WAL 1 +#define TDB_OBJSTORE_DEFAULT_WAL_UPLOAD_SYNC 0 +#define TDB_OBJSTORE_DEFAULT_WAL_SYNC_THRESHOLD (1024 * 1024) /* 1MB */ +#define TDB_OBJSTORE_DEFAULT_WAL_SYNC_ON_COMMIT 0 +#define TDB_OBJSTORE_DEFAULT_REPLICA_MODE 0 +#define TDB_OBJSTORE_DEFAULT_REPLICA_SYNC_INTERVAL 5000000 /* 5 seconds */ +#define TDB_OBJSTORE_DEFAULT_REPLICA_REPLAY_WAL 1 + +/** + * fs_ctx_t + * internal context for the filesystem connector + * @param root_dir root directory where objects are stored as files + */ +typedef struct +{ + char root_dir[TDB_FS_MAX_PATH]; +} fs_ctx_t; + +/** + * fs_mkdir_p + * create all intermediate directories for a file path + * @param file_path path to a file whose parent directories should be created + */ +static void fs_mkdir_p(const char *file_path) +{ + char tmp[TDB_FS_MAX_PATH]; + snprintf(tmp, sizeof(tmp), "%s", file_path); + + /* we find last separator to get directory portion */ + char *last_sep = strrchr(tmp, '/'); +#ifdef _WIN32 + char *last_bsep = strrchr(tmp, '\\'); + if (last_bsep && (!last_sep || last_bsep > last_sep)) last_sep = last_bsep; +#endif + if (!last_sep) return; + *last_sep = '\0'; + + /* we create each directory component */ + for (char *p = tmp + 1; *p; p++) + { + if (*p == '/' +#ifdef _WIN32 + || *p == '\\' +#endif + ) + { + *p = '\0'; +#ifdef _WIN32 + _mkdir(tmp); +#else + mkdir(tmp, TDB_FS_DIR_MODE); +#endif + *p = '/'; + } + } +#ifdef _WIN32 + _mkdir(tmp); +#else + mkdir(tmp, TDB_FS_DIR_MODE); +#endif +} + +/** + * 
fs_full_path + * build full path by joining root_dir and key + * @param ctx filesystem connector context + * @param key object key (relative path) + * @param out output buffer for the full path + * @param out_size size of the output buffer + */ +static void fs_full_path(const fs_ctx_t *ctx, const char *key, char *out, size_t out_size) +{ + snprintf(out, out_size, "%s/%s", ctx->root_dir, key); +} + +/** + * fs_copy_file + * copy file contents from src_path to dst_path + * @param src_path source file path + * @param dst_path destination file path (parent dirs created if needed) + * @return 0 on success, -1 on error + */ +static int fs_copy_file(const char *src_path, const char *dst_path) +{ + FILE *src = fopen(src_path, "rb"); + if (!src) return -1; + + fs_mkdir_p(dst_path); + + FILE *dst = fopen(dst_path, "wb"); + if (!dst) + { + fclose(src); + return -1; + } + + char buf[TDB_FS_COPY_BUF]; + size_t n; + int rc = 0; + while ((n = fread(buf, 1, sizeof(buf), src)) > 0) + { + if (fwrite(buf, 1, n, dst) != n) + { + rc = -1; + break; + } + } + if (ferror(src)) rc = -1; + + fclose(dst); + fclose(src); + + /** we remove partial destination file on failure so stale corrupt files + * do not prevent re-download on subsequent attempts */ + if (rc != 0) unlink(dst_path); + + return rc; +} + +/** + * fs_put + * upload a local file as an object by copying it to the root directory + * @param ctx opaque connector context + * @param key object key (relative path) + * @param local_path local file to upload + * @return 0 on success, -1 on error + */ +static int fs_put(void *ctx, const char *key, const char *local_path) +{ + fs_ctx_t *fs = (fs_ctx_t *)ctx; + char full[TDB_FS_MAX_PATH * 2]; + fs_full_path(fs, key, full, sizeof(full)); + + /* copy to a unique temp file then atomically rename into place, so a concurrent + * reader/list never observes a partially-written object -- the objstore put contract + * (objstore.h) is "atomic object". 
the temp lives in the same directory as the target + * so the rename stays within one filesystem. */ + char tmp[TDB_FS_MAX_PATH * 2 + TDB_FS_TMP_SUFFIX_MAX]; + snprintf(tmp, sizeof(tmp), "%s.tmp.%ld.%lu", full, (long)TDB_GETPID(), TDB_THREAD_ID()); + + if (fs_copy_file(local_path, tmp) != 0) return -1; + + if (atomic_rename_file(tmp, full) != 0) + { + unlink(tmp); + return -1; + } + return 0; +} + +/** + * fs_get + * download an object to a local file by copying from the root directory + * @param ctx opaque connector context + * @param key object key (relative path) + * @param local_path local path to write the downloaded file + * @return 0 on success, -1 on error (including not found) + */ +static int fs_get(void *ctx, const char *key, const char *local_path) +{ + fs_ctx_t *fs = (fs_ctx_t *)ctx; + char full[TDB_FS_MAX_PATH * 2]; + fs_full_path(fs, key, full, sizeof(full)); + return fs_copy_file(full, local_path); +} + +/** + * fs_range_get + * read a byte range from an object file into a buffer + * @param ctx opaque connector context + * @param key object key (relative path) + * @param offset byte offset to start reading + * @param buf output buffer (caller allocated) + * @param size number of bytes to read + * @return bytes read on success, -1 on error + */ +static ssize_t fs_range_get(void *ctx, const char *key, uint64_t offset, void *buf, size_t size) +{ + fs_ctx_t *fs = (fs_ctx_t *)ctx; + char full[TDB_FS_MAX_PATH * 2]; + fs_full_path(fs, key, full, sizeof(full)); + + int fd = open(full, O_RDONLY, 0); + if (fd < 0) return -1; + + ssize_t nread = pread(fd, buf, size, (off_t)offset); + close(fd); + return nread; +} + +/** + * fs_delete_object + * delete an object file. not-found is not an error. 
+ * @param ctx opaque connector context + * @param key object key (relative path) + * @return 0 on success, -1 on error + */ +static int fs_delete_object(void *ctx, const char *key) +{ + fs_ctx_t *fs = (fs_ctx_t *)ctx; + char full[TDB_FS_MAX_PATH * 2]; + fs_full_path(fs, key, full, sizeof(full)); + +#ifdef _WIN32 + _unlink(full); +#else + unlink(full); +#endif + return 0; +} + +/** + * fs_exists + * check if an object file exists and optionally return its size + * @param ctx opaque connector context + * @param key object key (relative path) + * @param size_out if non-NULL, receives the file size in bytes + * @return 1 if exists, 0 if not, -1 on error + */ +static int fs_exists(void *ctx, const char *key, size_t *size_out) +{ + fs_ctx_t *fs = (fs_ctx_t *)ctx; + char full[TDB_FS_MAX_PATH * 2]; + fs_full_path(fs, key, full, sizeof(full)); + + struct stat st; + if (stat(full, &st) != 0) + { + if (errno == ENOENT) return 0; + return -1; + } + + if (size_out) *size_out = (size_t)st.st_size; + return 1; +} + +/** + * fs_list_walk + * recursively walk abs_dir and invoke cb for each regular file whose + * relative key starts with prefix. subdirectories whose relative path + * already diverges from prefix are not descended into. 
+ * @param abs_dir absolute filesystem path of the directory to walk + * @param rel_dir relative key path of abs_dir within the store ("" at root) + * @param rel_dir_len cached strlen(rel_dir) + * @param prefix target key prefix + * @param prefix_len cached strlen(prefix) + * @param cb callback invoked for each matching file (key, size, cb_ctx) + * @param cb_ctx opaque context passed to callback + * @param count running count of objects emitted + * @return updated count + */ +static int fs_list_walk(const char *abs_dir, const char *rel_dir, size_t rel_dir_len, + const char *prefix, size_t prefix_len, + void (*cb)(const char *key, size_t size, void *cb_ctx), void *cb_ctx, + int count) +{ +#ifdef _WIN32 + char pattern[TDB_FS_MAX_PATH * 2]; + snprintf(pattern, sizeof(pattern), "%s\\*", abs_dir); + + struct _finddata_t fd; + intptr_t handle = _findfirst(pattern, &fd); + if (handle == -1) return count; + + do + { + if (fd.name[0] == '.' && (fd.name[1] == '\0' || (fd.name[1] == '.' && fd.name[2] == '\0'))) + continue; + + char child_rel[TDB_FS_MAX_PATH]; + int n = (rel_dir_len == 0) + ? snprintf(child_rel, sizeof(child_rel), "%s", fd.name) + : snprintf(child_rel, sizeof(child_rel), "%s/%s", rel_dir, fd.name); + if (n < 0 || (size_t)n >= sizeof(child_rel)) continue; + size_t child_rel_len = (size_t)n; + + if (fd.attrib & _A_SUBDIR) + { + size_t cmp = child_rel_len < prefix_len ? 
child_rel_len : prefix_len; + if (cmp && strncmp(child_rel, prefix, cmp) != 0) continue; + + char child_abs[TDB_FS_MAX_PATH * 2]; + snprintf(child_abs, sizeof(child_abs), "%s\\%s", abs_dir, fd.name); + count = fs_list_walk(child_abs, child_rel, child_rel_len, prefix, prefix_len, cb, + cb_ctx, count); + continue; + } + + if (prefix_len != 0 && + (child_rel_len < prefix_len || strncmp(child_rel, prefix, prefix_len) != 0)) + continue; + + cb(child_rel, (size_t)fd.size, cb_ctx); + count++; + } while (_findnext(handle, &fd) == 0); + + _findclose(handle); +#else + DIR *d = opendir(abs_dir); + if (!d) return count; + + struct dirent *ent; + while ((ent = readdir(d)) != NULL) + { + if (ent->d_name[0] == '.' && + (ent->d_name[1] == '\0' || (ent->d_name[1] == '.' && ent->d_name[2] == '\0'))) + continue; + + char child_rel[TDB_FS_MAX_PATH]; + int n = (rel_dir_len == 0) + ? snprintf(child_rel, sizeof(child_rel), "%s", ent->d_name) + : snprintf(child_rel, sizeof(child_rel), "%s/%s", rel_dir, ent->d_name); + if (n < 0 || (size_t)n >= sizeof(child_rel)) continue; + size_t child_rel_len = (size_t)n; + + /* prefer dirent::d_type; fall back to stat() only when the FS reports DT_UNKNOWN */ + int is_dir = 0, is_reg = 0; +#ifdef DT_DIR + if (ent->d_type == DT_DIR) + is_dir = 1; + else if (ent->d_type == DT_REG) + is_reg = 1; + else if (ent->d_type != DT_UNKNOWN) + continue; + else +#endif + { + char child_abs[TDB_FS_MAX_PATH * 2]; + snprintf(child_abs, sizeof(child_abs), "%s/%s", abs_dir, ent->d_name); + struct stat st; + if (stat(child_abs, &st) != 0) continue; + if (S_ISDIR(st.st_mode)) + is_dir = 1; + else if (S_ISREG(st.st_mode)) + is_reg = 1; + else + continue; + } + + if (is_dir) + { + size_t cmp = child_rel_len < prefix_len ? 
child_rel_len : prefix_len; + if (cmp && strncmp(child_rel, prefix, cmp) != 0) continue; + + char child_abs[TDB_FS_MAX_PATH * 2]; + snprintf(child_abs, sizeof(child_abs), "%s/%s", abs_dir, ent->d_name); + count = fs_list_walk(child_abs, child_rel, child_rel_len, prefix, prefix_len, cb, + cb_ctx, count); + continue; + } + + if (!is_reg) continue; + + if (prefix_len != 0 && + (child_rel_len < prefix_len || strncmp(child_rel, prefix, prefix_len) != 0)) + continue; + + char child_abs[TDB_FS_MAX_PATH * 2]; + snprintf(child_abs, sizeof(child_abs), "%s/%s", abs_dir, ent->d_name); + struct stat st; + if (stat(child_abs, &st) != 0) continue; + cb(child_rel, (size_t)st.st_size, cb_ctx); + count++; + } + + closedir(d); +#endif + return count; +} + +/** + * fs_list + * enumerate all objects whose key starts with prefix. matches S3 + * ListObjectsV2(prefix=...) semantics, the prefix is matched byte-wise + * against the key and need not align to a directory boundary. + * @param ctx opaque connector context + * @param prefix key prefix to list (e.g. 
"cf_name/" or "uwal_") + * @param cb callback invoked for each object (key, size, cb_ctx) + * @param cb_ctx opaque context passed to callback + * @return number of objects listed, -1 on error + */ +static int fs_list(void *ctx, const char *prefix, + void (*cb)(const char *key, size_t size, void *cb_ctx), void *cb_ctx) +{ + fs_ctx_t *fs = (fs_ctx_t *)ctx; + + /* descend straight to the deepest directory component embedded in prefix + * so we don't walk ancestors that cannot contain a matching key */ + const char *last_sep = strrchr(prefix, '/'); +#ifdef _WIN32 + { + const char *bs = strrchr(prefix, '\\'); + if (bs && (!last_sep || bs > last_sep)) last_sep = bs; + } +#endif + + char start_abs[TDB_FS_MAX_PATH * 2]; + char start_rel[TDB_FS_MAX_PATH]; + size_t start_rel_len = 0; + if (last_sep && last_sep != prefix) + { + size_t dir_len = (size_t)(last_sep - prefix); + snprintf(start_abs, sizeof(start_abs), "%s/%.*s", fs->root_dir, (int)dir_len, prefix); + snprintf(start_rel, sizeof(start_rel), "%.*s", (int)dir_len, prefix); + start_rel_len = dir_len; + } + else + { + snprintf(start_abs, sizeof(start_abs), "%s", fs->root_dir); + start_rel[0] = '\0'; + } + + return fs_list_walk(start_abs, start_rel, start_rel_len, prefix, strlen(prefix), cb, cb_ctx, 0); +} + +/** + * fs_destroy + * free connector resources + * @param ctx opaque connector context + */ +static void fs_destroy(void *ctx) +{ + free(ctx); +} + +/** + * tidesdb_objstore_default_config + * return default object store configuration with sensible defaults + * @return default tidesdb_objstore_config_t struct + */ +tidesdb_objstore_config_t tidesdb_objstore_default_config(void) +{ + return (tidesdb_objstore_config_t){ + .local_cache_path = NULL, + .local_cache_max_bytes = 0, + .cache_on_read = TDB_OBJSTORE_DEFAULT_CACHE_ON_READ, + .cache_on_write = TDB_OBJSTORE_DEFAULT_CACHE_ON_WRITE, + .max_concurrent_uploads = TDB_OBJSTORE_DEFAULT_MAX_UPLOADS, + .max_concurrent_downloads = TDB_OBJSTORE_DEFAULT_MAX_DOWNLOADS, + 
.multipart_threshold = TDB_OBJSTORE_DEFAULT_MULTIPART_THRESHOLD, + .multipart_part_size = TDB_OBJSTORE_DEFAULT_MULTIPART_PART_SIZE, + .sync_manifest_to_object = TDB_OBJSTORE_DEFAULT_SYNC_MANIFEST, + .replicate_wal = TDB_OBJSTORE_DEFAULT_REPLICATE_WAL, + .wal_upload_sync = TDB_OBJSTORE_DEFAULT_WAL_UPLOAD_SYNC, + .wal_sync_threshold_bytes = TDB_OBJSTORE_DEFAULT_WAL_SYNC_THRESHOLD, + .wal_sync_on_commit = TDB_OBJSTORE_DEFAULT_WAL_SYNC_ON_COMMIT, + .replica_mode = TDB_OBJSTORE_DEFAULT_REPLICA_MODE, + .replica_sync_interval_us = TDB_OBJSTORE_DEFAULT_REPLICA_SYNC_INTERVAL, + .replica_replay_wal = TDB_OBJSTORE_DEFAULT_REPLICA_REPLAY_WAL, + }; +} + +/** + * tidesdb_objstore_fs_create + * create a filesystem-backed connector (for testing and local replication). + * stores objects as files under root_dir, mirroring the key path structure. + * @param root_dir directory to store objects in + * @return connector handle, or NULL on error. caller must eventually call destroy. + */ +tidesdb_objstore_t *tidesdb_objstore_fs_create(const char *root_dir) +{ + if (!root_dir) return NULL; + + fs_ctx_t *fs = calloc(1, sizeof(fs_ctx_t)); + if (!fs) return NULL; + + snprintf(fs->root_dir, sizeof(fs->root_dir), "%s", root_dir); + + /* we create root directory if it does not exist */ +#ifdef _WIN32 + _mkdir(root_dir); +#else + mkdir(root_dir, TDB_FS_DIR_MODE); +#endif + + tidesdb_objstore_t *store = calloc(1, sizeof(tidesdb_objstore_t)); + if (!store) + { + free(fs); + return NULL; + } + + store->backend = TDB_BACKEND_FS; + store->put = fs_put; + store->get = fs_get; + store->range_get = fs_range_get; + store->delete_object = fs_delete_object; + store->exists = fs_exists; + store->list = fs_list; + store->destroy = fs_destroy; + store->ctx = fs; + + return store; +} diff --git a/storage/tidesdb/libtidesdb/src/objstore_s3.c b/storage/tidesdb/libtidesdb/src/objstore_s3.c new file mode 100644 index 0000000000000..41e5236fe4a42 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/objstore_s3.c @@ 
-0,0 +1,1643 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef TIDESDB_WITH_S3 + +#include "objstore_s3.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +/* path and buffer size constants */ +#define TDB_S3_MAX_PATH 8192 +#define TDB_S3_MAX_HEADER 2048 +#define TDB_S3_DATE_LEN 9 /* YYYYMMDD + NUL */ +#define TDB_S3_TIMESTAMP_LEN 17 /* YYYYMMDDTHHMMSSZ + NUL */ +#define TDB_S3_HASH_HEX_LEN 65 /* SHA256 hex + NUL */ +#define TDB_S3_SHA256_DIGEST 32 /* SHA256 raw digest bytes */ +#define TDB_S3_DIR_MODE 0755 + +/* context struct buffer sizes */ +#define TDB_S3_ENDPOINT_MAX 512 +#define TDB_S3_BUCKET_MAX 256 +#define TDB_S3_PREFIX_MAX 512 +#define TDB_S3_KEY_MAX 128 +#define TDB_S3_REGION_MAX 64 + +/* HTTP status codes */ +#define TDB_S3_HTTP_OK 200 +#define TDB_S3_HTTP_PARTIAL 206 +#define TDB_S3_HTTP_REDIRECT 300 +#define TDB_S3_HTTP_NOT_FOUND 404 + +/* signing and response buffers. host and key_date buffers must be large + * enough for concatenated bucket+endpoint or "AWS4"+secret_key strings. 
*/ +#define TDB_S3_SCOPE_BUF 128 +#define TDB_S3_STS_BUF 512 +#define TDB_S3_HOST_BUF 1024 +#define TDB_S3_RESPONSE_INIT 4096 +#define TDB_S3_CONT_TOKEN_MAX 1024 +#define TDB_S3_XML_TAG_BUF 128 +#define TDB_S3_SIZE_BUF 32 +#define TDB_S3_KEY_DATE_BUF 256 + +/* default region when none specified */ +#define TDB_S3_DEFAULT_REGION "us-east-1" + +/* network timeouts -- bound a hung connection so a dead or unreachable + * endpoint cannot block an upload worker, or a wal_sync_on_commit commit, + * forever. a hard total timeout is avoided so a legitimately slow large + * upload is not cut off; instead a stalled-transfer detector is used. */ +#define TDB_S3_CONNECT_TIMEOUT_S 15 +#define TDB_S3_LOW_SPEED_LIMIT 1 /* bytes per second */ +#define TDB_S3_LOW_SPEED_TIME_S 60 /* abort a transfer stalled below the limit this long */ + +/* multipart upload -- objects at or above the threshold are uploaded in + * parts so the connector never buffers a whole large file in memory and is + * not bound by S3's 5 GiB single-PUT limit. S3 requires parts of at least + * 5 MiB (the final part may be smaller) and at most 10000 parts. + * these match the documented objstore_config defaults (threshold 64 MiB, + * part size 8 MiB); honoring per-config overrides at runtime additionally + * requires plumbing multipart_threshold / multipart_part_size through the + * public tidesdb_objstore_s3_create signature (deferred -- API change). */ +#define TDB_S3_MULTIPART_THRESHOLD ((size_t)64 * 1024 * 1024) +#define TDB_S3_MULTIPART_PART_SIZE ((size_t)8 * 1024 * 1024) +#define TDB_S3_MAX_PARTS 10000 +#define TDB_S3_ETAG_MAX 128 +#define TDB_S3_UPLOAD_ID_MAX 512 + +/** + * s3_uri_encode + * URI-encode a string per the SigV4 spec. encodes all bytes except unreserved + * characters (A-Z, a-z, 0-9, '-', '.', '_', '~'). forward slashes are encoded + * as %2F since this is used for query parameter values, not object key paths. 
/**
 * s3_uri_encode_impl
 * shared percent-encoder behind s3_uri_encode and s3_uri_encode_path.
 * copies unreserved bytes (A-Z, a-z, 0-9, '-', '.', '_', '~') verbatim and
 * writes every other byte as uppercase %XX. encoding stops early, on a whole
 * character, once fewer than four bytes of room remain; the output is always
 * NUL-terminated when dst_size > 0.
 * @param src input string (NULL is treated as empty)
 * @param dst output buffer
 * @param dst_size size of output buffer; 0 leaves dst untouched
 * @param keep_slash non-zero to copy '/' verbatim (path encoding)
 */
static void s3_uri_encode_impl(const char *src, char *dst, size_t dst_size, int keep_slash)
{
    static const char hex_digits[] = "0123456789ABCDEF";

    if (!dst || dst_size == 0) return;

    size_t pos = 0;
    if (src)
    {
        for (; *src && pos + 3 < dst_size; src++)
        {
            /* go through unsigned char so bytes >= 0x80 encode as %80..%FF */
            const unsigned char c = (unsigned char)*src;
            const int unreserved = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ||
                                   (c >= '0' && c <= '9') || c == '-' || c == '.' || c == '_' ||
                                   c == '~';
            if (unreserved || (keep_slash && c == '/'))
            {
                dst[pos++] = (char)c;
            }
            else
            {
                dst[pos++] = '%';
                dst[pos++] = hex_digits[c >> 4];
                dst[pos++] = hex_digits[c & 0x0F];
            }
        }
    }
    dst[pos] = '\0';
}

/**
 * s3_uri_encode
 * URI-encode a string per the SigV4 spec. encodes all bytes except unreserved
 * characters (A-Z, a-z, 0-9, '-', '.', '_', '~'). forward slashes are encoded
 * as %2F since this is used for query parameter values, not object key paths.
 * @param src input string
 * @param dst output buffer
 * @param dst_size size of output buffer
 */
static void s3_uri_encode(const char *src, char *dst, size_t dst_size)
{
    s3_uri_encode_impl(src, dst, dst_size, 0);
}

/**
 * s3_uri_encode_path
 * URI-encode an object key for use as a request path / SigV4 canonical URI. like
 * s3_uri_encode but leaves '/' unencoded so path segments are preserved. the request
 * URL (s3_build_url) and the canonical URI (s3_sign_request) MUST apply the exact same
 * encoding or the SigV4 signature will not match the request. for keys made only of
 * unreserved characters and '/' this is a passthrough; it only matters for keys
 * containing spaces, '+', '?', '#', '&', etc.
 * @param src input key
 * @param dst output buffer
 * @param dst_size size of output buffer
 */
static void s3_uri_encode_path(const char *src, char *dst, size_t dst_size)
{
    s3_uri_encode_impl(src, dst, dst_size, 1);
}
/**
 * s3_ctx_t
 * internal context for the S3 connector, credentials, endpoint, TLS, and multipart config.
 * defined before s3_curl_new so that helper can apply the per-connector TLS options.
 * all strings are stored inline in fixed-size, NUL-terminated buffers.
 * @param endpoint S3 endpoint hostname
 * @param bucket S3 bucket name
 * @param prefix key prefix prepended to all object keys
 * @param access_key AWS access key ID
 * @param secret_key AWS secret access key
 * @param region AWS region string
 * @param use_ssl 1 for HTTPS, 0 for HTTP
 * @param use_path_style 1 for path-style URLs, 0 for virtual-hosted
 * @param tls_ca_path custom CA bundle file path (empty = libcurl default bundle)
 * @param tls_insecure_skip_verify 1 disables peer+host verification (test endpoints only)
 * @param multipart_threshold object size at/above which multipart upload is used
 * @param multipart_part_size multipart chunk size
 */
typedef struct
{
    char endpoint[TDB_S3_ENDPOINT_MAX];   /* host only; the scheme is chosen from use_ssl */
    char bucket[TDB_S3_BUCKET_MAX];       /* first path segment (path style) or host label */
    char prefix[TDB_S3_PREFIX_MAX];       /* concatenated with the key as-is, no separator added */
    char access_key[TDB_S3_KEY_MAX];      /* goes into the Authorization Credential= field */
    char secret_key[TDB_S3_KEY_MAX];      /* seeds the SigV4 signing-key HMAC chain */
    char region[TDB_S3_REGION_MAX];       /* part of the SigV4 credential scope */
    int use_ssl;
    int use_path_style;
    char tls_ca_path[TDB_S3_MAX_PATH];    /* consulted only when use_ssl is set */
    int tls_insecure_skip_verify;         /* consulted only when use_ssl is set */
    size_t multipart_threshold;
    size_t multipart_part_size;
} s3_ctx_t;
+ * @param s3 connector context (for TLS settings) + * @return a configured handle, or NULL on allocation failure + */ +static CURL *s3_curl_new(const s3_ctx_t *s3) +{ + CURL *curl = curl_easy_init(); + if (!curl) return NULL; + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, (long)TDB_S3_CONNECT_TIMEOUT_S); + curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, (long)TDB_S3_LOW_SPEED_LIMIT); + curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, (long)TDB_S3_LOW_SPEED_TIME_S); + + /* TLS only matters over https. leaving both branches untouched keeps libcurl's secure + * defaults (verify peer + host against the system CA bundle). */ + if (s3 && s3->use_ssl) + { + if (s3->tls_ca_path[0]) curl_easy_setopt(curl, CURLOPT_CAINFO, s3->tls_ca_path); + if (s3->tls_insecure_skip_verify) + { + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L); + } + } + return curl; +} + +/** + * sha256_hex + * compute SHA256 hash and output as lowercase hex string + * @param data input data + * @param len length of input data + * @param hex_out output buffer (must be at least TDB_S3_HASH_HEX_LEN bytes) + */ +static void sha256_hex(const void *data, size_t len, char *hex_out) +{ + unsigned char hash[TDB_S3_SHA256_DIGEST]; + EVP_MD_CTX *ctx = EVP_MD_CTX_new(); + EVP_DigestInit_ex(ctx, EVP_sha256(), NULL); + EVP_DigestUpdate(ctx, data, len); + EVP_DigestFinal_ex(ctx, hash, NULL); + EVP_MD_CTX_free(ctx); + for (int i = 0; i < 32; i++) sprintf(hex_out + i * 2, "%02x", hash[i]); + hex_out[64] = '\0'; +} + +/** + * hmac_sha256 + * compute HMAC-SHA256 + * @param key HMAC key + * @param key_len length of key + * @param data input data + * @param data_len length of data + * @param out output buffer (TDB_S3_SHA256_DIGEST bytes) + * @param out_len receives the output length + */ +static void hmac_sha256(const void *key, size_t key_len, const void *data, size_t data_len, + unsigned char *out, unsigned int 
*out_len) +{ + HMAC(EVP_sha256(), key, (int)key_len, (const unsigned char *)data, data_len, out, out_len); +} + +/** + * s3_get_timestamp + * get current UTC time in AWS SigV4 date and timestamp formats + * @param date8 output YYYYMMDD (TDB_S3_DATE_LEN bytes) + * @param timestamp16 output YYYYMMDDTHHMMSSZ (TDB_S3_TIMESTAMP_LEN bytes) + */ +static void s3_get_timestamp(char *date8, char *timestamp16) +{ + time_t now = time(NULL); + struct tm gm; + tdb_gmtime_r(&now, &gm); + strftime(date8, TDB_S3_DATE_LEN, "%Y%m%d", &gm); + strftime(timestamp16, TDB_S3_TIMESTAMP_LEN, "%Y%m%dT%H%M%SZ", &gm); +} + +/** + * s3_signing_key + * derive the SigV4 signing key via HMAC chain date -> region -> service -> request + * @param secret_key AWS secret access key + * @param date8 date string YYYYMMDD + * @param region AWS region + * @param out output signing key (TDB_S3_SHA256_DIGEST bytes) + * @param out_len receives the output length + */ +static void s3_signing_key(const char *secret_key, const char *date8, const char *region, + unsigned char *out, unsigned int *out_len) +{ + char key_date[TDB_S3_KEY_DATE_BUF]; + snprintf(key_date, sizeof(key_date), "AWS4%s", secret_key); + + unsigned char k1[TDB_S3_SHA256_DIGEST], k2[TDB_S3_SHA256_DIGEST], k3[TDB_S3_SHA256_DIGEST]; + unsigned int l; + hmac_sha256(key_date, strlen(key_date), date8, strlen(date8), k1, &l); + hmac_sha256(k1, l, region, strlen(region), k2, &l); + hmac_sha256(k2, l, "s3", 2, k3, &l); + hmac_sha256(k3, l, "aws4_request", 12, out, out_len); +} + +/** + * s3_build_url + * construct the full URL for an S3 object request + * @param ctx S3 connector context + * @param key object key + * @param url output URL buffer + * @param url_size size of the URL buffer + */ +#ifndef _MSC_VER +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-truncation" +#endif +static void s3_build_url(const s3_ctx_t *ctx, const char *key, char *url, size_t url_size) +{ + const char *scheme = ctx->use_ssl ? 
"https" : "http"; + char full_key[TDB_S3_MAX_PATH]; + if (ctx->prefix[0]) + snprintf(full_key, sizeof(full_key), "%s%s", ctx->prefix, key); + else + snprintf(full_key, sizeof(full_key), "%s", key); + + /* must match the canonical-URI encoding in s3_sign_request exactly */ + char enc_key[TDB_S3_MAX_PATH * 3]; + s3_uri_encode_path(full_key, enc_key, sizeof(enc_key)); + + if (ctx->use_path_style) + snprintf(url, url_size, "%s://%s/%s/%s", scheme, ctx->endpoint, ctx->bucket, enc_key); + else + snprintf(url, url_size, "%s://%s.%s/%s", scheme, ctx->bucket, ctx->endpoint, enc_key); +} +#ifndef _MSC_VER +#pragma GCC diagnostic pop +#endif + +/** + * s3_build_host + * construct the Host header value for S3 requests + * @param ctx S3 connector context + * @param host output host string + * @param host_size size of host buffer + */ +static void s3_build_host(const s3_ctx_t *ctx, char *host, size_t host_size) +{ + if (ctx->use_path_style) + snprintf(host, host_size, "%s", ctx->endpoint); + else + snprintf(host, host_size, "%s.%s", ctx->bucket, ctx->endpoint); +} + +#ifndef _MSC_VER +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-truncation" +#endif +/** + * s3_sign_raw + * create AWS SigV4 signed HTTP headers given explicit canonical URI and query string. + * this is the low-level signing function used by both object operations and list requests. + * @param ctx S3 connector context + * @param method HTTP method (GET, PUT, DELETE, HEAD) + * @param canonical_uri the URI path component of the request (e.g. 
"/bucket/key") + * @param canonical_query_string the query string component (alphabetically sorted, or "") + * @param content_sha256 hex-encoded SHA256 of the request body + * @param extra_headers_canonical additional canonical headers (or NULL) + * @param extra_signed_headers additional signed header names (or NULL) + * @return curl_slist of signed headers (caller must free with curl_slist_free_all) + */ +static struct curl_slist *s3_sign_raw(const s3_ctx_t *ctx, const char *method, + const char *canonical_uri, const char *canonical_query_string, + const char *content_sha256, + const char *extra_headers_canonical, + const char *extra_signed_headers) +{ + char date8[TDB_S3_DATE_LEN], timestamp[TDB_S3_TIMESTAMP_LEN]; + s3_get_timestamp(date8, timestamp); + + char host[TDB_S3_HOST_BUF]; + s3_build_host(ctx, host, sizeof(host)); + + /* canonical request */ + char canonical_request[TDB_S3_MAX_PATH * 4]; + snprintf(canonical_request, sizeof(canonical_request), + "%s\n%s\n%s\nhost:%s\nx-amz-content-sha256:%s\nx-amz-date:%s\n%s\n" + "host;x-amz-content-sha256;x-amz-date%s\n%s", + method, canonical_uri, canonical_query_string ? canonical_query_string : "", host, + content_sha256, timestamp, extra_headers_canonical ? extra_headers_canonical : "", + extra_signed_headers ? 
extra_signed_headers : "", content_sha256); + + char canonical_hash[TDB_S3_HASH_HEX_LEN]; + sha256_hex(canonical_request, strlen(canonical_request), canonical_hash); + + /* string to sign */ + char scope[TDB_S3_SCOPE_BUF]; + snprintf(scope, sizeof(scope), "%s/%s/s3/aws4_request", date8, ctx->region); + + char string_to_sign[TDB_S3_STS_BUF]; + snprintf(string_to_sign, sizeof(string_to_sign), "AWS4-HMAC-SHA256\n%s\n%s\n%s", timestamp, + scope, canonical_hash); + + /* signature */ + unsigned char signing_key[TDB_S3_SHA256_DIGEST]; + unsigned int sk_len; + s3_signing_key(ctx->secret_key, date8, ctx->region, signing_key, &sk_len); + + unsigned char sig_raw[TDB_S3_SHA256_DIGEST]; + unsigned int sig_len; + hmac_sha256(signing_key, sk_len, string_to_sign, strlen(string_to_sign), sig_raw, &sig_len); + + char sig_hex[TDB_S3_HASH_HEX_LEN]; + for (unsigned int i = 0; i < sig_len; i++) sprintf(sig_hex + i * 2, "%02x", sig_raw[i]); + sig_hex[sig_len * 2] = '\0'; + + char auth_header[TDB_S3_MAX_HEADER]; + snprintf(auth_header, sizeof(auth_header), + "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s, " + "SignedHeaders=host;x-amz-content-sha256;x-amz-date%s, Signature=%s", + ctx->access_key, scope, extra_signed_headers ? extra_signed_headers : "", sig_hex); + + /* we build curl headers */ + struct curl_slist *headers = NULL; + char hdr[TDB_S3_MAX_HEADER]; + + snprintf(hdr, sizeof(hdr), "Host: %s", host); + headers = curl_slist_append(headers, hdr); + + snprintf(hdr, sizeof(hdr), "x-amz-date: %s", timestamp); + headers = curl_slist_append(headers, hdr); + + snprintf(hdr, sizeof(hdr), "x-amz-content-sha256: %s", content_sha256); + headers = curl_slist_append(headers, hdr); + + headers = curl_slist_append(headers, auth_header); + + return headers; +} +#ifndef _MSC_VER +#pragma GCC diagnostic pop +#endif + +/** + * s3_sign_request + * create AWS SigV4 signed HTTP headers for an S3 object request. 
+ * computes the canonical URI from the key and connector prefix, + * then delegates to s3_sign_raw with an empty query string. + * @param ctx S3 connector context + * @param method HTTP method (GET, PUT, DELETE, HEAD) + * @param key object key + * @param content_sha256 hex-encoded SHA256 of the request body + * @param extra_headers_canonical additional canonical headers (or NULL) + * @param extra_signed_headers additional signed header names (or NULL) + * @return curl_slist of signed headers (caller must free with curl_slist_free_all) + */ +#ifndef _MSC_VER +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-truncation" +#endif +static struct curl_slist *s3_sign_request(const s3_ctx_t *ctx, const char *method, const char *key, + const char *content_sha256, + const char *extra_headers_canonical, + const char *extra_signed_headers) +{ + char full_key[TDB_S3_MAX_PATH]; + if (ctx->prefix[0]) + snprintf(full_key, sizeof(full_key), "%s%s", ctx->prefix, key); + else + snprintf(full_key, sizeof(full_key), "%s", key); + + /* URI-encode the key path exactly as s3_build_url does, or the signature will not + * match the request for keys containing characters outside [A-Za-z0-9-._~/] */ + char enc_key[TDB_S3_MAX_PATH * 3]; + s3_uri_encode_path(full_key, enc_key, sizeof(enc_key)); + + char canonical_uri[TDB_S3_MAX_PATH * 3 + 256]; + if (ctx->use_path_style) + snprintf(canonical_uri, sizeof(canonical_uri), "/%s/%s", ctx->bucket, enc_key); + else + snprintf(canonical_uri, sizeof(canonical_uri), "/%s", enc_key); + + return s3_sign_raw(ctx, method, canonical_uri, "", content_sha256, extra_headers_canonical, + extra_signed_headers); +} +#ifndef _MSC_VER +#pragma GCC diagnostic pop +#endif + +/** + * s3_write_ctx_t + * context for curl write callbacks, supports writing to file or buffer + * @param fp file pointer for file-based writes (NULL if writing to buffer) + * @param buf buffer pointer for in-memory writes (NULL if writing to file) + * @param buf_size total 
size of the output buffer + * @param written number of bytes written so far + */ +typedef struct +{ + FILE *fp; + char *buf; + size_t buf_size; + size_t written; +} s3_write_ctx_t; + +/** + * s3_write_to_file + * curl write callback that writes received data to a file + * @param ptr pointer to received data + * @param size size of each element + * @param nmemb number of elements + * @param userdata pointer to s3_write_ctx_t with fp set + * @return number of bytes written + */ +static size_t s3_write_to_file(void *ptr, size_t size, size_t nmemb, void *userdata) +{ + s3_write_ctx_t *wctx = (s3_write_ctx_t *)userdata; + return fwrite(ptr, size, nmemb, wctx->fp); +} + +/** + * s3_write_to_buf + * curl write callback that copies received data into a fixed-size buffer + * @param ptr pointer to received data + * @param size size of each element + * @param nmemb number of elements + * @param userdata pointer to s3_write_ctx_t with buf and buf_size set + * @return number of bytes consumed (always size * nmemb to avoid curl error) + */ +static size_t s3_write_to_buf(void *ptr, size_t size, size_t nmemb, void *userdata) +{ + s3_write_ctx_t *wctx = (s3_write_ctx_t *)userdata; + size_t bytes = size * nmemb; + size_t avail = wctx->buf_size - wctx->written; + size_t to_copy = bytes < avail ? 
bytes : avail; + memcpy(wctx->buf + wctx->written, ptr, to_copy); + wctx->written += to_copy; + return bytes; /* always consume all data to avoid curl error */ +} + +/** + * s3_write_discard + * curl write callback that discards all received data + * @param ptr pointer to received data (unused) + * @param size size of each element + * @param nmemb number of elements + * @param userdata unused + * @return number of bytes consumed (always size * nmemb) + */ +static size_t s3_write_discard(void *ptr, size_t size, size_t nmemb, void *userdata) +{ + (void)ptr; + (void)userdata; + return size * nmemb; +} + +/** + * s3_get + * download an S3 object to a local file, creating parent directories as needed + * @param ctx opaque S3 connector context + * @param key object key + * @param local_path path to write the downloaded file + * @return 0 on success, -1 on error (including not found) + */ +static int s3_get(void *ctx, const char *key, const char *local_path) +{ + s3_ctx_t *s3 = (s3_ctx_t *)ctx; + + char empty_sha[TDB_S3_HASH_HEX_LEN]; + sha256_hex("", 0, empty_sha); + + struct curl_slist *headers = s3_sign_request(s3, "GET", key, empty_sha, NULL, NULL); + + char url[TDB_S3_MAX_PATH]; + s3_build_url(s3, key, url, sizeof(url)); + + /* we create parent directories for local_path */ + char dir_buf[TDB_S3_MAX_PATH]; + snprintf(dir_buf, sizeof(dir_buf), "%s", local_path); + char *sep = strrchr(dir_buf, '/'); + if (sep) + { + *sep = '\0'; + + for (char *p = dir_buf + 1; *p; p++) + { + if (*p == '/') + { + *p = '\0'; + mkdir(dir_buf, TDB_S3_DIR_MODE); + *p = '/'; + } + } + mkdir(dir_buf, 0755); + } + + FILE *fp = fopen(local_path, "wb"); + if (!fp) + { + curl_slist_free_all(headers); + return -1; + } + + s3_write_ctx_t wctx = {.fp = fp}; + + CURL *curl = s3_curl_new(s3); + if (!curl) + { + fclose(fp); + unlink(local_path); + curl_slist_free_all(headers); + return -1; + } + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + 
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, s3_write_to_file); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &wctx); + + CURLcode res = curl_easy_perform(curl); + long http_code = 0; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); + + fclose(fp); + curl_slist_free_all(headers); + curl_easy_cleanup(curl); + + if (res != CURLE_OK || http_code < TDB_S3_HTTP_OK || http_code >= TDB_S3_HTTP_REDIRECT) + { + unlink(local_path); + return -1; + } + return 0; +} + +/** + * s3_range_get + * download a byte range of an S3 object into a caller-allocated buffer + * @param ctx opaque S3 connector context + * @param key object key + * @param offset byte offset to start reading + * @param buf output buffer (caller allocated) + * @param size number of bytes to read + * @return bytes read on success, -1 on error + */ +static ssize_t s3_range_get(void *ctx, const char *key, uint64_t offset, void *buf, size_t size) +{ + s3_ctx_t *s3 = (s3_ctx_t *)ctx; + + char empty_sha[TDB_S3_HASH_HEX_LEN]; + sha256_hex("", 0, empty_sha); + + /* we sign without Range header -- S3/MinIO does not require Range to be signed */ + struct curl_slist *headers = s3_sign_request(s3, "GET", key, empty_sha, NULL, NULL); + + char range_hdr[128]; + snprintf(range_hdr, sizeof(range_hdr), "Range: bytes=%" PRIu64 "-%" PRIu64, offset, + offset + size - 1); + headers = curl_slist_append(headers, range_hdr); + + char url[TDB_S3_MAX_PATH]; + s3_build_url(s3, key, url, sizeof(url)); + + s3_write_ctx_t wctx = {.buf = (char *)buf, .buf_size = size, .written = 0}; + + CURL *curl = s3_curl_new(s3); + if (!curl) + { + curl_slist_free_all(headers); + return -1; + } + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, s3_write_to_buf); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &wctx); + + CURLcode res = curl_easy_perform(curl); + long http_code = 0; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); + + 
curl_slist_free_all(headers); + curl_easy_cleanup(curl); + + if (res != CURLE_OK || (http_code != TDB_S3_HTTP_OK && http_code != TDB_S3_HTTP_PARTIAL)) + return -1; + return (ssize_t)wctx.written; +} + +/** + * s3_delete_object + * delete an object from S3. not-found is not an error. + * @param ctx opaque S3 connector context + * @param key object key to delete + * @return 0 on success, -1 on error + */ +static int s3_delete_object(void *ctx, const char *key) +{ + s3_ctx_t *s3 = (s3_ctx_t *)ctx; + + char empty_sha[TDB_S3_HASH_HEX_LEN]; + sha256_hex("", 0, empty_sha); + + struct curl_slist *headers = s3_sign_request(s3, "DELETE", key, empty_sha, NULL, NULL); + + char url[TDB_S3_MAX_PATH]; + s3_build_url(s3, key, url, sizeof(url)); + + CURL *curl = s3_curl_new(s3); + if (!curl) + { + curl_slist_free_all(headers); + return -1; + } + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "DELETE"); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, s3_write_discard); + + CURLcode res = curl_easy_perform(curl); + + long http_code = 0; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); + + curl_slist_free_all(headers); + curl_easy_cleanup(curl); + + if (res != CURLE_OK) return -1; + /* 2xx (200/204 No Content) = deleted, 404 Not Found = already absent; both are success. + * any other status (403, 5xx, ...) is a real failure that must NOT be masked, or the + * integration layer's retry/cleanup is silently defeated. 
*/
+    if ((http_code >= TDB_S3_HTTP_OK && http_code < TDB_S3_HTTP_REDIRECT) ||
+        http_code == TDB_S3_HTTP_NOT_FOUND)
+        return 0;
+    return -1;
+}
+
+/**
+ * s3_exists
+ * check if an S3 object exists and optionally return its size via HEAD request.
+ * only HTTP 200 means "exists" and only HTTP 404 means "absent"; any other status
+ * (403, 5xx, ...) is reported as an error so that a transient or permission failure
+ * is never mistaken for a missing object.
+ * @param ctx opaque S3 connector context
+ * @param key object key
+ * @param size_out if non-NULL, receives the object size in bytes (written only when
+ *                 the object exists)
+ * @return 1 if exists, 0 if not, -1 on error
+ */
+static int s3_exists(void *ctx, const char *key, size_t *size_out)
+{
+    s3_ctx_t *s3 = (s3_ctx_t *)ctx;
+
+    char empty_sha[TDB_S3_HASH_HEX_LEN];
+    sha256_hex("", 0, empty_sha);
+
+    struct curl_slist *headers = s3_sign_request(s3, "HEAD", key, empty_sha, NULL, NULL);
+
+    char url[TDB_S3_MAX_PATH];
+    s3_build_url(s3, key, url, sizeof(url));
+
+    CURL *curl = s3_curl_new(s3);
+    if (!curl)
+    {
+        curl_slist_free_all(headers);
+        return -1;
+    }
+    curl_easy_setopt(curl, CURLOPT_URL, url);
+    curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);
+    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
+    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, s3_write_discard);
+
+    CURLcode res = curl_easy_perform(curl);
+    long http_code = 0;
+    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
+
+    if (size_out && res == CURLE_OK && http_code == TDB_S3_HTTP_OK)
+    {
+        /* libcurl reports -1 when the length is unknown; never let that turn into a
+         * huge size_t value */
+        curl_off_t cl = 0;
+        curl_easy_getinfo(curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, &cl);
+        *size_out = (cl > 0) ? (size_t)cl : 0;
+    }
+
+    curl_slist_free_all(headers);
+    curl_easy_cleanup(curl);
+
+    if (res != CURLE_OK) return -1;
+    if (http_code == TDB_S3_HTTP_OK) return 1;
+    if (http_code == TDB_S3_HTTP_NOT_FOUND) return 0;
+    /* anything else is a real failure and must not be reported as "absent" */
+    return -1;
+}
+
+/**
+ * xml_find_tag
+ * simple XML tag extraction for ListObjectsV2 response parsing.
+ * matches only the exact forms "<tag>" and "</tag>" (no attributes) and returns the
+ * raw text between the first such pair; the text is not NUL terminated and XML
+ * entities are not decoded.
+ * @param xml XML string to search in
+ * @param tag tag name to find (without angle brackets)
+ * @param value_len receives the length of the tag's text content
+ * @return pointer to the start of the tag value, or NULL if not found
+ */
+static const char *xml_find_tag(const char *xml, const char *tag, size_t *value_len)
+{
+    if (!xml || !tag || !value_len) return NULL;
+
+    char open_tag[TDB_S3_XML_TAG_BUF];
+    snprintf(open_tag, sizeof(open_tag), "<%s>", tag);
+    const char *start = strstr(xml, open_tag);
+    if (!start) return NULL;
+    start += strlen(open_tag);
+
+    /* the value ends at the matching closing tag "</tag>" */
+    char close_tag[TDB_S3_XML_TAG_BUF];
+    snprintf(close_tag, sizeof(close_tag), "</%s>", tag);
+    const char *end = strstr(start, close_tag);
+    if (!end) return NULL;
+
+    *value_len = (size_t)(end - start);
+    return start;
+}
+
+/**
+ * s3_response_buf_t
+ * growable buffer for accumulating HTTP response data
+ * @param data heap-allocated buffer holding response bytes
+ * @param size number of bytes currently stored
+ * @param capacity total allocated capacity of data buffer
+ */
+typedef struct
+{
+    char *data;
+    size_t size;
+    size_t capacity;
+} s3_response_buf_t;
+
+/**
+ * s3_write_to_response
+ * curl write callback that appends received data to a growable response buffer
+ * @param ptr pointer to received data
+ * @param size size of each element
+ * @param nmemb number of elements
+ * @param userdata pointer to s3_response_buf_t
+ * @return number of bytes consumed, or 0 on allocation failure
+ */
+static size_t s3_write_to_response(void *ptr, size_t size, size_t nmemb, void *userdata)
+{
+    s3_response_buf_t *buf = (s3_response_buf_t *)userdata;
+    size_t bytes = size * nmemb;
+    if (buf->size + bytes >= buf->capacity)
+    {
+        size_t new_cap = (buf->capacity + bytes) * 2;
+        char *new_data = realloc(buf->data, new_cap);
+        if (!new_data) return 0;
+        buf->data = new_data;
+        buf->capacity = new_cap;
+    }
+    memcpy(buf->data + buf->size, ptr, bytes);
+    buf->size += bytes;
+
buf->data[buf->size] = '\0'; + return bytes; +} + +#ifndef _MSC_VER +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-truncation" +#endif + +/** + * s3_full_key + * build the connector-prefixed object key (prefix + key). + * @param s3 S3 connector context + * @param key caller object key + * @param out output buffer + * @param out_size size of the output buffer + */ +static void s3_full_key(const s3_ctx_t *s3, const char *key, char *out, size_t out_size) +{ + if (s3->prefix[0]) + snprintf(out, out_size, "%s%s", s3->prefix, key); + else + snprintf(out, out_size, "%s", key); +} + +/** + * s3_canonical_uri + * build the SigV4 canonical URI for a full object key -- "/bucket/key" for + * path-style addressing, "/key" for virtual-hosted style. + * @param s3 S3 connector context + * @param full_key prefixed object key + * @param out output buffer + * @param out_size size of the output buffer + */ +static void s3_canonical_uri(const s3_ctx_t *s3, const char *full_key, char *out, size_t out_size) +{ + if (s3->use_path_style) + snprintf(out, out_size, "/%s/%s", s3->bucket, full_key); + else + snprintf(out, out_size, "/%s", full_key); +} + +/** + * s3_header_ctx_t + * context for the multipart ETag response-header capture callback. + * @param etag receives the part ETag value (quotes included, as returned) + * @param found set to 1 once an ETag header has been captured + */ +typedef struct +{ + char etag[TDB_S3_ETAG_MAX]; + int found; +} s3_header_ctx_t; + +/** + * s3_capture_etag_header + * curl header callback that captures the ETag response header of an + * UploadPart request. header field names are case-insensitive per RFC 7230. 
+ * @param buffer header line bytes (not NUL terminated) + * @param size size of each element + * @param nitems number of elements + * @param userdata pointer to s3_header_ctx_t + * @return number of bytes consumed (must equal size * nitems) + */ +static size_t s3_capture_etag_header(char *buffer, size_t size, size_t nitems, void *userdata) +{ + s3_header_ctx_t *h = (s3_header_ctx_t *)userdata; + size_t len = size * nitems; + if (len >= 5) + { + char name[6]; + for (int i = 0; i < 5; i++) name[i] = (char)tolower((unsigned char)buffer[i]); + name[5] = '\0'; + if (strcmp(name, "etag:") == 0) + { + const char *v = buffer + 5; + size_t vlen = len - 5; + while (vlen > 0 && (*v == ' ' || *v == '\t')) + { + v++; + vlen--; + } + while (vlen > 0 && (v[vlen - 1] == '\r' || v[vlen - 1] == '\n' || v[vlen - 1] == ' ')) + vlen--; + if (vlen >= sizeof(h->etag)) vlen = sizeof(h->etag) - 1; + memcpy(h->etag, v, vlen); + h->etag[vlen] = '\0'; + h->found = 1; + } + } + return len; +} + +/** + * s3_multipart_create + * issue CreateMultipartUpload (POST ?uploads) and parse the upload + * id out of the XML response. 
+ * @param s3 S3 connector context + * @param key object key + * @param upload_id_out receives the upload id + * @param upload_id_size size of the upload id buffer + * @return 0 on success, -1 on error + */ +static int s3_multipart_create(s3_ctx_t *s3, const char *key, char *upload_id_out, + size_t upload_id_size) +{ + char empty_sha[TDB_S3_HASH_HEX_LEN]; + sha256_hex("", 0, empty_sha); + + char full_key[TDB_S3_MAX_PATH]; + s3_full_key(s3, key, full_key, sizeof(full_key)); + char canonical_uri[TDB_S3_MAX_PATH + 512]; + s3_canonical_uri(s3, full_key, canonical_uri, sizeof(canonical_uri)); + + struct curl_slist *headers = + s3_sign_raw(s3, "POST", canonical_uri, "uploads=", empty_sha, NULL, NULL); + + char url[TDB_S3_MAX_PATH]; + s3_build_url(s3, key, url, sizeof(url)); + char full_url[TDB_S3_MAX_PATH + 16]; + snprintf(full_url, sizeof(full_url), "%s?uploads", url); + + s3_response_buf_t resp = { + .data = malloc(TDB_S3_RESPONSE_INIT), .size = 0, .capacity = TDB_S3_RESPONSE_INIT}; + if (!resp.data) + { + curl_slist_free_all(headers); + return -1; + } + + CURL *curl = s3_curl_new(s3); + if (!curl) + { + free(resp.data); + curl_slist_free_all(headers); + return -1; + } + curl_easy_setopt(curl, CURLOPT_URL, full_url); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, ""); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, 0L); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, s3_write_to_response); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp); + + CURLcode res = curl_easy_perform(curl); + long http_code = 0; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); + curl_slist_free_all(headers); + curl_easy_cleanup(curl); + + int rc = -1; + if (res == CURLE_OK && http_code == TDB_S3_HTTP_OK) + { + size_t id_len = 0; + const char *id = xml_find_tag(resp.data, "UploadId", &id_len); + if (id && id_len > 0 && id_len < upload_id_size) + { + memcpy(upload_id_out, id, id_len); 
+ upload_id_out[id_len] = '\0'; + rc = 0; + } + } + free(resp.data); + return rc; +} + +/** + * s3_upload_part + * upload one part of a multipart upload (PUT ?partNumber=N&uploadId=I) + * and capture the part ETag from the response. the part body is small enough + * to hash, so each part keeps end-to-end integrity via x-amz-content-sha256. + * @param s3 S3 connector context + * @param key object key + * @param upload_id multipart upload id + * @param part_number 1-based part number + * @param part_data part bytes + * @param part_len number of part bytes + * @param etag_out receives the part ETag + * @param etag_size size of the ETag buffer + * @return 0 on success, -1 on error + */ +static int s3_upload_part(s3_ctx_t *s3, const char *key, const char *upload_id, int part_number, + const void *part_data, size_t part_len, char *etag_out, size_t etag_size) +{ + char part_sha[TDB_S3_HASH_HEX_LEN]; + sha256_hex(part_data, part_len, part_sha); + + char enc_id[TDB_S3_UPLOAD_ID_MAX * 4]; + s3_uri_encode(upload_id, enc_id, sizeof(enc_id)); + + char canonical_qs[TDB_S3_UPLOAD_ID_MAX * 4 + 64]; + snprintf(canonical_qs, sizeof(canonical_qs), "partNumber=%d&uploadId=%s", part_number, enc_id); + + char full_key[TDB_S3_MAX_PATH]; + s3_full_key(s3, key, full_key, sizeof(full_key)); + char canonical_uri[TDB_S3_MAX_PATH + 512]; + s3_canonical_uri(s3, full_key, canonical_uri, sizeof(canonical_uri)); + + struct curl_slist *headers = + s3_sign_raw(s3, "PUT", canonical_uri, canonical_qs, part_sha, NULL, NULL); + + char url[TDB_S3_MAX_PATH]; + s3_build_url(s3, key, url, sizeof(url)); + char full_url[TDB_S3_MAX_PATH + TDB_S3_UPLOAD_ID_MAX * 4 + 64]; + snprintf(full_url, sizeof(full_url), "%s?partNumber=%d&uploadId=%s", url, part_number, enc_id); + + FILE *mem_fp = tdb_fmemopen((void *)part_data, part_len, "rb"); + if (!mem_fp) + { + curl_slist_free_all(headers); + return -1; + } + + s3_header_ctx_t hctx = {.found = 0}; + CURL *curl = s3_curl_new(s3); + if (!curl) + { + fclose(mem_fp); + 
curl_slist_free_all(headers); + return -1; + } + curl_easy_setopt(curl, CURLOPT_URL, full_url); + curl_easy_setopt(curl, CURLOPT_UPLOAD, 1L); + curl_easy_setopt(curl, CURLOPT_INFILESIZE_LARGE, (curl_off_t)part_len); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_READDATA, mem_fp); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, s3_write_discard); + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, s3_capture_etag_header); + curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hctx); + + CURLcode res = curl_easy_perform(curl); + long http_code = 0; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); + fclose(mem_fp); + curl_slist_free_all(headers); + curl_easy_cleanup(curl); + + if (res != CURLE_OK || http_code != TDB_S3_HTTP_OK || !hctx.found) return -1; + if (strlen(hctx.etag) >= etag_size) return -1; + snprintf(etag_out, etag_size, "%s", hctx.etag); + return 0; +} + +/** + * s3_multipart_complete + * issue CompleteMultipartUpload with the XML manifest of part numbers and + * ETags. S3 can return HTTP 200 with an body on failure, so the + * response payload is inspected, not only the status code. 
+ * @param s3 S3 connector context + * @param key object key + * @param upload_id multipart upload id + * @param etags packed part ETags, TDB_S3_ETAG_MAX bytes per entry + * @param part_count number of parts + * @return 0 on success, -1 on error + */ +static int s3_multipart_complete(s3_ctx_t *s3, const char *key, const char *upload_id, + const char *etags, int part_count) +{ + size_t body_cap = (size_t)part_count * (TDB_S3_ETAG_MAX + 64) + 64; + char *body = malloc(body_cap); + if (!body) return -1; + + size_t off = 0; + off += (size_t)snprintf(body + off, body_cap - off, ""); + for (int i = 0; i < part_count; i++) + { + off += (size_t)snprintf(body + off, body_cap - off, + "%d%s", i + 1, + etags + (size_t)i * TDB_S3_ETAG_MAX); + } + off += (size_t)snprintf(body + off, body_cap - off, ""); + + char body_sha[TDB_S3_HASH_HEX_LEN]; + sha256_hex(body, off, body_sha); + + char enc_id[TDB_S3_UPLOAD_ID_MAX * 4]; + s3_uri_encode(upload_id, enc_id, sizeof(enc_id)); + char canonical_qs[TDB_S3_UPLOAD_ID_MAX * 4 + 32]; + snprintf(canonical_qs, sizeof(canonical_qs), "uploadId=%s", enc_id); + + char full_key[TDB_S3_MAX_PATH]; + s3_full_key(s3, key, full_key, sizeof(full_key)); + char canonical_uri[TDB_S3_MAX_PATH + 512]; + s3_canonical_uri(s3, full_key, canonical_uri, sizeof(canonical_uri)); + + struct curl_slist *headers = + s3_sign_raw(s3, "POST", canonical_uri, canonical_qs, body_sha, NULL, NULL); + + char url[TDB_S3_MAX_PATH]; + s3_build_url(s3, key, url, sizeof(url)); + char full_url[TDB_S3_MAX_PATH + TDB_S3_UPLOAD_ID_MAX * 4 + 32]; + snprintf(full_url, sizeof(full_url), "%s?uploadId=%s", url, enc_id); + + s3_response_buf_t resp = { + .data = malloc(TDB_S3_RESPONSE_INIT), .size = 0, .capacity = TDB_S3_RESPONSE_INIT}; + if (!resp.data) + { + free(body); + curl_slist_free_all(headers); + return -1; + } + + CURL *curl = s3_curl_new(s3); + if (!curl) + { + free(body); + free(resp.data); + curl_slist_free_all(headers); + return -1; + } + curl_easy_setopt(curl, CURLOPT_URL, 
full_url); + curl_easy_setopt(curl, CURLOPT_POST, 1L); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, (long)off); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, s3_write_to_response); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp); + + CURLcode res = curl_easy_perform(curl); + long http_code = 0; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); + curl_slist_free_all(headers); + curl_easy_cleanup(curl); + free(body); + + /* CompleteMultipartUpload can return HTTP 200 with an body, so the + * success result element must be present and no error element present */ + int rc = -1; + if (res == CURLE_OK && http_code == TDB_S3_HTTP_OK && resp.data && + strstr(resp.data, "= TDB_S3_HTTP_OK && http_code < TDB_S3_HTTP_REDIRECT) + ? 0 + : -1; +} + +/** + * s3_put_multipart + * upload a large object as a multipart upload -- create, stream fixed-size + * parts from the file, then complete. on any failure the upload is aborted + * so no orphaned parts remain. only one part is held in memory at a time, so + * memory use is bounded regardless of file size. 
+ * @param s3 S3 connector context + * @param key object key + * @param fp open file positioned at offset 0 + * @param file_size size of the file in bytes + * @return 0 on success, -1 on error + */ +static int s3_put_multipart(s3_ctx_t *s3, const char *key, FILE *fp, long file_size) +{ + const size_t part_size = s3->multipart_part_size; /* resolved to a default at create time */ + + long parts_needed = (long)(((size_t)file_size + part_size - 1) / part_size); + if (parts_needed < 1) parts_needed = 1; + if (parts_needed > TDB_S3_MAX_PARTS) return -1; /* file too large for the part size */ + + char upload_id[TDB_S3_UPLOAD_ID_MAX]; + if (s3_multipart_create(s3, key, upload_id, sizeof(upload_id)) != 0) return -1; + + char *part_buf = malloc(part_size); + char *etags = malloc((size_t)parts_needed * TDB_S3_ETAG_MAX); + if (!part_buf || !etags) + { + free(part_buf); + free(etags); + s3_multipart_abort(s3, key, upload_id); + return -1; + } + + int part_count = 0; + int failed = 0; + for (;;) + { + size_t got = fread(part_buf, 1, part_size, fp); + if (got == 0) + { + if (ferror(fp)) failed = 1; + break; + } + if (part_count >= parts_needed) + { + failed = 1; /* file grew underneath us */ + break; + } + if (s3_upload_part(s3, key, upload_id, part_count + 1, part_buf, got, + etags + (size_t)part_count * TDB_S3_ETAG_MAX, TDB_S3_ETAG_MAX) != 0) + { + failed = 1; + break; + } + part_count++; + if (got < part_size) break; /* short read -- last part */ + } + + int rc = -1; + if (!failed && part_count > 0) + { + rc = s3_multipart_complete(s3, key, upload_id, etags, part_count); + } + if (rc != 0) s3_multipart_abort(s3, key, upload_id); + + free(part_buf); + free(etags); + return rc; +} + +/** + * s3_put + * upload a local file to S3 as an object. files below the multipart + * threshold use a single streaming PUT; files at or above it use a + * multipart upload, so the connector never buffers a whole large file in + * memory and is not bound by the 5 GiB single-PUT limit. 
+ * @param ctx opaque S3 connector context + * @param key object key (path-like) + * @param local_path path to the local file to upload + * @return 0 on success, -1 on error + */ +static int s3_put(void *ctx, const char *key, const char *local_path) +{ + s3_ctx_t *s3 = (s3_ctx_t *)ctx; + + FILE *fp = fopen(local_path, "rb"); + if (!fp) return -1; + + if (fseek(fp, 0, SEEK_END) != 0) + { + fclose(fp); + return -1; + } + long file_size = ftell(fp); + if (file_size < 0) + { + fclose(fp); + return -1; + } + rewind(fp); + + int rc; + if ((size_t)file_size >= s3->multipart_threshold) + rc = s3_put_multipart(s3, key, fp, file_size); + else + rc = s3_put_single(s3, key, fp, file_size); + + fclose(fp); + return rc; +} + +#ifndef _MSC_VER +#pragma GCC diagnostic pop +#endif + +/** + * s3_list + * enumerate S3 objects under a key prefix using ListObjectsV2, handling pagination + * @param ctx opaque S3 connector context + * @param prefix key prefix to list (e.g. "cf_name/") + * @param cb callback invoked for each object (key, size, cb_ctx) + * @param cb_ctx opaque context passed to callback + * @return number of objects listed, -1 on error + */ +#ifndef _MSC_VER +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-truncation" +#endif +static int s3_list(void *ctx, const char *prefix, + void (*cb)(const char *key, size_t size, void *cb_ctx), void *cb_ctx) +{ + s3_ctx_t *s3 = (s3_ctx_t *)ctx; + int count = 0; + char continuation_token[TDB_S3_CONT_TOKEN_MAX] = {0}; + + do + { + char empty_sha[TDB_S3_HASH_HEX_LEN]; + sha256_hex("", 0, empty_sha); + + /* we build full prefix with connector prefix */ + char full_prefix[TDB_S3_MAX_PATH]; + if (s3->prefix[0]) + snprintf(full_prefix, sizeof(full_prefix), "%s%s", s3->prefix, prefix); + else + snprintf(full_prefix, sizeof(full_prefix), "%s", prefix); + + /* ListObjectsV2 -- prefix goes in query string, not in the URL path. + * the canonical URI is just / (path-style) or / (virtual-hosted). 
+ * the canonical query string must include all query parameters sorted + * alphabetically with URI-encoded values per the SigV4 spec. */ + char url[TDB_S3_MAX_PATH + TDB_S3_CONT_TOKEN_MAX * 2]; + const char *scheme = s3->use_ssl ? "https" : "http"; + + /* URI-encode prefix and continuation token for query string */ + char encoded_prefix[TDB_S3_MAX_PATH * 3]; + s3_uri_encode(full_prefix, encoded_prefix, sizeof(encoded_prefix)); + + char encoded_token[TDB_S3_CONT_TOKEN_MAX * 3]; + if (continuation_token[0]) + s3_uri_encode(continuation_token, encoded_token, sizeof(encoded_token)); + + /* we build canonical query string (params sorted alphabetically) */ + char canonical_qs[TDB_S3_MAX_PATH * 4]; + if (continuation_token[0]) + snprintf(canonical_qs, sizeof(canonical_qs), + "continuation-token=%s&list-type=2&prefix=%s", encoded_token, encoded_prefix); + else + snprintf(canonical_qs, sizeof(canonical_qs), "list-type=2&prefix=%s", encoded_prefix); + + if (s3->use_path_style) + { + snprintf(url, sizeof(url), "%s://%s/%s?%s", scheme, s3->endpoint, s3->bucket, + canonical_qs); + } + else + { + snprintf(url, sizeof(url), "%s://%s.%s/?%s", scheme, s3->bucket, s3->endpoint, + canonical_qs); + } + + /* we sign with the correct canonical URI (bucket path only, no object prefix) */ + char canonical_uri[TDB_S3_MAX_PATH]; + if (s3->use_path_style) + snprintf(canonical_uri, sizeof(canonical_uri), "/%s", s3->bucket); + else + snprintf(canonical_uri, sizeof(canonical_uri), "/"); + + struct curl_slist *headers = + s3_sign_raw(s3, "GET", canonical_uri, canonical_qs, empty_sha, NULL, NULL); + + s3_response_buf_t resp = { + .data = malloc(TDB_S3_RESPONSE_INIT), .size = 0, .capacity = TDB_S3_RESPONSE_INIT}; + if (!resp.data) + { + curl_slist_free_all(headers); + return count > 0 ? count : -1; + } + + CURL *curl = s3_curl_new(s3); + if (!curl) + { + free(resp.data); + curl_slist_free_all(headers); + return count > 0 ? 
count : -1; + } + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, s3_write_to_response); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp); + + CURLcode res = curl_easy_perform(curl); + long http_code = 0; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); + + curl_slist_free_all(headers); + curl_easy_cleanup(curl); + + if (res != CURLE_OK || http_code != TDB_S3_HTTP_OK) + { + free(resp.data); + return count > 0 ? count : -1; + } + + /* we parse XML response for and tags within */ + const char *pos = resp.data; + while ((pos = strstr(pos, "")) != NULL) + { + const char *end = strstr(pos, ""); + if (!end) break; + + size_t key_len = 0, size_len = 0; + const char *key_val = xml_find_tag(pos, "Key", &key_len); + const char *size_val = xml_find_tag(pos, "Size", &size_len); + + if (key_val && key_len > 0) + { + char key_buf[TDB_S3_MAX_PATH]; + size_t copy_len = key_len < sizeof(key_buf) - 1 ? key_len : sizeof(key_buf) - 1; + memcpy(key_buf, key_val, copy_len); + key_buf[copy_len] = '\0'; + + /* we strip the connector prefix to get relative key */ + const char *relative = key_buf; + if (s3->prefix[0] && strncmp(relative, s3->prefix, strlen(s3->prefix)) == 0) + { + relative += strlen(s3->prefix); + } + + size_t obj_size = 0; + if (size_val && size_len > 0) + { + char size_buf[TDB_S3_SIZE_BUF]; + size_t sl = size_len < sizeof(size_buf) - 1 ? 
size_len : sizeof(size_buf) - 1; + memcpy(size_buf, size_val, sl); + size_buf[sl] = '\0'; + obj_size = (size_t)strtoull(size_buf, NULL, 10); + } + + cb(relative, obj_size, cb_ctx); + count++; + } + + pos = end + 1; + } + + /* we check for truncation (pagination) */ + continuation_token[0] = '\0'; + size_t ct_len = 0; + const char *ct = xml_find_tag(resp.data, "NextContinuationToken", &ct_len); + if (ct && ct_len > 0 && ct_len < TDB_S3_CONT_TOKEN_MAX) + { + memcpy(continuation_token, ct, ct_len); + continuation_token[ct_len] = '\0'; + } + + /* we check IsTruncated */ + size_t trunc_len = 0; + const char *trunc = xml_find_tag(resp.data, "IsTruncated", &trunc_len); + int is_truncated = (trunc && trunc_len == 4 && memcmp(trunc, "true", 4) == 0); + + free(resp.data); + + if (!is_truncated) break; + + } while (1); + + return count; +} +#ifndef _MSC_VER +#pragma GCC diagnostic pop +#endif + +/** + * s3_destroy + * free S3 connector resources + * @param ctx opaque S3 connector context to free + */ +static void s3_destroy(void *ctx) +{ + free(ctx); +} + +tidesdb_objstore_t *tidesdb_objstore_s3_create_config(const tidesdb_objstore_s3_config_t *config) +{ + if (!config || !config->endpoint || !config->bucket || !config->access_key || + !config->secret_key) + return NULL; + + curl_global_init(CURL_GLOBAL_DEFAULT); + + s3_ctx_t *s3 = calloc(1, sizeof(s3_ctx_t)); + if (!s3) return NULL; + + snprintf(s3->endpoint, sizeof(s3->endpoint), "%s", config->endpoint); + snprintf(s3->bucket, sizeof(s3->bucket), "%s", config->bucket); + if (config->prefix) snprintf(s3->prefix, sizeof(s3->prefix), "%s", config->prefix); + snprintf(s3->access_key, sizeof(s3->access_key), "%s", config->access_key); + snprintf(s3->secret_key, sizeof(s3->secret_key), "%s", config->secret_key); + snprintf(s3->region, sizeof(s3->region), "%s", + config->region ? 
config->region : TDB_S3_DEFAULT_REGION); + s3->use_ssl = config->use_ssl; + s3->use_path_style = config->use_path_style; + + /* TLS: copy a custom CA bundle path if given; the secure default (empty path + + * skip_verify 0) leaves libcurl verifying peer+host against the system CA bundle. */ + if (config->tls_ca_path) + snprintf(s3->tls_ca_path, sizeof(s3->tls_ca_path), "%s", config->tls_ca_path); + s3->tls_insecure_skip_verify = config->tls_insecure_skip_verify; + + /* multipart: honor the caller's tuning; a zero value falls back to the documented defaults */ + s3->multipart_threshold = + config->multipart_threshold ? config->multipart_threshold : TDB_S3_MULTIPART_THRESHOLD; + s3->multipart_part_size = + config->multipart_part_size ? config->multipart_part_size : TDB_S3_MULTIPART_PART_SIZE; + + tidesdb_objstore_t *store = calloc(1, sizeof(tidesdb_objstore_t)); + if (!store) + { + free(s3); + return NULL; + } + + store->backend = TDB_BACKEND_S3; + store->put = s3_put; + store->get = s3_get; + store->range_get = s3_range_get; + store->delete_object = s3_delete_object; + store->exists = s3_exists; + store->list = s3_list; + store->destroy = s3_destroy; + store->ctx = s3; + + return store; +} + +tidesdb_objstore_t *tidesdb_objstore_s3_create(const char *endpoint, const char *bucket, + const char *prefix, const char *access_key, + const char *secret_key, const char *region, + int use_ssl, int use_path_style) +{ + /* thin wrapper preserving the original signature, secure TLS defaults (verify peer+host + * against the system CA bundle) and default multipart tuning -- identical behavior to + * before this entry point existed.
*/ + const tidesdb_objstore_s3_config_t config = {.endpoint = endpoint, + .bucket = bucket, + .prefix = prefix, + .access_key = access_key, + .secret_key = secret_key, + .region = region, + .use_ssl = use_ssl, + .use_path_style = use_path_style, + .tls_ca_path = NULL, + .tls_insecure_skip_verify = 0, + .multipart_threshold = 0, + .multipart_part_size = 0}; + return tidesdb_objstore_s3_create_config(&config); +} + +#endif /* TIDESDB_WITH_S3 */ diff --git a/storage/tidesdb/libtidesdb/src/objstore_s3.h b/storage/tidesdb/libtidesdb/src/objstore_s3.h new file mode 100644 index 0000000000000..7a053091d6f5d --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/objstore_s3.h @@ -0,0 +1,89 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __OBJSTORE_S3_H__ +#define __OBJSTORE_S3_H__ + +#include "objstore.h" + +/** + * tidesdb_objstore_s3_create + * create an S3-compatible object store connector. + * works with AWS S3, MinIO, etc. + * + * @param endpoint S3 endpoint (e.g. "s3.amazonaws.com" or "minio.local:9000") + * @param bucket bucket name + * @param prefix key prefix (e.g. "production/db1/"), can be NULL + * @param access_key AWS access key ID + * @param secret_key AWS secret access key + * @param region AWS region (e.g. 
"us-east-1"), NULL for MinIO + * @param use_ssl 1 for HTTPS, 0 for HTTP + * @param use_path_style 1 for path-style URLs (MinIO), 0 for virtual-hosted (AWS) + * @return connector handle, or NULL on error + */ +tidesdb_objstore_t *tidesdb_objstore_s3_create(const char *endpoint, const char *bucket, + const char *prefix, const char *access_key, + const char *secret_key, const char *region, + int use_ssl, int use_path_style); + +/** + * tidesdb_objstore_s3_config_t + * full configuration for an S3 connector, including TLS and multipart tuning that the + * positional tidesdb_objstore_s3_create cannot express. zero-initialize and set the fields + * you need the all-zero defaults are secure (TLS verify on, no custom CA) and use the + * built-in multipart sizes. + * @param endpoint S3 endpoint (required) + * @param bucket bucket name (required) + * @param prefix key prefix, or NULL + * @param access_key AWS access key ID (required) + * @param secret_key AWS secret access key (required) + * @param region AWS region, or NULL for the default + * @param use_ssl 1 for HTTPS, 0 for HTTP + * @param use_path_style 1 for path-style URLs (MinIO), 0 for virtual-hosted (AWS) + * @param tls_ca_path custom CA bundle file path, or NULL for the system bundle + * @param tls_insecure_skip_verify 1 disables TLS peer+host verification (test endpoints + * ONLY -- insecure); 0 keeps verification on (default) + * @param multipart_threshold object size at/above which multipart upload is used; 0 = default + * @param multipart_part_size multipart chunk size in bytes; 0 = default + */ +typedef struct +{ + const char *endpoint; + const char *bucket; + const char *prefix; + const char *access_key; + const char *secret_key; + const char *region; + int use_ssl; + int use_path_style; + const char *tls_ca_path; + int tls_insecure_skip_verify; + size_t multipart_threshold; + size_t multipart_part_size; +} tidesdb_objstore_s3_config_t; + +/** + * tidesdb_objstore_s3_create_config + * create an S3-compatible 
connector from a full configuration struct (TLS + multipart). + * tidesdb_objstore_s3_create is a thin wrapper over this with secure/default settings. + * @param config connector configuration (fields are copied; need not outlive the call) + * @return connector handle, or NULL on error + */ +tidesdb_objstore_t *tidesdb_objstore_s3_create_config(const tidesdb_objstore_s3_config_t *config); + +#endif /* __OBJSTORE_S3_H__ */ diff --git a/storage/tidesdb/libtidesdb/src/queue.c b/storage/tidesdb/libtidesdb/src/queue.c new file mode 100644 index 0000000000000..45820a441dac2 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/queue.c @@ -0,0 +1,656 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "queue.h" + +#include "compat.h" + +#define QUEUE_LIKELY(x) TDB_LIKELY(x) +#define QUEUE_UNLIKELY(x) TDB_UNLIKELY(x) + +#define QUEUE_WAIT_TIMEOUT_NS 100000000 /* 100ms in nanoseconds */ +#define QUEUE_NS_PER_SEC 1000000000 /* nanoseconds per second */ + +/** + * queue_alloc_node + * allocate a node from pool or heap + * pool access is protected by pool_lock for thread safety + * @param queue the queue to allocate the node from + * @return the allocated node, or NULL on failure + */ +static inline queue_node_t *queue_alloc_node(queue_t *queue) +{ + if (QUEUE_UNLIKELY(atomic_load_explicit(&queue->pool_size, memory_order_relaxed) == 0)) + { + return (queue_node_t *)malloc(sizeof(queue_node_t)); + } + + pthread_mutex_lock(&queue->pool_lock); + + /* we check pool first (common case) */ + if (QUEUE_LIKELY(queue->node_pool != NULL)) + { + queue_node_t *node = queue->node_pool; + queue->node_pool = atomic_load_explicit(&node->next, memory_order_relaxed); + /* load+store avoids lock-prefixed instruction; mutex provides ordering */ + const size_t ps = atomic_load_explicit(&queue->pool_size, memory_order_relaxed); + atomic_store_explicit(&queue->pool_size, ps - 1, memory_order_relaxed); + pthread_mutex_unlock(&queue->pool_lock); + return node; + } + + pthread_mutex_unlock(&queue->pool_lock); + + /* pool empty, allocate from heap */ + return (queue_node_t *)malloc(sizeof(queue_node_t)); +} + +/** + * queue_free_node + * return node to pool or free it + * pool access is protected by pool_lock for thread safety + * @param queue the queue to return the node to + * @param node the node to return + */ +static inline void queue_free_node(queue_t *queue, queue_node_t *node) +{ + /* speculative lock-free check -- skip mutex when pool is full + * racy read is safe -- worst case we free when pool had room */ + if (QUEUE_UNLIKELY(atomic_load_explicit(&queue->pool_size, memory_order_relaxed) >= + queue->max_pool_size)) + { + free(node); + return; + } + + 
pthread_mutex_lock(&queue->pool_lock); + + const size_t ps = atomic_load_explicit(&queue->pool_size, memory_order_relaxed); + if (QUEUE_LIKELY(ps < queue->max_pool_size)) + { + /* return to pool */ + atomic_store_explicit(&node->next, queue->node_pool, memory_order_relaxed); + queue->node_pool = node; + /* load+store avoids lock-prefixed instruction; mutex provides ordering */ + atomic_store_explicit(&queue->pool_size, ps + 1, memory_order_relaxed); + pthread_mutex_unlock(&queue->pool_lock); + return; + } + + pthread_mutex_unlock(&queue->pool_lock); + + /* pool full, actually free */ + free(node); +} + +queue_t *queue_new(void) +{ + queue_t *queue = (queue_t *)malloc(sizeof(queue_t)); + if (queue == NULL) return NULL; + + /* we create a dummy node to separate head and tail + * this allows enqueue and dequeue to operate independently */ + queue_node_t *dummy = (queue_node_t *)malloc(sizeof(queue_node_t)); + if (dummy == NULL) + { + free(queue); + return NULL; + } + dummy->data = NULL; + atomic_store_explicit(&dummy->next, NULL, memory_order_relaxed); + + queue->head = dummy; + queue->tail = dummy; + queue->dummy = dummy; + atomic_store_explicit(&queue->size, 0, memory_order_relaxed); + atomic_store_explicit(&queue->shutdown, 0, memory_order_relaxed); + atomic_store_explicit(&queue->waiter_count, 0, memory_order_relaxed); + queue->node_pool = NULL; + atomic_store_explicit(&queue->pool_size, 0, memory_order_relaxed); + queue->max_pool_size = QUEUE_MAX_POOL_SIZE; + + if (pthread_mutex_init(&queue->head_lock, NULL) != 0) + { + free(dummy); + free(queue); + return NULL; + } + + if (pthread_mutex_init(&queue->tail_lock, NULL) != 0) + { + pthread_mutex_destroy(&queue->head_lock); + free(dummy); + free(queue); + return NULL; + } + + if (pthread_mutex_init(&queue->pool_lock, NULL) != 0) + { + pthread_mutex_destroy(&queue->tail_lock); + pthread_mutex_destroy(&queue->head_lock); + free(dummy); + free(queue); + return NULL; + } + + if (pthread_rwlock_init(&queue->read_lock, 
NULL) != 0) + { + pthread_mutex_destroy(&queue->pool_lock); + pthread_mutex_destroy(&queue->tail_lock); + pthread_mutex_destroy(&queue->head_lock); + free(dummy); + free(queue); + return NULL; + } + + if (pthread_cond_init(&queue->not_empty, NULL) != 0) + { + pthread_rwlock_destroy(&queue->read_lock); + pthread_mutex_destroy(&queue->pool_lock); + pthread_mutex_destroy(&queue->tail_lock); + pthread_mutex_destroy(&queue->head_lock); + free(dummy); + free(queue); + return NULL; + } + + return queue; +} + +int queue_enqueue(queue_t *queue, void *data) +{ + if (QUEUE_UNLIKELY(queue == NULL)) return -1; + + queue_node_t *node = queue_alloc_node(queue); + if (QUEUE_UNLIKELY(node == NULL)) + { + return -1; + } + + node->data = data; + atomic_store_explicit(&node->next, NULL, memory_order_relaxed); + + /* we only lock tail for enqueue -- the head operations are independent */ + pthread_mutex_lock(&queue->tail_lock); + + /* bump size BEFORE publishing the node. if we published first, a concurrent + * dequeue could observe the node and decrement size before this increment, + * transiently underflowing the unsigned counter to SIZE_MAX. incrementing + * first means size can only briefly over-count (node not yet visible), which + * is the safe direction for an approximate counter. */ + atomic_fetch_add_explicit(&queue->size, 1, memory_order_release); + + /* release publish -- node->data and node->next above must be visible to + * any consumer that acquire-loads this next pointer under head_lock */ + atomic_store_explicit(&queue->tail->next, node, memory_order_release); + queue->tail = node; + + const int has_waiters = atomic_load_explicit(&queue->waiter_count, memory_order_acquire) > 0; + + pthread_mutex_unlock(&queue->tail_lock); + + /* signal one waiter per enqueued item whenever any thread is blocked. 
signaling + * only on the empty->non-empty transition loses wakeups under multiple waiters: + * a burst of items past the first leaves later waiters asleep until the 100ms + * poll, serializing what should be a parallel wakeup. we signal outside the + * tail_lock to keep its hold time short. */ + if (has_waiters) + { + pthread_mutex_lock(&queue->head_lock); + pthread_cond_signal(&queue->not_empty); + pthread_mutex_unlock(&queue->head_lock); + } + + return 0; +} + +/** + * queue_dequeue_internal + * internal helper for dequeue logic (head_lock must be held) + * uses dummy node technique for lock-free separation of head and tail + * @param queue the queue + * @return pointer to dequeued data, NULL if queue is empty + */ +static inline void *queue_dequeue_internal(queue_t *queue) +{ + queue_node_t *old_head = queue->head; + /* acquire consume -- pairs with the release publish in queue_enqueue so the + * dequeued node's data is visible despite head_lock != tail_lock */ + queue_node_t *new_head = atomic_load_explicit(&old_head->next, memory_order_acquire); + + /* if next is NULL, queue is empty */ + if (QUEUE_UNLIKELY(new_head == NULL)) + { + return NULL; + } + + /* we advance head to next node (which becomes new dummy) */ + void *data = new_head->data; + new_head->data = NULL; /* clear data since this node becomes the new dummy */ + queue->head = new_head; + + atomic_fetch_sub_explicit(&queue->size, 1, memory_order_relaxed); + + /* return old dummy node to pool */ + queue_free_node(queue, old_head); + + return data; +} + +void *queue_dequeue(queue_t *queue) +{ + if (QUEUE_UNLIKELY(queue == NULL)) return NULL; + + pthread_rwlock_wrlock(&queue->read_lock); + pthread_mutex_lock(&queue->head_lock); + void *data = queue_dequeue_internal(queue); + pthread_mutex_unlock(&queue->head_lock); + pthread_rwlock_unlock(&queue->read_lock); + + return data; +} + +void *queue_dequeue_wait(queue_t *queue) +{ + if (QUEUE_UNLIKELY(queue == NULL)) return NULL; + + /* we spin briefly before 
blocking to avoid syscall overhead */ + for (int i = 0; i < QUEUE_SPIN_COUNT; i++) + { + if (atomic_load_explicit(&queue->size, memory_order_acquire) > 0) + { + pthread_rwlock_wrlock(&queue->read_lock); + pthread_mutex_lock(&queue->head_lock); + void *data = queue_dequeue_internal(queue); + pthread_mutex_unlock(&queue->head_lock); + pthread_rwlock_unlock(&queue->read_lock); + if (data != NULL) + { + return data; + } + } + cpu_pause(); + } + + /* we fall back to blocking wait */ + pthread_mutex_lock(&queue->head_lock); + + atomic_fetch_add_explicit(&queue->waiter_count, 1, memory_order_relaxed); + + while (atomic_load_explicit(&queue->head->next, memory_order_acquire) == NULL && + !atomic_load_explicit(&queue->shutdown, memory_order_acquire)) + { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_nsec += QUEUE_WAIT_TIMEOUT_NS; + if (ts.tv_nsec >= QUEUE_NS_PER_SEC) + { + ts.tv_sec += 1; + ts.tv_nsec -= QUEUE_NS_PER_SEC; + } + pthread_cond_timedwait(&queue->not_empty, &queue->head_lock, &ts); + } + + const int remaining_waiters = + atomic_fetch_sub_explicit(&queue->waiter_count, 1, memory_order_relaxed) - 1; + + /* we broadcast when last waiter exits to wake queue_free if waiting */ + if (remaining_waiters == 0) + { + pthread_cond_broadcast(&queue->not_empty); + } + + /* if shutdown and no data, return NULL */ + if (QUEUE_UNLIKELY(atomic_load_explicit(&queue->shutdown, memory_order_acquire) && + atomic_load_explicit(&queue->head->next, memory_order_acquire) == NULL)) + { + pthread_mutex_unlock(&queue->head_lock); + return NULL; + } + + /*** we acquire write lock to coordinate with readers, then dequeue + ** we must re-check for data after re-acquiring locks since another thread + * could have stolen the item while we released head_lock */ + while (1) + { + pthread_mutex_unlock(&queue->head_lock); + pthread_rwlock_wrlock(&queue->read_lock); + pthread_mutex_lock(&queue->head_lock); + + /** we check if data is still available */ + if 
(atomic_load_explicit(&queue->head->next, memory_order_acquire) != NULL) + { + void *data = queue_dequeue_internal(queue); + pthread_mutex_unlock(&queue->head_lock); + pthread_rwlock_unlock(&queue->read_lock); + return data; + } + + /* data was stolen! release locks and wait again */ + pthread_rwlock_unlock(&queue->read_lock); + + if (atomic_load_explicit(&queue->shutdown, memory_order_acquire)) + { + pthread_mutex_unlock(&queue->head_lock); + return NULL; + } + + /* we increment waiter count and wait for more data */ + atomic_fetch_add_explicit(&queue->waiter_count, 1, memory_order_relaxed); + + while (atomic_load_explicit(&queue->head->next, memory_order_acquire) == NULL && + !atomic_load_explicit(&queue->shutdown, memory_order_acquire)) + { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_nsec += QUEUE_WAIT_TIMEOUT_NS; + if (ts.tv_nsec >= QUEUE_NS_PER_SEC) + { + ts.tv_sec += 1; + ts.tv_nsec -= QUEUE_NS_PER_SEC; + } + pthread_cond_timedwait(&queue->not_empty, &queue->head_lock, &ts); + } + + atomic_fetch_sub_explicit(&queue->waiter_count, 1, memory_order_relaxed); + + /* we check for shutdown after waking */ + if (atomic_load_explicit(&queue->shutdown, memory_order_acquire) && + atomic_load_explicit(&queue->head->next, memory_order_acquire) == NULL) + { + pthread_mutex_unlock(&queue->head_lock); + return NULL; + } + } +} + +void *queue_peek(queue_t *queue) +{ + if (QUEUE_UNLIKELY(queue == NULL)) return NULL; + + pthread_rwlock_rdlock(&queue->read_lock); + + void *data = NULL; + /* with dummy node, actual data is in head->next */ + queue_node_t *first = atomic_load_explicit(&queue->head->next, memory_order_acquire); + if (QUEUE_LIKELY(first != NULL)) + { + data = first->data; + } + + pthread_rwlock_unlock(&queue->read_lock); + + return data; +} + +size_t queue_size(queue_t *queue) +{ + if (queue == NULL) return 0; + + return atomic_load_explicit(&queue->size, memory_order_relaxed); +} + +int queue_is_empty(queue_t *queue) +{ + if (queue == NULL) 
return -1; + + return (atomic_load_explicit(&queue->size, memory_order_relaxed) == 0) ? 1 : 0; +} + +int queue_clear(queue_t *queue) +{ + if (QUEUE_UNLIKELY(queue == NULL)) return -1; + + /* we take the rwlock in write mode first, then the head and tail mutexes, for exclusive access */ + pthread_rwlock_wrlock(&queue->read_lock); + pthread_mutex_lock(&queue->head_lock); + pthread_mutex_lock(&queue->tail_lock); + + /* we recycle every node after the dummy via queue_free_node (data pointers are left untouched) -- exclusive locks held, so relaxed */ + queue_node_t *current = atomic_load_explicit(&queue->head->next, memory_order_relaxed); + while (current != NULL) + { + queue_node_t *next = atomic_load_explicit(&current->next, memory_order_relaxed); + queue_free_node(queue, current); + current = next; + } + + /* we reset to empty state with just the dummy */ + atomic_store_explicit(&queue->head->next, NULL, memory_order_relaxed); + queue->tail = queue->head; + atomic_store_explicit(&queue->size, 0, memory_order_relaxed); + + pthread_mutex_unlock(&queue->tail_lock); + pthread_mutex_unlock(&queue->head_lock); + pthread_rwlock_unlock(&queue->read_lock); + + return 0; +} + +size_t queue_remove_if(queue_t *queue, int (*predicate)(void *data, void *context), void *context, + void (*on_remove)(void *data, void *context)) +{ + if (QUEUE_UNLIKELY(queue == NULL || predicate == NULL)) return 0; + + pthread_rwlock_wrlock(&queue->read_lock); + pthread_mutex_lock(&queue->head_lock); + pthread_mutex_lock(&queue->tail_lock); + + size_t removed = 0; + queue_node_t *prev = queue->head; /* dummy sentinel */ + /* exclusive locks held (rwlock-wr + head + tail), so relaxed throughout */ + queue_node_t *cur = atomic_load_explicit(&queue->head->next, memory_order_relaxed); + while (cur != NULL) + { + queue_node_t *cur_next = atomic_load_explicit(&cur->next, memory_order_relaxed); + if (predicate(cur->data, context)) + { + queue_node_t *victim = cur; + atomic_store_explicit(&prev->next, cur_next, memory_order_relaxed); + if (queue->tail == cur) queue->tail = prev; + cur =
cur_next; + + if (on_remove) on_remove(victim->data, context); + queue_free_node(queue, victim); + removed++; + } + else + { + prev = cur; + cur = cur_next; + } + } + + if (removed > 0) + { + const size_t prior = atomic_load_explicit(&queue->size, memory_order_relaxed); + const size_t next_size = (prior > removed) ? (prior - removed) : 0; + atomic_store_explicit(&queue->size, next_size, memory_order_relaxed); + } + + pthread_mutex_unlock(&queue->tail_lock); + pthread_mutex_unlock(&queue->head_lock); + pthread_rwlock_unlock(&queue->read_lock); + + return removed; +} + +int queue_foreach(queue_t *queue, void (*fn)(void *data, void *context), void *context) +{ + if (QUEUE_UNLIKELY(queue == NULL)) return -1; + if (QUEUE_UNLIKELY(fn == NULL)) return -1; + + pthread_rwlock_rdlock(&queue->read_lock); + + int count = 0; + const queue_node_t *current = atomic_load_explicit(&queue->head->next, memory_order_acquire); + while (QUEUE_LIKELY(current != NULL)) + { + queue_node_t *next = atomic_load_explicit(&current->next, memory_order_acquire); + if (QUEUE_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + } + fn(current->data, context); /* callback is invoked while the read lock is held */ + count++; + current = next; + } + + pthread_rwlock_unlock(&queue->read_lock); + + return count; +} + +void *queue_peek_at(queue_t *queue, const size_t index) +{ + if (QUEUE_UNLIKELY(!queue)) return NULL; + + if (index >= atomic_load_explicit(&queue->size, memory_order_relaxed)) + { + return NULL; + } + + pthread_rwlock_rdlock(&queue->read_lock); + + /* with dummy node, actual data starts at head->next */ + const queue_node_t *current = atomic_load_explicit(&queue->head->next, memory_order_acquire); + for (size_t i = 0; i < index && QUEUE_LIKELY(current != NULL); i++) + { + queue_node_t *next = atomic_load_explicit(&current->next, memory_order_acquire); + /* we prefetch next node to overlap memory latency with loop iteration */ + if (QUEUE_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + } + current = next; + } + + void *data = QUEUE_LIKELY(current !=
NULL) ? current->data : NULL; + + pthread_rwlock_unlock(&queue->read_lock); + + return data; +} + +size_t queue_snapshot(queue_t *queue, void **out, const size_t max_items) +{ + if (QUEUE_UNLIKELY(!queue || max_items == 0)) return 0; + /* out is indexed below; keep its null-check as its own plain statement so + * static analysis carries the non-null fact into the loop */ + if (out == NULL) return 0; + + pthread_rwlock_rdlock(&queue->read_lock); + + size_t count = 0; + const queue_node_t *current = atomic_load_explicit(&queue->head->next, memory_order_acquire); + while (QUEUE_LIKELY(current != NULL) && count < max_items) + { + out[count++] = current->data; + current = atomic_load_explicit(&current->next, memory_order_acquire); + } + + pthread_rwlock_unlock(&queue->read_lock); + + return count; +} + +void queue_shutdown(queue_t *queue) +{ + if (queue == NULL) return; + + /* we set shutdown flag and wake all waiting threads */ + atomic_store_explicit(&queue->shutdown, 1, memory_order_release); + + pthread_mutex_lock(&queue->head_lock); + pthread_cond_broadcast(&queue->not_empty); + pthread_mutex_unlock(&queue->head_lock); +} + +void queue_free(queue_t *queue) +{ + queue_free_with_data(queue, NULL); /* NULL free_fn: node data pointers are not freed */ +} + +void queue_free_with_data(queue_t *queue, void (*free_fn)(void *)) +{ + if (queue == NULL) return; + + /* we set shutdown flag and wake all waiting threads */ + atomic_store_explicit(&queue->shutdown, 1, memory_order_release); + + pthread_mutex_lock(&queue->head_lock); + pthread_cond_broadcast(&queue->not_empty); + + /* we wait for all waiting threads to exit before destroying primitives + * we use timed wait to handle BSD platforms where signals can be missed */ + while (atomic_load_explicit(&queue->waiter_count, memory_order_acquire) > 0) + { + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_nsec += QUEUE_WAIT_TIMEOUT_NS; + if (ts.tv_nsec >= QUEUE_NS_PER_SEC) + { + ts.tv_sec += 1; + ts.tv_nsec -= QUEUE_NS_PER_SEC; + } +
pthread_cond_timedwait(&queue->not_empty, &queue->head_lock, &ts); + } + + pthread_mutex_lock(&queue->tail_lock); + + /* we free all nodes including the dummy, freeing user data when free_fn is provided */ + queue_node_t *current = queue->head; + while (current != NULL) + { + queue_node_t *next = atomic_load_explicit(&current->next, memory_order_relaxed); + if (free_fn != NULL && current->data != NULL) + { + free_fn(current->data); + } + free(current); + current = next; + } + + pthread_mutex_lock(&queue->pool_lock); + current = queue->node_pool; + while (current != NULL) + { + queue_node_t *next = atomic_load_explicit(&current->next, memory_order_relaxed); + free(current); + current = next; + } + queue->node_pool = NULL; + pthread_mutex_unlock(&queue->pool_lock); + + queue->head = NULL; + queue->tail = NULL; + queue->dummy = NULL; + atomic_store_explicit(&queue->size, 0, memory_order_relaxed); + + pthread_mutex_unlock(&queue->tail_lock); + pthread_mutex_unlock(&queue->head_lock); + + pthread_mutex_destroy(&queue->pool_lock); + pthread_rwlock_destroy(&queue->read_lock); + pthread_mutex_destroy(&queue->tail_lock); + pthread_mutex_destroy(&queue->head_lock); + pthread_cond_destroy(&queue->not_empty); + + free(queue); +} \ No newline at end of file diff --git a/storage/tidesdb/libtidesdb/src/queue.h b/storage/tidesdb/libtidesdb/src/queue.h new file mode 100644 index 0000000000000..9e4a71364287e --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/queue.h @@ -0,0 +1,214 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __QUEUE_H__ +#define __QUEUE_H__ +#include "compat.h" + +/* node pool configuration */ +#define QUEUE_MAX_POOL_SIZE 64 + +/* spin count before blocking in dequeue_wait */ +#define QUEUE_SPIN_COUNT 100 + +/** + * queue_node_t + * internal node structure for the queue + * @param data pointer to user data + * @param next pointer to next node, published with release and consumed with + * acquire -- this is the only happens-before edge across the separate + * head_lock / tail_lock, so a node's payload stays visible to consumers + */ +typedef struct queue_node_t +{ + void *data; + _Atomic(struct queue_node_t *) next; +} queue_node_t; + +/** + * queue_t + * thread-safe FIFO queue implementation with node pooling + * uses separate head and tail locks to reduce contention + * @param head pointer to first node (protected by head_lock) + * @param tail pointer to last node (protected by tail_lock) + * @param dummy sentinel node separating head and tail for lock independence + * @param size current number of elements (atomic for lock-free reads) + * @param shutdown has queue been shutdown? 
+ * @param waiter_count number of threads currently waiting in queue_dequeue_wait + * @param head_lock mutex for dequeue/write operations on head + * @param tail_lock mutex for enqueue operations + * @param read_lock rwlock for read-only operations (peek, foreach) + * @param not_empty condition variable signaled when queue becomes non-empty + * @param node_pool free list of reusable nodes for performance + * @param pool_size current size of node pool + * @param pool_lock mutex for node pool access + * @param max_pool_size maximum nodes to keep in pool + */ +typedef struct +{ + queue_node_t *head; + queue_node_t *tail; + queue_node_t *dummy; + _Atomic(size_t) size; + _Atomic(int) shutdown; + _Atomic(int) waiter_count; + pthread_mutex_t head_lock; + pthread_mutex_t tail_lock; + pthread_rwlock_t read_lock; + pthread_cond_t not_empty; + queue_node_t *node_pool; + _Atomic(size_t) pool_size; + pthread_mutex_t pool_lock; + size_t max_pool_size; +} queue_t; + +/** + * queue_new + * create a new queue + * @return pointer to new queue, NULL on failure + */ +queue_t *queue_new(void); + +/** + * queue_enqueue + * add an item to the back of the queue + * @param queue the queue + * @param data pointer to data to enqueue + * @return 0 on success, -1 on failure + */ +int queue_enqueue(queue_t *queue, void *data); + +/** + * queue_dequeue + * remove and return item from front of queue + * @param queue the queue + * @return pointer to dequeued data, NULL if queue is empty + */ +void *queue_dequeue(queue_t *queue); + +/** + * queue_dequeue_wait + * remove and return item from front of queue, blocking until available + * @param queue the queue + * @return pointer to dequeued data, NULL if queue is destroyed or on error + */ +void *queue_dequeue_wait(queue_t *queue); + +/** + * queue_peek + * view item at front of queue without removing it + * @param queue the queue + * @return pointer to front data, NULL if queue is empty + */ +void *queue_peek(queue_t *queue); + +/** + * queue_size + 
* get current number of items in queue + * @param queue the queue + * @return number of items, 0 if queue is NULL or empty + */ +size_t queue_size(queue_t *queue); + +/** + * queue_is_empty + * check if queue is empty + * @param queue the queue + * @return 1 if empty, 0 if not empty, -1 on error + */ +int queue_is_empty(queue_t *queue); + +/** + * queue_clear + * remove all items from queue without freeing the data + * @param queue the queue + * @return 0 on success, -1 on error + */ +int queue_clear(queue_t *queue); + +/** + * queue_foreach + * iterate over all items in the queue and call function for each + * does not remove items from queue + * @param queue the queue + * @param fn callback function called for each item (receives data pointer and user context) + * @param context user-provided context passed to callback function + * @return number of items processed, -1 on error + */ +int queue_foreach(queue_t *queue, void (*fn)(void *data, void *context), void *context); + +/** + * queue_peek_at + * peek at item at specific index without removing it + * index 0 is head (oldest), index size-1 is tail (newest) + * @param queue the queue + * @param index the index to peek at + * @return pointer to data at index, NULL if index out of bounds or error + */ +void *queue_peek_at(queue_t *queue, size_t index); + +/** + * queue_snapshot + * copy all data pointers into a caller-provided array in a single O(n) traversal. + * acquires read lock once, avoiding the O(n^2) cost of repeated queue_peek_at calls. + * @param queue the queue + * @param out array to fill (must have room for at least max_items elements) + * @param max_items maximum number of items to copy + * @return number of items actually copied + */ +size_t queue_snapshot(queue_t *queue, void **out, size_t max_items); + +/** + * queue_remove_if + * remove every item where predicate(data, context) returns non-zero. 
acquires the same + * wrlock + head_lock + tail_lock combination as queue_clear so dequeuers and enqueuers + * are blocked for the duration. on_remove is invoked for each removed item before its + * node is recycled, giving the caller a hook to decrement counters or free the data. + * @param queue the queue + * @param predicate returns non-zero for items to remove + * @param context user-provided context passed to predicate and on_remove + * @param on_remove optional callback invoked per removed item (NULL to skip) + * @return number of items removed + */ +size_t queue_remove_if(queue_t *queue, int (*predicate)(void *data, void *context), void *context, + void (*on_remove)(void *data, void *context)); + +/** + * queue_shutdown + * signal shutdown to all waiting threads without freeing the queue + * threads blocked in queue_dequeue_wait will return NULL + * @param queue the queue to shutdown + */ +void queue_shutdown(queue_t *queue); + +/** + * queue_free + * free the queue structure (does not free the data pointers) + * @param queue the queue to free + */ +void queue_free(queue_t *queue); + +/** + * queue_free_with_data + * free the queue and all data using provided free function + * @param queue the queue to free + * @param free_fn function to free each data element (can be NULL to skip) + */ +void queue_free_with_data(queue_t *queue, void (*free_fn)(void *)); + +#endif /* __QUEUE_H__ */ \ No newline at end of file diff --git a/storage/tidesdb/libtidesdb/src/skip_list.c b/storage/tidesdb/libtidesdb/src/skip_list.c new file mode 100644 index 0000000000000..b92ab75f4e9e2 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/skip_list.c @@ -0,0 +1,2847 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "skip_list.h" + +/* thread-local cache for arena slot assignment + * each thread caches its slot for one arena at a time + * if the arena changes, we must get a new slot from that arena */ +static _Thread_local skip_list_arena_t *tl_cached_arena = NULL; +static _Thread_local int tl_arena_slot = -1; + +/** + * skip_list_arena_create_block + * creates a new arena block with the given capacity + * @param capacity size in bytes for the block + * @return pointer to block, or NULL on failure + */ +static skip_list_arena_block_t *skip_list_arena_create_block(const size_t capacity) +{ + skip_list_arena_block_t *block = malloc(sizeof(skip_list_arena_block_t)); + if (block == NULL) return NULL; + + block->data = malloc(capacity); + if (block->data == NULL) + { + free(block); + return NULL; + } + + atomic_init(&block->used, 0); + block->capacity = capacity; + block->prev = NULL; + + return block; +} + +/** + * skip_list_arena_register_block + * adds a block to the arena's all_blocks_head list for later destruction + * @param arena the arena + * @param block the block to register + */ +static void skip_list_arena_register_block(skip_list_arena_t *arena, skip_list_arena_block_t *block) +{ + skip_list_arena_block_t *head; + do + { + head = atomic_load_explicit(&arena->all_blocks_head, memory_order_acquire); + block->prev = head; + } while (!atomic_compare_exchange_weak_explicit(&arena->all_blocks_head, &head, block, + memory_order_release, memory_order_acquire)); +} + +/** + * skip_list_arena_create + * creates a new arena with an initial block of 
the given capacity + * @param initial_capacity size in bytes for the first block + * @return pointer to arena, or NULL on failure + */ +static skip_list_arena_t *skip_list_arena_create(const size_t initial_capacity) +{ + skip_list_arena_t *arena = malloc(sizeof(skip_list_arena_t)); + if (arena == NULL) return NULL; + + skip_list_arena_block_t *block = skip_list_arena_create_block(initial_capacity); + if (block == NULL) + { + free(arena); + return NULL; + } + + atomic_init(&arena->current_block, block); + arena->block_size = initial_capacity; + atomic_init(&arena->tl_slot_counter, 0); + atomic_init(&arena->all_blocks_head, block); + + for (int i = 0; i < SKIP_LIST_ARENA_MAX_THREADS; i++) + { + atomic_init(&arena->tl_blocks[i], NULL); + } + + return arena; +} + +/** + * skip_list_arena_get_slot + * gets or assigns a thread-local slot for this thread and arena + * the slot is cached per-thread but invalidated when switching arenas, so a thread + * that comes back to an arena it used earlier is handed a fresh slot + * once every slot has been handed out the counter is no longer advanced + * @param arena the arena + * @return slot index (0 to SKIP_LIST_ARENA_MAX_THREADS-1), or -1 if slots exhausted + */ +static inline int skip_list_arena_get_slot(skip_list_arena_t *arena) +{ + /* fast path -- cached slot for this arena */ + if (SKIP_LIST_LIKELY(tl_cached_arena == arena && tl_arena_slot >= 0)) + { + return tl_arena_slot; + } + + /* slots already exhausted -- bail out before touching the counter. + * without this check every allocation from an overflow thread performs a + * fetch_add, which is a contended RMW on the fallback path and lets the + * counter grow without bound until it wraps and re-issues a slot index that + * another thread already owns (tl_blocks[slot] is bumped without atomics + * contention protection, so a shared slot would corrupt allocations) */ + if (atomic_load_explicit(&arena->tl_slot_counter, memory_order_relaxed) >= + SKIP_LIST_ARENA_MAX_THREADS) + { + return -1; + } + + /* different arena or first allocation -- get a new slot. + * several threads may pass the check above at once, so the result of the + * fetch_add still has to be range-checked */ + int slot = atomic_fetch_add_explicit(&arena->tl_slot_counter, 1, memory_order_relaxed); + if (slot >= SKIP_LIST_ARENA_MAX_THREADS) + { + return -1; + } + + tl_cached_arena = arena; + tl_arena_slot = slot; + return slot; +} + +/** + * skip_list_arena_alloc + * thread-local bump allocation from the arena + * each thread gets its own block -- no atomic contention on the fast path + * only block allocation requires synchronization (rare) + * @param arena the arena + * @param size number of bytes to allocate + * @return pointer to aligned memory, or NULL on failure + */ +static void *skip_list_arena_alloc(skip_list_arena_t *arena, size_t size)
+{ + /* align up to SKIP_LIST_ARENA_ALIGNMENT */ + size = (size + (SKIP_LIST_ARENA_ALIGNMENT - 1)) & ~(size_t)(SKIP_LIST_ARENA_ALIGNMENT - 1); + + int slot = skip_list_arena_get_slot(arena); + + if (SKIP_LIST_LIKELY(slot >= 0)) + { + /* fast path -- thread-local block with no atomic contention */ + skip_list_arena_block_t *block = + atomic_load_explicit(&arena->tl_blocks[slot], memory_order_relaxed); + + if (SKIP_LIST_LIKELY(block != NULL)) + { + /* a thread-local block is owned by exactly one thread (this slot) and its + * `used` is never read by arena destroy, so relaxed is sufficient -- this + * drops two seq_cst fences from the hottest allocation path */ + size_t used = atomic_load_explicit(&block->used, memory_order_relaxed); + if (SKIP_LIST_LIKELY(used + size <= block->capacity)) + { + atomic_store_explicit(&block->used, used + size, memory_order_relaxed); + return block->data + used; + } + } + + /* thread-local block is NULL or full -- allocate a new one + * use smaller blocks for thread-local slots to save memory on multi-threaded systems */ + size_t new_cap = SKIP_LIST_ARENA_TL_BLOCK_SIZE; + if (size > new_cap) new_cap = size; + + skip_list_arena_block_t *new_block = skip_list_arena_create_block(new_cap); + if (new_block == NULL) return NULL; + + atomic_store_explicit(&new_block->used, size, memory_order_relaxed); + atomic_store_explicit(&arena->tl_blocks[slot], new_block, memory_order_relaxed); + skip_list_arena_register_block(arena, new_block); + + return new_block->data; + } + + /* fallback -- too many threads, use shared block with atomic contention */ + while (1) + { + skip_list_arena_block_t *block = + atomic_load_explicit(&arena->current_block, memory_order_acquire); + size_t offset = atomic_fetch_add_explicit(&block->used, size, memory_order_relaxed); + + if (SKIP_LIST_LIKELY(offset + size <= block->capacity)) + { + return block->data + offset; + } + + /* block full -- allocate a new shared block */ + size_t new_cap = arena->block_size; + if (size 
> new_cap) new_cap = size; + + skip_list_arena_block_t *new_block = skip_list_arena_create_block(new_cap); + if (new_block == NULL) return NULL; + + if (!atomic_compare_exchange_strong_explicit(&arena->current_block, &block, new_block, + memory_order_release, memory_order_acquire)) + { + free(new_block->data); + free(new_block); + } + else + { + skip_list_arena_register_block(arena, new_block); + } + } +} + +/** + * skip_list_arena_destroy + * frees the arena and all its blocks + * @param arena the arena to destroy + */ +static void skip_list_arena_destroy(skip_list_arena_t *arena) +{ + if (arena == NULL) return; + + /* free all blocks from the all_blocks_head list */ + skip_list_arena_block_t *block = + atomic_load_explicit(&arena->all_blocks_head, memory_order_relaxed); + while (block != NULL) + { + skip_list_arena_block_t *prev = block->prev; + free(block->data); + free(block); + block = prev; + } + free(arena); +} + +/** + * skip_list_alloc + * allocates memory from the arena if present, otherwise from malloc + * @param list skip list (used to check for arena) + * @param size number of bytes + * @return pointer to memory, or NULL on failure + */ +static inline void *skip_list_alloc(const skip_list_t *list, size_t size) +{ + if (list != NULL && list->arena != NULL) + { + return skip_list_arena_alloc(list->arena, size); + } + return malloc(size); +} + +/** + * skip_list_dealloc + * frees memory -- no-op when arena is active (bulk free on arena destroy) + * @param list skip list (used to check for arena) + * @param ptr pointer to free + */ +static inline void skip_list_dealloc(const skip_list_t *list, void *ptr) +{ + if (list != NULL && list->arena != NULL) return; /* no-op */ + free(ptr); +} + +/** + * skip_list_compare_keys_numeric_inline + * fast inline comparison for 8-byte numeric keys + * @param key1 first key + * @param key2 second key + * @return negative if key1 < key2, 0 if equal, positive if key1 > key2 + */ +static inline int 
skip_list_compare_keys_numeric_inline(const uint8_t *key1, const uint8_t *key2) +{ + uint64_t v1, v2; + memcpy(&v1, key1, sizeof(uint64_t)); + memcpy(&v2, key2, sizeof(uint64_t)); + return (v1 < v2) ? -1 : (v1 > v2); +} + +/* portable byte-swap for lexicographic integer comparison on little-endian. + * memcmp compares bytes left-to-right (big-endian order), so we byte-swap + * before integer comparison to match memcmp semantics on little-endian. */ +#if defined(__GNUC__) || defined(__clang__) +#define SKIP_LIST_BSWAP32(x) __builtin_bswap32(x) +#define SKIP_LIST_BSWAP64(x) __builtin_bswap64(x) +#elif defined(_MSC_VER) +#define SKIP_LIST_BSWAP32(x) _byteswap_ulong(x) +#define SKIP_LIST_BSWAP64(x) _byteswap_uint64(x) +#else +static inline uint32_t SKIP_LIST_BSWAP32(uint32_t x) +{ + return ((x >> 24) & 0xFF) | ((x >> 8) & 0xFF00) | ((x << 8) & 0xFF0000) | + ((x << 24) & 0xFF000000); +} +static inline uint64_t SKIP_LIST_BSWAP64(uint64_t x) +{ + return ((uint64_t)SKIP_LIST_BSWAP32((uint32_t)x) << 32) | + SKIP_LIST_BSWAP32((uint32_t)(x >> 32)); +} +#endif + +/* detect endianness at compile time */ +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define SKIP_LIST_IS_BIG_ENDIAN 1 +#else +#define SKIP_LIST_IS_BIG_ENDIAN 0 +#endif + +/* stack-allocated update array size for the batch/put paths; lists taller than this + * fall back to a heap update array. file-scope so it is defined exactly once. */ +#define SKIP_LIST_STACK_UPDATE_SIZE 64 + +/** + * skip_list_compare_keys_4_inline + * fast inline lexicographic comparison for 4-byte keys + * uses byte-swapped integer comparison to avoid memcmp function call + */ +static inline int skip_list_compare_keys_4_inline(const uint8_t *key1, const uint8_t *key2) +{ + uint32_t a, b; + memcpy(&a, key1, 4); + memcpy(&b, key2, 4); +#if !SKIP_LIST_IS_BIG_ENDIAN + a = SKIP_LIST_BSWAP32(a); + b = SKIP_LIST_BSWAP32(b); +#endif + return (a < b) ? 
-1 : (a > b); +} + +/** + * skip_list_compare_keys_8_inline + * fast inline lexicographic comparison for 8-byte keys + * uses byte-swapped integer comparison to avoid memcmp function call + */ +static inline int skip_list_compare_keys_8_inline(const uint8_t *key1, const uint8_t *key2) +{ + uint64_t a, b; + memcpy(&a, key1, 8); + memcpy(&b, key2, 8); +#if !SKIP_LIST_IS_BIG_ENDIAN + a = SKIP_LIST_BSWAP64(a); + b = SKIP_LIST_BSWAP64(b); +#endif + return (a < b) ? -1 : (a > b); +} + +/** + * skip_list_compare_keys_16_inline + * fast inline lexicographic comparison for 16-byte keys + * compares first 8 bytes with early exit, avoiding second half when keys diverge early + */ +static inline int skip_list_compare_keys_16_inline(const uint8_t *key1, const uint8_t *key2) +{ + uint64_t a, b; + memcpy(&a, key1, 8); + memcpy(&b, key2, 8); +#if !SKIP_LIST_IS_BIG_ENDIAN + a = SKIP_LIST_BSWAP64(a); + b = SKIP_LIST_BSWAP64(b); +#endif + if (a != b) return (a < b) ? -1 : 1; + + memcpy(&a, key1 + 8, 8); + memcpy(&b, key2 + 8, 8); +#if !SKIP_LIST_IS_BIG_ENDIAN + a = SKIP_LIST_BSWAP64(a); + b = SKIP_LIST_BSWAP64(b); +#endif + return (a < b) ? -1 : (a > b); +} + +/** + * skip_list_compare_keys_32_inline + * fast inline lexicographic comparison for 32-byte keys + * compares in 8-byte chunks with early exit + */ +static inline int skip_list_compare_keys_32_inline(const uint8_t *key1, const uint8_t *key2) +{ + for (int i = 0; i < 32; i += 8) + { + uint64_t a, b; + memcpy(&a, key1 + i, 8); + memcpy(&b, key2 + i, 8); +#if !SKIP_LIST_IS_BIG_ENDIAN + a = SKIP_LIST_BSWAP64(a); + b = SKIP_LIST_BSWAP64(b); +#endif + if (a != b) return (a < b) ? 
-1 : 1; + } + return 0; +} + +/** + * skip_list_get_latest_valid_version + * returns the first version in the node's chain, walking from the head, that is + * not reported invalid by skip_list_version_is_invalid_with_time + * a chain holding a single version is answered without entering the loop + * @param node node whose version chain is scanned + * @param current_time current time for TTL validation + * @return latest valid version, or NULL if none + * + * the declaration that follows is a forward declaration of + * skip_list_version_is_invalid_with_time, which this function calls + */ +static inline int skip_list_version_is_invalid_with_time(skip_list_version_t *version, + int64_t current_time); + +static inline skip_list_version_t *skip_list_get_latest_valid_version(skip_list_node_t *node, + const int64_t current_time) +{ + skip_list_version_t *version = atomic_load_explicit(&node->versions, memory_order_acquire); + + if (SKIP_LIST_UNLIKELY(version == NULL)) return NULL; + skip_list_version_t *next = atomic_load_explicit(&version->next, memory_order_relaxed); + if (SKIP_LIST_LIKELY(next == NULL)) + { + if (!skip_list_version_is_invalid_with_time(version, current_time)) + { + return version; + } + return NULL; + } + + while (version != NULL) + { + if (!skip_list_version_is_invalid_with_time(version, current_time)) + { + return version; + } + version = atomic_load_explicit(&version->next, memory_order_acquire); + } + + return NULL; +} + +/** + * skip_list_free_version + * frees a single version + * @param list skip list (used to check for arena) + * @param version version to free + */ +static void skip_list_free_version(const skip_list_t *list, skip_list_version_t *version); + +/** + * skip_list_compare_keys_with_type + * hot-path comparator that accepts cmp_type as a register parameter + * avoids reloading list->cmp_type from memory across function-call barriers (memcmp etc.)
+ * callers in traversal loops should cache list->cmp_type in a local and use this variant + */ +static inline int skip_list_compare_keys_with_type(const skip_list_cmp_type_t cmp_type, + const skip_list_t *list, const uint8_t *key1, + const size_t key1_size, const uint8_t *key2, + const size_t key2_size) +{ + /* fast path for most common case -- memcmp with equal-sized keys */ + if (SKIP_LIST_LIKELY(cmp_type == SKIP_LIST_CMP_MEMCMP)) + { + if (SKIP_LIST_LIKELY(key1_size == key2_size)) + { + /* we use switch for common key sizes to avoid memcmp function call overhead. + * 4/8 byte keys use byte-swapped integer comparison (no function call). + * 16/32 byte keys use chunked comparison with early exit. */ + switch (key1_size) + { + case 4: + return skip_list_compare_keys_4_inline(key1, key2); + case 8: + return skip_list_compare_keys_8_inline(key1, key2); + case 16: + return skip_list_compare_keys_16_inline(key1, key2); + case 32: + return skip_list_compare_keys_32_inline(key1, key2); + default: + { + const int cmp = memcmp(key1, key2, key1_size); + return (cmp == 0) ? 0 : ((cmp < 0) ? 
-1 : 1); + } + } + } + return skip_list_comparator_memcmp(key1, key1_size, key2, key2_size, NULL); + } + + /* slow path for other comparator types */ + switch (cmp_type) + { + case SKIP_LIST_CMP_NUMERIC: /* key sizes are not passed on to the numeric helper */ + return skip_list_compare_keys_numeric_inline(key1, key2); + + case SKIP_LIST_CMP_STRING: + return skip_list_comparator_string(key1, key1_size, key2, key2_size, NULL); + + case SKIP_LIST_CMP_CUSTOM: + default: + return list->comparator(key1, key1_size, key2, key2_size, list->comparator_ctx); + } +} + +/** + * skip_list_get_current_time + * gets current time using cached time if available, otherwise syscall + * @param list skip list (may be NULL) + * @return current time as time_t -- read from list->cached_time when the list has one, + * otherwise from time(NULL) + */ +static inline time_t skip_list_get_current_time(const skip_list_t *list) +{ +#if defined(__MINGW32__) && !defined(__MINGW64__) + /* on MinGW x86, cached time has visibility issues across threads, it seems to be a compiler bug + ******** + */ + (void)list; + return time(NULL); +#else + if (list != NULL && list->cached_time != NULL) + { + return atomic_load_explicit(list->cached_time, memory_order_relaxed); + } + return time(NULL); +#endif +} + +/** + * skip_list_version_is_invalid_with_time + * checks if version is expired or deleted using provided time + * a version counts as expired only when its ttl is positive and strictly less than + * current_time; a ttl of zero or below never expires + * @param version version to check (NULL is reported as invalid) + * @param current_time current time to use for TTL check + * @return 1 if invalid, 0 if valid + */ +static inline int skip_list_version_is_invalid_with_time(skip_list_version_t *version, + const int64_t current_time) +{ + if (version == NULL) return 1; + if (VERSION_IS_DELETED(version)) return 1; + if (version->ttl > 0 && version->ttl < current_time) return 1; + return 0; +} + +/** + * skip_list_validate_sequence + * validates that new sequence number does not duplicate an existing version + * @param existing_version existing version to check against + * @param new_seq new sequence number + * @return 0 if valid (new_seq != existing), -1 if duplicate + */ +static
inline int skip_list_validate_sequence(skip_list_version_t *existing_version, + uint64_t new_seq) +{ + if (existing_version != NULL) + { + uint64_t existing_seq = atomic_load_explicit(&existing_version->seq, memory_order_acquire); + if (new_seq == existing_seq) return -1; + } + return 0; +} + +/** + * skip_list_insert_version_cas + * inserts a new version into a version chain maintaining descending seq order + * handles out-of-order arrivals from concurrent transaction commits by inserting + * at the correct position in the chain rather than only at the head + * @param versions_ptr pointer to atomic version list head + * @param new_version version to insert + * @param seq sequence number (for validation) + * @param list skip list (for total_size update) + * @param value_size size of new value + * @return 0 on success, -1 on failure (duplicate seq) + */ +static int skip_list_insert_version_cas(_Atomic(skip_list_version_t *) *versions_ptr, + skip_list_version_t *new_version, const uint64_t seq, + skip_list_t *list, size_t value_size) +{ + skip_list_version_t *old_head; + while (1) + { + old_head = atomic_load_explicit(versions_ptr, memory_order_acquire); + + if (old_head == NULL || seq > atomic_load_explicit(&old_head->seq, memory_order_acquire)) + { + /* normal case -- new version is newest, prepend at head */ + atomic_store_explicit(&new_version->next, old_head, memory_order_relaxed); + if (atomic_compare_exchange_weak_explicit(versions_ptr, &old_head, new_version, + memory_order_release, memory_order_acquire)) + { + /* head prepend succeeded -- update total_size, subtract old head, add new */ + if (old_head && old_head->value_size > 0) + { + atomic_fetch_sub_explicit(&list->total_size, old_head->value_size, + memory_order_relaxed); + } + atomic_fetch_add_explicit(&list->total_size, value_size, memory_order_relaxed); + return 0; + } + /* CAS failed, retry from top */ + continue; + } + + uint64_t head_seq = atomic_load_explicit(&old_head->seq, memory_order_acquire); 
+ if (seq == head_seq) + { + /* duplicate sequence -- reject */ + skip_list_free_version(list, new_version); + return -1; + } + + /* out-of-order arrival -- walk chain to find correct insertion point + * chain is descending by seq, so find first node where next->seq < seq + * then insert between current and next. + * for out-of-order inserts we cannot use head CAS, so we retry from the top + * if the head changed. we insert by splicing into the chain. */ + skip_list_version_t *prev = old_head; + skip_list_version_t *curr = atomic_load_explicit(&prev->next, memory_order_acquire); + + while (curr != NULL) + { + uint64_t curr_seq = atomic_load_explicit(&curr->seq, memory_order_acquire); + if (seq == curr_seq) + { + /* duplicate in chain */ + skip_list_free_version(list, new_version); + return -1; + } + if (seq > curr_seq) + { + break; /* insert between prev and curr */ + } + prev = curr; + curr = atomic_load_explicit(&prev->next, memory_order_acquire); + } + + /* splice new_version between prev and curr */ + atomic_store_explicit(&new_version->next, curr, memory_order_relaxed); + skip_list_version_t *expected_curr = curr; + if (!atomic_compare_exchange_strong_explicit(&prev->next, &expected_curr, new_version, + memory_order_release, memory_order_acquire)) + { + /* chain was modified concurrently, retry from top */ + continue; + } + + /* successfully inserted in middle/tail -- we update total_size */ + atomic_fetch_add_explicit(&list->total_size, value_size, memory_order_relaxed); + return 0; + } +} + +int skip_list_comparator_memcmp(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx) +{ + (void)ctx; + size_t min_size = key1_size < key2_size ? key1_size : key2_size; + const int cmp = memcmp(key1, key2, min_size); + if (cmp != 0) return cmp < 0 ? -1 : 1; + return (key1_size < key2_size) ? -1 : (key1_size > key2_size) ? 
1 : 0; +} + +int skip_list_comparator_string(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx) +{ + (void)ctx; + /* length-bounded compare: keys are byte buffers, not guaranteed NUL-terminated. + * strcmp here would read past the buffer on a non-terminated key. memcmp over the + * shorter length plus a length tie-break gives the same order as strcmp for + * well-formed C-string keys while staying in bounds. + * the result is normalized to -1, 0 or 1. */ + const size_t min_size = key1_size < key2_size ? key1_size : key2_size; + const int cmp = memcmp(key1, key2, min_size); + if (cmp != 0) return cmp < 0 ? -1 : 1; + if (key1_size < key2_size) return -1; + if (key1_size > key2_size) return 1; + return 0; +} + +int skip_list_comparator_numeric(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx) +{ /* both size arguments are ignored: exactly sizeof(uint64_t) bytes are copied out of + * each key and compared as unsigned 64-bit integers in host byte order, so every key + * handed to this comparator must be at least 8 bytes long. + * returns -1, 0 or 1. */ + (void)key1_size; + (void)key2_size; + (void)ctx; + uint64_t val1, val2; + memcpy(&val1, key1, sizeof(uint64_t)); + memcpy(&val2, key2, sizeof(uint64_t)); + if (val1 < val2) return -1; + if (val1 > val2) return 1; + return 0; +} + +/** + * skip_list_create_version + * creates a new version for a key + * @param list skip list (for arena allocation) + * @param value value data + * @param value_size size of value + * @param ttl time-to-live + * @param flags version flags (bitmask of SKIP_LIST_FLAG_*) + * @param seq sequence number for MVCC + * @return pointer to new version, NULL on failure + */ +static skip_list_version_t *skip_list_create_version(const skip_list_t *list, const uint8_t *value, + const size_t value_size, const int64_t ttl, + const uint8_t flags, uint64_t seq) +{ + /* we combine version struct + value data into a single allocation + * this halves malloc calls and improves cache locality */ + const size_t alloc_size = + sizeof(skip_list_version_t) + ((value != NULL && value_size > 0) ?
value_size : 0); + skip_list_version_t *version = (skip_list_version_t *)skip_list_alloc(list, alloc_size); + if (version == NULL) return NULL; + + if (value != NULL && value_size > 0) + { + version->value = (uint8_t *)(version + 1); /* value follows struct in same allocation */ + memcpy(version->value, value, value_size); + version->value_size = value_size; + } + else + { + version->value = NULL; + version->value_size = 0; + } + + atomic_init(&version->flags, flags); + atomic_init(&version->seq, seq); + version->ttl = ttl; + atomic_init(&version->next, NULL); + return version; +} + +/** + * skip_list_free_version + * frees a single version + * @param list skip list (for arena deallocation) + * @param version version to free + */ +static void skip_list_free_version(const skip_list_t *list, skip_list_version_t *version) +{ + if (version == NULL) return; + /* value is embedded in same allocation as version struct -- single free */ + skip_list_dealloc(list, version); +} + +/** + * skip_list_free_version_list + * frees a linked list of versions + * @param list skip list (for arena deallocation) + * @param head head of version list + */ +static void skip_list_free_version_list(const skip_list_t *list, skip_list_version_t *head) +{ + while (head != NULL) + { + skip_list_version_t *next = atomic_load_explicit(&head->next, memory_order_acquire); + skip_list_free_version(list, head); + head = next; + } +} + +/** + * skip_list_create_sentinel + * creates a sentinel node (header or tail) + * @param level level of the node + * @return pointer to new sentinel node, NULL on failure + */ +static skip_list_node_t *skip_list_create_sentinel(const int level) +{ + size_t pointers_size = (level + 1) * 2 * sizeof(_Atomic(skip_list_node_t *)); + skip_list_node_t *node = (skip_list_node_t *)malloc(sizeof(skip_list_node_t) + pointers_size); + if (node == NULL) return NULL; + + node->key = NULL; + node->key_size = 0; + node->level = (uint8_t)level; + node->node_flags = 
SKIP_LIST_NODE_FLAG_SENTINEL; + atomic_init(&node->versions, NULL); + + for (int i = 0; i <= level; i++) + { + atomic_init(&node->forward[i], NULL); + atomic_init(&BACKWARD_PTR(node, i, level), NULL); + } + + return node; +} + +skip_list_node_t *skip_list_create_node(const int level, const uint8_t *key, size_t key_size, + const uint8_t *value, const size_t value_size, + const int64_t ttl, const uint8_t flags) +{ + if (key == NULL || key_size == 0) return NULL; + + /* we combine node struct + forward/backward pointers + key into a single allocation + * this eliminates one malloc per node and co-locates key data for cache locality */ + size_t pointers_size = (level + 1) * 2 * sizeof(_Atomic(skip_list_node_t *)); + skip_list_node_t *node = + (skip_list_node_t *)malloc(sizeof(skip_list_node_t) + pointers_size + key_size); + if (node == NULL) return NULL; + + node->key = (uint8_t *)node + sizeof(skip_list_node_t) + pointers_size; + memcpy(node->key, key, key_size); + node->key_size = key_size; + node->level = (uint8_t)level; + node->node_flags = 0; /* not a sentinel */ + + const int is_tombstone = (flags & SKIP_LIST_FLAG_DELETED) != 0; + skip_list_version_t *initial_version = NULL; + if (value != NULL || is_tombstone) + { + initial_version = skip_list_create_version(NULL, value, value_size, ttl, flags, 0); + if (initial_version == NULL) + { + /* for non-tombstones, version creation failure is fatal + * for tombstones, NULL version is acceptable */ + if (!is_tombstone) + { + free(node); + return NULL; + } + } + } + atomic_init(&node->versions, initial_version); + + for (int i = 0; i <= level; i++) + { + atomic_init(&node->forward[i], NULL); + atomic_init(&BACKWARD_PTR(node, i, level), NULL); + } + + return node; +} + +/** + * skip_list_free_node_internal + * arena-aware node free -- simply no-op when arena is active + */ +static int skip_list_free_node_internal(const skip_list_t *list, skip_list_node_t *node) +{ + if (node == NULL) return -1; + skip_list_version_t 
*versions = atomic_load_explicit(&node->versions, memory_order_acquire); + skip_list_free_version_list(list, versions); + /* key is embedded in same allocation as node -- single free */ + skip_list_dealloc(list, node); + return 0; +} + +/** + * skip_list_free_node + * frees a node and its whole version chain with free() + * unlike skip_list_free_node_internal this never consults an arena + * @param node node to free + * @return 0 on success, -1 if node is NULL + */ +int skip_list_free_node(skip_list_node_t *node) +{ + if (node == NULL) return -1; + skip_list_version_t *versions = atomic_load_explicit(&node->versions, memory_order_acquire); + + while (versions != NULL) + { + skip_list_version_t *next = atomic_load_explicit(&versions->next, memory_order_acquire); + free(versions); + versions = next; + } + free(node); + return 0; +} + +/** + * skip_list_new + * creates a list that orders keys with skip_list_comparator_memcmp + * @return 0 on success, -1 on failure + */ +int skip_list_new(skip_list_t **list, const int max_level, const float probability) +{ + return skip_list_new_with_comparator(list, max_level, probability, skip_list_comparator_memcmp, + NULL); +} + +/** + * skip_list_new_with_comparator + * creates a list with a caller-chosen comparator and no cached time source + * @return 0 on success, -1 on failure + */ +int skip_list_new_with_comparator(skip_list_t **list, int max_level, float probability, + skip_list_comparator_fn comparator, void *comparator_ctx) +{ + return skip_list_new_with_comparator_and_cached_time(list, max_level, probability, comparator, + comparator_ctx, NULL); +} + +/** + * skip_list_new_with_comparator_and_cached_time + * creates a list with a caller-chosen comparator and an optional cached time source + * @param list receives the new list on success + * @param max_level highest level a node may have, must be greater than 0 + * @param probability level promotion probability, must lie strictly between 0 and 1 + * @param comparator key comparator, must not be NULL + * @param comparator_ctx opaque context handed to a custom comparator + * @param cached_time optional shared clock, NULL to use time(NULL) + * @return 0 on success, -1 on invalid arguments or allocation failure + */ +int skip_list_new_with_comparator_and_cached_time(skip_list_t **list, const int max_level, + const float probability, + skip_list_comparator_fn comparator, + void *comparator_ctx, + _Atomic(time_t) *cached_time) +{ + /* a NULL comparator would be classified as SKIP_LIST_CMP_CUSTOM below and then called + * through list->comparator on the first key comparison, so reject it up front */ + if (list == NULL || comparator == NULL || max_level <= 0 || probability <= 0.0f || + probability >= 1.0f) + return -1; + + skip_list_t *new_list = (skip_list_t *)malloc(sizeof(skip_list_t)); + if (new_list == NULL) return -1; + + atomic_init(&new_list->level, 0); + new_list->max_level = max_level; + new_list->probability = probability; + + /* we determine the comparator type so hot paths can dispatch without an indirect call */ + if (comparator == skip_list_comparator_memcmp) + { + new_list->cmp_type = SKIP_LIST_CMP_MEMCMP; + } + else if (comparator == skip_list_comparator_string) + { + new_list->cmp_type = SKIP_LIST_CMP_STRING; + } + else if (comparator == skip_list_comparator_numeric) + { + new_list->cmp_type = SKIP_LIST_CMP_NUMERIC; + } + else + {
new_list->cmp_type = SKIP_LIST_CMP_CUSTOM; + } + + new_list->comparator = comparator; + new_list->comparator_ctx = comparator_ctx; + new_list->cached_time = cached_time; + new_list->arena = NULL; + + if (cached_time != NULL) + { + atomic_store_explicit(cached_time, tdb_get_current_time(), memory_order_seq_cst); + } + + atomic_init(&new_list->total_size, 0); + atomic_init(&new_list->entry_count, 0); + + /* we create sentinel nodes with no keys -- they are identified by the sentinel flag */ + skip_list_node_t *header = skip_list_create_sentinel(max_level); + skip_list_node_t *tail = skip_list_create_sentinel(max_level); + + if (header == NULL || tail == NULL) + { + if (header) skip_list_free_node(header); + if (tail) skip_list_free_node(tail); + free(new_list); + return -1; + } + + for (int i = 0; i <= max_level; i++) + { + atomic_store_explicit(&header->forward[i], tail, memory_order_relaxed); + atomic_store_explicit(&BACKWARD_PTR(tail, i, max_level), header, memory_order_relaxed); + } + + atomic_init(&new_list->header, header); + atomic_init(&new_list->tail, tail); + + *list = new_list; + return 0; +} + +int skip_list_new_with_arena(skip_list_t **list, const int max_level, const float probability, + skip_list_comparator_fn comparator, void *comparator_ctx, + _Atomic(time_t) *cached_time, const size_t arena_initial_capacity) +{ + if (arena_initial_capacity == 0) + { + return skip_list_new_with_comparator_and_cached_time( + list, max_level, probability, comparator, comparator_ctx, cached_time); + } + + int rc = skip_list_new_with_comparator_and_cached_time(list, max_level, probability, comparator, + comparator_ctx, cached_time); + if (rc != 0) return rc; + + (*list)->arena = skip_list_arena_create(arena_initial_capacity); + if ((*list)->arena == NULL) + { + skip_list_free(*list); + *list = NULL; + return -1; + } + + return 0; +} + +/** + * skip_list_xorshift64star + * fast thread-local RNG for skip list level selection using xorshift64* algorithm + * @param state 
pointer to thread-local RNG state; must be non-zero, a zero state stays zero + * @return pseudo-random 64-bit value + */ +static inline uint64_t skip_list_xorshift64star(uint64_t *state) +{ + uint64_t x = *state; + x ^= x >> 12; + x ^= x << 25; + x ^= x >> 27; + *state = x; + return x * 0x2545F4914F6CDD1DULL; +} + +int skip_list_random_level(const skip_list_t *list) +{ /* returns a level in [0, list->max_level], or -1 when list is NULL */ + if (list == NULL) return -1; + + /* thread-local RNG state */ + static _Thread_local uint64_t rng_state = 0; + if (SKIP_LIST_UNLIKELY(rng_state == 0)) + { + /** we init with thread ID + address entropy for uniqueness + * avoids time() syscall on hot path */ + rng_state = (uint64_t)TDB_THREAD_ID() ^ ((uintptr_t)&rng_state >> 3); + if (rng_state == 0) rng_state = 1; /* ensure non-zero */ + } + + /* geometric level distribution for the configured probability where we promote a level + * while a fresh uniform draw stays below p. averages ~1/(1-p) draws (~1.33 at + * p=0.25), each a cheap xorshift + compare. the loop also stops at max_level. */ + const double p = (double)list->probability; + int level = 0; + while (level < list->max_level) + { + const uint64_t rnd = skip_list_xorshift64star(&rng_state); + /* top 53 bits -> uniform double in [0, 1); the divisor is 2^53 */ + const double u = (double)(rnd >> 11) * (1.0 / 9007199254740992.0); + if (u >= p) break; + level++; + } + + return level; +} + +int skip_list_compare_keys(const skip_list_t *list, const uint8_t *key1, size_t key1_size, + const uint8_t *key2, size_t key2_size) +{ /* always goes through list->comparator, whatever the list's cmp_type is. + * a NULL list or NULL key yields 0, the same value as an equal comparison. */ + if (list == NULL || key1 == NULL || key2 == NULL) return 0; + return list->comparator(key1, key1_size, key2, key2_size, list->comparator_ctx); +} + +int skip_list_check_and_update_ttl(const skip_list_t *list, skip_list_node_t *node) +{ /* read-only despite the name: nothing on the node or list is modified. + * returns 1 when the head version has a positive ttl that is at or before the + * current time, 0 otherwise, and -1 for a NULL node. + * NOTE(review): expiry is inclusive here (ttl <= now) whereas + * skip_list_version_is_invalid_with_time uses a strict ttl < now -- confirm which + * boundary is intended. */ + if (node == NULL) return -1; + skip_list_version_t *version = atomic_load_explicit(&node->versions, memory_order_acquire); + if (version != NULL && version->ttl > 0 && version->ttl <= skip_list_get_current_time(list)) + { + return 1; + } + return 0; +} + +int skip_list_get(skip_list_t *list, const uint8_t *key, const size_t key_size,
uint8_t **value, + size_t *value_size, int64_t *ttl, uint8_t *deleted) +{ + if (list == NULL || key == NULL || key_size == 0 || value == NULL || value_size == NULL) + return -1; + + skip_list_node_t *header = atomic_load_explicit(&list->header, memory_order_acquire); + skip_list_node_t *current = header; + const int max_level = + atomic_load_explicit(&list->level, memory_order_acquire); /* cache level */ + const skip_list_cmp_type_t cmp_type = list->cmp_type; + + /* we track if we found exact match at level 0 to avoid redundant comparison */ + int found_exact = 0; + skip_list_node_t *candidate = NULL; + + /* we search from top level down with prefetching + * use relaxed loads during traversal, acquire only at level 0 for final target + * prefetch fires before sentinel check so cache line is warming during condition eval */ + /* on x86 (TSO), relaxed and acquire loads compile identically. + * we use acquire uniformly to avoid a per-iteration branch */ + for (int i = max_level; i >= 0; i--) + { + skip_list_node_t *next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + + /* we prefetch before touching any fields -- this gives memory subsystem head start */ + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + + /* non-sentinel nodes always have key != NULL, so sentinel check is sufficient */ + while (SKIP_LIST_LIKELY(next != NULL && !NODE_IS_SENTINEL(next))) + { + const int cmp = skip_list_compare_keys_with_type(cmp_type, list, next->key, + next->key_size, key, key_size); + if (cmp > 0) break; + if (cmp == 0) + { + /* exact match found -- at level 0 we can skip final comparison */ + if (i == 0) + { + found_exact = 1; + candidate = next; + } + break; + } + current = next; + next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + + /* prefetch immediately after loading pointer, before next iteration's sentinel check */ + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + 
PREFETCH_READ(next->key); + } + } + } + + skip_list_node_t *target; + if (found_exact) + { + target = candidate; + } + else + { + target = atomic_load_explicit(¤t->forward[0], memory_order_acquire); + if (SKIP_LIST_UNLIKELY(target == NULL || NODE_IS_SENTINEL(target) || target->key == NULL)) + return -1; + + const int cmp = skip_list_compare_keys_with_type(cmp_type, list, target->key, + target->key_size, key, key_size); + if (SKIP_LIST_UNLIKELY(cmp != 0)) return -1; + } + + skip_list_version_t *head_version = + atomic_load_explicit(&target->versions, memory_order_acquire); + if (head_version == NULL) return -1; + + const int64_t current_time = skip_list_get_current_time(list); + int head_invalid = skip_list_version_is_invalid_with_time(head_version, current_time); + + if (head_invalid && VERSION_IS_DELETED(head_version)) + { + if (ttl != NULL) *ttl = head_version->ttl; + if (deleted != NULL) *deleted = 1; + *value = NULL; + *value_size = 0; + return 0; + } + + skip_list_version_t *version = + head_invalid ? 
skip_list_get_latest_valid_version(target, current_time) : head_version; + + if (version == NULL) + { + if (deleted != NULL) *deleted = 1; + if (ttl != NULL) *ttl = -1; + *value = NULL; + *value_size = 0; + return 0; + } + + if (ttl != NULL) *ttl = version->ttl; + if (deleted != NULL) *deleted = 0; + + if (version->value_size > 0 && version->value != NULL) + { + *value = (uint8_t *)malloc(version->value_size); + if (*value == NULL) return -1; + memcpy(*value, version->value, version->value_size); + *value_size = version->value_size; + } + else + { + *value = NULL; + *value_size = 0; + } + return 0; +} + +int skip_list_get_ref(skip_list_t *list, const uint8_t *key, const size_t key_size, + const uint8_t **value, size_t *value_size, int64_t *ttl, uint8_t *deleted) +{ + if (list == NULL || key == NULL || key_size == 0 || value == NULL || value_size == NULL) + return -1; + + skip_list_node_t *header = atomic_load_explicit(&list->header, memory_order_acquire); + skip_list_node_t *current = header; + const int max_level = atomic_load_explicit(&list->level, memory_order_acquire); + const skip_list_cmp_type_t cmp_type = list->cmp_type; + + int found_exact = 0; + skip_list_node_t *candidate = NULL; + + for (int i = max_level; i >= 0; i--) + { + skip_list_node_t *next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + + while (SKIP_LIST_LIKELY(next != NULL && !NODE_IS_SENTINEL(next))) + { + const int cmp = skip_list_compare_keys_with_type(cmp_type, list, next->key, + next->key_size, key, key_size); + if (cmp > 0) break; + if (cmp == 0) + { + if (i == 0) + { + found_exact = 1; + candidate = next; + } + break; + } + current = next; + next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + } + } + + skip_list_node_t *target; + if (found_exact) + { + target = 
candidate; + } + else + { + target = atomic_load_explicit(¤t->forward[0], memory_order_acquire); + if (SKIP_LIST_UNLIKELY(target == NULL || NODE_IS_SENTINEL(target) || target->key == NULL)) + return -1; + + const int cmp = skip_list_compare_keys_with_type(cmp_type, list, target->key, + target->key_size, key, key_size); + if (SKIP_LIST_UNLIKELY(cmp != 0)) return -1; + } + + skip_list_version_t *head_version = + atomic_load_explicit(&target->versions, memory_order_acquire); + if (head_version == NULL) return -1; + + const int64_t current_time = skip_list_get_current_time(list); + int head_invalid = skip_list_version_is_invalid_with_time(head_version, current_time); + + if (head_invalid && VERSION_IS_DELETED(head_version)) + { + if (ttl != NULL) *ttl = head_version->ttl; + if (deleted != NULL) *deleted = 1; + *value = NULL; + *value_size = 0; + return 0; + } + + skip_list_version_t *version = + head_invalid ? skip_list_get_latest_valid_version(target, current_time) : head_version; + + if (version == NULL) + { + if (deleted != NULL) *deleted = 1; + if (ttl != NULL) *ttl = -1; + *value = NULL; + *value_size = 0; + return 0; + } + + if (ttl != NULL) *ttl = version->ttl; + if (deleted != NULL) *deleted = 0; + + /* zero-copy -- we simply return direct pointer into version data */ + *value = version->value; + *value_size = version->value_size; + return 0; +} + +int skip_list_delete(skip_list_t *list, const uint8_t *key, const size_t key_size, + const uint64_t seq) +{ + if (list == NULL || key == NULL || key_size == 0) return -1; + + skip_list_node_t *header = atomic_load_explicit(&list->header, memory_order_acquire); + skip_list_node_t *current = header; + const int max_level = atomic_load_explicit(&list->level, memory_order_acquire); + const skip_list_cmp_type_t cmp_type = list->cmp_type; + + /* we traverse with prefetching -- prefetch before sentinel check */ + for (int i = max_level; i >= 0; i--) + { + skip_list_node_t *next = atomic_load_explicit(¤t->forward[i], 
memory_order_acquire); + + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + + while (next != NULL && !NODE_IS_SENTINEL(next)) + { + int cmp = skip_list_compare_keys_with_type(cmp_type, list, next->key, next->key_size, + key, key_size); + if (cmp >= 0) break; + current = next; + next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + } + } + + skip_list_node_t *target = atomic_load_explicit(¤t->forward[0], memory_order_acquire); + if (target == NULL || NODE_IS_SENTINEL(target)) return 0; + + int cmp = skip_list_compare_keys_with_type(cmp_type, list, target->key, target->key_size, key, + key_size); + if (cmp != 0) return 0; + + skip_list_version_t *tombstone = skip_list_create_version(list, NULL, 0, -1, 1, seq); + if (tombstone == NULL) return -1; + + if (skip_list_insert_version_cas(&target->versions, tombstone, seq, list, 0) != 0) + { + return -1; + } + return 0; +} + +int skip_list_clear(skip_list_t *list) +{ + if (list == NULL) return -1; + + skip_list_node_t *header = atomic_load_explicit(&list->header, memory_order_acquire); + skip_list_node_t *tail = atomic_load_explicit(&list->tail, memory_order_acquire); + + if (list->arena == NULL) + { + /* no arena -- we must walk and free each node individually */ + skip_list_node_t *current = atomic_load_explicit(&header->forward[0], memory_order_acquire); + while (current != NULL && !NODE_IS_SENTINEL(current)) + { + skip_list_node_t *next = + atomic_load_explicit(¤t->forward[0], memory_order_acquire); + skip_list_free_node(current); + current = next; + } + } + /* with arena, nodes are freed in bulk when arena is destroyed */ + + const int max_level = list->max_level; + for (int i = 0; i <= max_level; i++) + { + atomic_store_explicit(&header->forward[i], tail, memory_order_release); + atomic_store_explicit(&BACKWARD_PTR(tail, i, max_level), header, 
memory_order_release); + } + + atomic_store_explicit(&list->level, 0, memory_order_release); + atomic_store_explicit(&list->total_size, 0, memory_order_release); + atomic_store_explicit(&list->entry_count, 0, memory_order_release); + + return 0; +} + +void skip_list_free(skip_list_t *list) +{ + if (list == NULL) return; + + if (list->arena != NULL) + { + /* arena path -- we simply destroy arena (frees all nodes+versions in bulk), + * then free sentinels which were malloc'd before arena existed */ + skip_list_arena_destroy(list->arena); + list->arena = NULL; + + skip_list_node_t *header = atomic_load_explicit(&list->header, memory_order_acquire); + skip_list_node_t *tail = atomic_load_explicit(&list->tail, memory_order_acquire); + skip_list_free_node(header); + skip_list_free_node(tail); + } + else + { + /* no arena -- we walk and free each node individually */ + skip_list_clear(list); + + skip_list_node_t *header = atomic_load_explicit(&list->header, memory_order_acquire); + skip_list_node_t *tail = atomic_load_explicit(&list->tail, memory_order_acquire); + skip_list_free_node(header); + skip_list_free_node(tail); + } + + free(list); +} + +size_t skip_list_get_size(skip_list_t *list) +{ + if (list == NULL) return 0; + return atomic_load_explicit(&list->total_size, memory_order_acquire); +} + +int skip_list_count_entries(skip_list_t *list) +{ + if (list == NULL) return -1; + return atomic_load_explicit(&list->entry_count, memory_order_acquire); +} + +int skip_list_get_min_key(skip_list_t *list, uint8_t **key, size_t *key_size) +{ + if (list == NULL || key == NULL || key_size == NULL) return -1; + + skip_list_node_t *header = atomic_load_explicit(&list->header, memory_order_acquire); + skip_list_node_t *first = atomic_load_explicit(&header->forward[0], memory_order_acquire); + + if (first == NULL || NODE_IS_SENTINEL(first)) return -1; + + /* we find first valid (non-deleted, non-expired) entry */ + const int64_t current_time = skip_list_get_current_time(list); + 
skip_list_node_t *current = first; + while (current != NULL && !NODE_IS_SENTINEL(current)) + { + skip_list_version_t *version = + atomic_load_explicit(¤t->versions, memory_order_acquire); + if (!skip_list_version_is_invalid_with_time(version, current_time)) + { + first = current; + break; + } + current = atomic_load_explicit(¤t->forward[0], memory_order_acquire); + } + + if (current == NULL || NODE_IS_SENTINEL(current)) return -1; + + *key = (uint8_t *)malloc(first->key_size); + if (*key == NULL) return -1; + memcpy(*key, first->key, first->key_size); + *key_size = first->key_size; + return 0; +} + +static skip_list_node_t *skip_list_predecessor(const skip_list_t *list, skip_list_node_t *header, + const uint8_t *key, size_t key_size); + +int skip_list_get_max_key(skip_list_t *list, uint8_t **key, size_t *key_size) +{ + if (list == NULL || key == NULL || key_size == NULL) return -1; + + skip_list_node_t *header = atomic_load_explicit(&list->header, memory_order_acquire); + + /* forward-reseek the last node, then step back via forward search (not the + * stale-prone backward pointers) until a valid (non-deleted, non-expired) + * entry or the header */ + const int64_t current_time = skip_list_get_current_time(list); + skip_list_node_t *current = skip_list_predecessor(list, header, NULL, 0); + while (current != header && !NODE_IS_SENTINEL(current)) + { + skip_list_version_t *version = + atomic_load_explicit(¤t->versions, memory_order_acquire); + if (!skip_list_version_is_invalid_with_time(version, current_time)) + { + *key = (uint8_t *)malloc(current->key_size); + if (*key == NULL) return -1; + memcpy(*key, current->key, current->key_size); + *key_size = current->key_size; + return 0; + } + current = skip_list_predecessor(list, header, current->key, current->key_size); + } + + return -1; +} + +int skip_list_cursor_init(skip_list_cursor_t **cursor, skip_list_t *list) +{ + if (cursor == NULL || list == NULL) return -1; + + *cursor = (skip_list_cursor_t 
*)malloc(sizeof(skip_list_cursor_t)); + if (*cursor == NULL) return -1; + + (*cursor)->list = list; + (*cursor)->cached_header = atomic_load_explicit(&list->header, memory_order_acquire); + (*cursor)->cached_tail = atomic_load_explicit(&list->tail, memory_order_acquire); + (*cursor)->current = + atomic_load_explicit(&(*cursor)->cached_header->forward[0], memory_order_acquire); + (*cursor)->current_version = NULL; + return 0; +} + +void skip_list_cursor_free(skip_list_cursor_t *cursor) +{ + if (cursor != NULL) free(cursor); +} + +int skip_list_cursor_valid(const skip_list_cursor_t *cursor) +{ + if (cursor == NULL || cursor->current == NULL) return -1; + return (cursor->current != cursor->cached_header && cursor->current != cursor->cached_tail) ? 1 + : 0; +} + +int skip_list_cursor_next(skip_list_cursor_t *cursor) +{ + if (cursor == NULL || cursor->current == NULL) return -1; + if (cursor->current == cursor->cached_tail) return -1; + + cursor->current = atomic_load_explicit(&cursor->current->forward[0], memory_order_acquire); + cursor->current_version = NULL; + if (cursor->current == NULL || cursor->current == cursor->cached_tail) return -1; + + /* we prefetch next node, its key, and its version to hide memory latency. + * acquire (not relaxed) -- next is dereferenced below (NODE_IS_SENTINEL, + * ->key) so it must synchronize with the release-CAS that published it */ + skip_list_node_t *next = + atomic_load_explicit(&cursor->current->forward[0], memory_order_acquire); + if (next && !NODE_IS_SENTINEL(next)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + /* we prefetch version for the current node -- cursor_get will need it */ + PREFETCH_READ(&cursor->current->versions); + + return 0; +} + +/** + * skip_list_predecessor + * forward-searches for the last node whose key is strictly less than `key`, or for + * the last node in the list when key == NULL. 
used for reverse navigation unlike + * the per-node backward pointers (which are maintained best-effort and can be left + * stale by concurrent inserts, so a backward walk may skip nodes), forward[0] is the + * linearizable structure, so this is always complete. + * @return the predecessor node, or the header sentinel when none exists + */ +static skip_list_node_t *skip_list_predecessor(const skip_list_t *list, skip_list_node_t *header, + const uint8_t *key, const size_t key_size) +{ + const int max_level = atomic_load_explicit(&list->level, memory_order_acquire); + const skip_list_cmp_type_t cmp_type = list->cmp_type; + skip_list_node_t *pred = header; + for (int i = max_level; i >= 0; i--) + { + skip_list_node_t *next = atomic_load_explicit(&pred->forward[i], memory_order_acquire); + while (next != NULL && !NODE_IS_SENTINEL(next) && + (key == NULL || skip_list_compare_keys_with_type(cmp_type, list, next->key, + next->key_size, key, key_size) < 0)) + { + pred = next; + next = atomic_load_explicit(&pred->forward[i], memory_order_acquire); + } + } + return pred; +} + +int skip_list_cursor_prev(skip_list_cursor_t *cursor) +{ + if (cursor == NULL || cursor->current == NULL) return -1; + if (cursor->current == cursor->cached_header) return -1; + + skip_list_node_t *cur = cursor->current; + + /* the backward pointer is a HINT, trusted only when the forward + * list confirms it -- H is cur's true predecessor iff H->forward[0] == cur, and + * forward[0] is the linearizable source of truth. this keeps reverse steps O(1) + * when the hint is fresh (the common case) while a stale/NULL backward pointer, + * which a concurrent insert can leave behind, falls through to the reseek. 
*/ + skip_list_node_t *hint = + atomic_load_explicit(&BACKWARD_PTR(cur, 0, cur->level), memory_order_acquire); + if (hint != NULL && atomic_load_explicit(&hint->forward[0], memory_order_acquire) == cur) + { + cursor->current = hint; + cursor->current_version = NULL; + if (hint == cursor->cached_header) return -1; + PREFETCH_READ(&hint->versions); + return 0; + } + + /* slow path -- forward-reseek the predecessor (always complete). when cur is the + * tail, the predecessor of "+infinity" is the last node (key == NULL). */ + skip_list_node_t *pred = + (cur == cursor->cached_tail) + ? skip_list_predecessor(cursor->list, cursor->cached_header, NULL, 0) + : skip_list_predecessor(cursor->list, cursor->cached_header, cur->key, cur->key_size); + + cursor->current = pred; + cursor->current_version = NULL; + if (pred == cursor->cached_header) return -1; + + PREFETCH_READ(&pred->versions); + return 0; +} + +int skip_list_cursor_advance_in_node(skip_list_cursor_t *cursor) +{ + if (cursor == NULL || cursor->current == NULL) return -1; + if (cursor->current == cursor->cached_header || cursor->current == cursor->cached_tail) + return -1; + + /* if no version was selected yet, the next-older sits behind the head; otherwise + * walk the chain pointer from the version we are currently parked on */ + skip_list_version_t *cur = + cursor->current_version + ? 
cursor->current_version + : atomic_load_explicit(&cursor->current->versions, memory_order_acquire); + if (cur == NULL) return -1; + + skip_list_version_t *next_older = atomic_load_explicit(&cur->next, memory_order_acquire); + if (next_older == NULL) return -1; + + cursor->current_version = next_older; + return 0; +} + +int skip_list_cursor_get(skip_list_cursor_t *cursor, uint8_t **key, size_t *key_size, + uint8_t **value, size_t *value_size, int64_t *ttl, uint8_t *deleted) +{ + if (cursor == NULL || cursor->current == NULL) return -1; + + if (cursor->current == cursor->cached_tail) return -1; + + *key = cursor->current->key; + *key_size = cursor->current->key_size; + + skip_list_version_t *version = + cursor->current_version + ? cursor->current_version + : atomic_load_explicit(&cursor->current->versions, memory_order_acquire); + if (version == NULL) return -1; + + if (ttl != NULL) *ttl = version->ttl; + + /* we check if version is invalid (expired or deleted) */ + if (skip_list_version_is_invalid_with_time(version, skip_list_get_current_time(cursor->list))) + { + if (deleted != NULL) *deleted = 1; + *value = NULL; + *value_size = 0; + return 0; + } + + if (deleted != NULL) *deleted = 0; + *value = version->value; + *value_size = version->value_size; + return 0; +} + +int skip_list_cursor_get_with_seq(skip_list_cursor_t *cursor, uint8_t **key, size_t *key_size, + uint8_t **value, size_t *value_size, int64_t *ttl, + uint8_t *deleted, uint64_t *seq) +{ + if (cursor == NULL || cursor->current == NULL) return -1; + + if (cursor->current == cursor->cached_tail) return -1; + + *key = cursor->current->key; + *key_size = cursor->current->key_size; + + skip_list_version_t *version = + cursor->current_version + ? 
cursor->current_version + : atomic_load_explicit(&cursor->current->versions, memory_order_acquire); + if (version == NULL) return -1; + + if (ttl != NULL) *ttl = version->ttl; + if (seq != NULL) *seq = atomic_load_explicit(&version->seq, memory_order_acquire); + + /* *deleted returns the version flag bits (SKIP_LIST_FLAG_*) so callers can + * see single-delete and not just plain tombstone. the low bit is always set + * when the caller should treat this entry as a tombstone (tombstone or + * expired ttl), matching the old bool-like contract for existing callers. */ + const uint8_t version_flags = atomic_load_explicit(&version->flags, memory_order_acquire); + + /* we check if version is invalid (expired or deleted) */ + if (skip_list_version_is_invalid_with_time(version, skip_list_get_current_time(cursor->list))) + { + if (deleted != NULL) + { + *deleted = SKIP_LIST_FLAG_DELETED | (version_flags & SKIP_LIST_FLAG_SINGLE_DELETE); + } + *value = NULL; + *value_size = 0; + return 0; + } + + if (deleted != NULL) *deleted = 0; + *value = version->value; + *value_size = version->value_size; + return 0; +} + +int skip_list_cursor_next_get(skip_list_cursor_t *cursor, uint8_t **key, size_t *key_size, + uint8_t **value, size_t *value_size, int64_t *ttl, uint8_t *deleted) +{ + if (cursor == NULL || cursor->current == NULL) return -1; + if (cursor->current == cursor->cached_tail) return -1; + + /* we advance to next node */ + cursor->current = atomic_load_explicit(&cursor->current->forward[0], memory_order_acquire); + cursor->current_version = NULL; + if (cursor->current == NULL || cursor->current == cursor->cached_tail) return -1; + + /* we prefetch next node for the next call to this function. 
+ * acquire (not relaxed) -- next is dereferenced below so it must + * synchronize with the release-CAS that published it */ + skip_list_node_t *next = + atomic_load_explicit(&cursor->current->forward[0], memory_order_acquire); + if (next && !NODE_IS_SENTINEL(next)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + PREFETCH_READ(&next->versions); + } + + /* inline get -- no redundant sentinel/NULL checks */ + *key = cursor->current->key; + *key_size = cursor->current->key_size; + + skip_list_version_t *version = + atomic_load_explicit(&cursor->current->versions, memory_order_acquire); + if (version == NULL) return -1; + + if (ttl != NULL) *ttl = version->ttl; + + if (skip_list_version_is_invalid_with_time(version, skip_list_get_current_time(cursor->list))) + { + if (deleted != NULL) *deleted = 1; + *value = NULL; + *value_size = 0; + return 0; + } + + if (deleted != NULL) *deleted = 0; + *value = version->value; + *value_size = version->value_size; + return 0; +} + +int skip_list_cursor_at_start(skip_list_cursor_t *cursor) +{ + if (cursor == NULL) return -1; + skip_list_node_t *first = + atomic_load_explicit(&cursor->cached_header->forward[0], memory_order_acquire); + return (cursor->current == first) ? 1 : 0; +} + +int skip_list_cursor_at_end(const skip_list_cursor_t *cursor) +{ + if (cursor == NULL) return -1; + return (cursor->current == cursor->cached_tail) ? 1 : 0; +} + +int skip_list_cursor_has_next(skip_list_cursor_t *cursor) +{ + if (cursor == NULL || cursor->current == NULL) return -1; + if (cursor->current == cursor->cached_tail) return -1; + skip_list_node_t *next = + atomic_load_explicit(&cursor->current->forward[0], memory_order_acquire); + return (next != NULL && next != cursor->cached_tail) ? 
1 : 0; +} + +int skip_list_cursor_has_prev(skip_list_cursor_t *cursor) +{ + if (cursor == NULL || cursor->current == NULL) return -1; + if (cursor->current == cursor->cached_tail) return -1; + skip_list_node_t *first = + atomic_load_explicit(&cursor->cached_header->forward[0], memory_order_acquire); + return (cursor->current != first && cursor->current != cursor->cached_header) ? 1 : 0; +} + +int skip_list_cursor_goto_last(skip_list_cursor_t *cursor) +{ + if (cursor == NULL) return -1; + + /* fast verified hint where the last node L satisfies L->forward[0] == tail. + * we trust the tail's backward pointer only when forward confirms it; otherwise + * forward-reseek the last node (predecessor of "+infinity"). */ + skip_list_node_t *tail = cursor->cached_tail; + skip_list_node_t *last = + atomic_load_explicit(&BACKWARD_PTR(tail, 0, tail->level), memory_order_acquire); + if (last == NULL || last == cursor->cached_header || + atomic_load_explicit(&last->forward[0], memory_order_acquire) != tail) + { + last = skip_list_predecessor(cursor->list, cursor->cached_header, NULL, 0); + } + + if (last == cursor->cached_header || NODE_IS_SENTINEL(last)) return -1; + + cursor->current = last; + cursor->current_version = NULL; + return 0; +} + +int skip_list_cursor_goto_first(skip_list_cursor_t *cursor) +{ + if (cursor == NULL) return -1; + skip_list_node_t *first = + atomic_load_explicit(&cursor->cached_header->forward[0], memory_order_acquire); + if (first == NULL || NODE_IS_SENTINEL(first)) return -1; + cursor->current = first; + cursor->current_version = NULL; + return 0; +} + +/** + * skip_list_cursor_seek + * positions cursor at the node before the first key >= target + * @param cursor the cursor to position + * @param key the target key to seek to + * @param key_size size of the target key + * @return 0 on success, -1 on failure + * + * after calling this function, cursor->current points to the predecessor node. 
+ * callers must call skip_list_cursor_next() or similar to access the actual target key. + * this behavior allows efficient insertion and supports both exact matches and range queries. + */ +int skip_list_cursor_seek(skip_list_cursor_t *cursor, const uint8_t *key, const size_t key_size) +{ + if (cursor == NULL || key == NULL || key_size == 0) return -1; + + skip_list_node_t *current = cursor->cached_header; + const int max_level = + atomic_load_explicit(&cursor->list->level, memory_order_acquire); /* cache level */ + const skip_list_cmp_type_t cmp_type = cursor->list->cmp_type; + + /* we find the node before the target key */ + for (int i = max_level; i >= 0; i--) + { + skip_list_node_t *next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + + while (next != NULL && !NODE_IS_SENTINEL(next)) + { + int cmp = skip_list_compare_keys_with_type(cmp_type, cursor->list, next->key, + next->key_size, key, key_size); + if (cmp >= 0) break; /* we stop before target or equal */ + current = next; + next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + } + } + + /* we position cursor at the node before target + * caller must call skip_list_cursor_next() to access first key >= target */ + cursor->current = current; + cursor->current_version = NULL; + return 0; +} + +int skip_list_cursor_seek_ge(skip_list_cursor_t *cursor, const uint8_t *key, const size_t key_size) +{ + if (cursor == NULL || key == NULL || key_size == 0) return -1; + + skip_list_node_t *current = cursor->cached_header; + const int max_level = atomic_load_explicit(&cursor->list->level, memory_order_acquire); + const skip_list_cmp_type_t cmp_type = cursor->list->cmp_type; + + /* we find the node before target */ + for (int i = max_level; i >= 0; i--) + { + skip_list_node_t *next = 
atomic_load_explicit(¤t->forward[i], memory_order_acquire); + while (next != NULL && !NODE_IS_SENTINEL(next)) + { + int cmp = skip_list_compare_keys_with_type(cmp_type, cursor->list, next->key, + next->key_size, key, key_size); + if (cmp >= 0) break; + current = next; + next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + } + } + + /* we land directly on the first entry >= target rather than parking before it and + * leaving a separate next() to read forward[0]. a concurrent put can splice a node + * whose key is < target into forward[0] in that window, so a seek+next pair can + * return a key below target; re-reading forward[0] until we pass target closes it. + * once current points at a node >= target, later sub-target inserts splice in before + * it and do not move the cursor. */ + for (;;) + { + skip_list_node_t *nx = atomic_load_explicit(¤t->forward[0], memory_order_acquire); + if (nx == NULL || NODE_IS_SENTINEL(nx)) + { + cursor->current = nx; + cursor->current_version = NULL; + return -1; + } + if (skip_list_compare_keys_with_type(cmp_type, cursor->list, nx->key, nx->key_size, key, + key_size) >= 0) + { + cursor->current = nx; + cursor->current_version = NULL; + return 0; + } + current = nx; + } +} + +int skip_list_cursor_seek_for_prev(skip_list_cursor_t *cursor, const uint8_t *key, + const size_t key_size) +{ + if (cursor == NULL || key == NULL || key_size == 0) return -1; + + skip_list_node_t *current = cursor->cached_header; + const int max_level = + atomic_load_explicit(&cursor->list->level, memory_order_acquire); /* cache level */ + const skip_list_cmp_type_t cmp_type = cursor->list->cmp_type; + + /* we find the last node with key <= target */ + for (int i = max_level; i >= 0; i--) + { + skip_list_node_t *next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + + while (next != NULL && !NODE_IS_SENTINEL(next)) + { + int cmp = 
skip_list_compare_keys_with_type(cmp_type, cursor->list, next->key, + next->key_size, key, key_size); + if (cmp > 0) break; /* stop when key > target */ + current = next; + next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + } + } + + /* the current is now the last node with key <= target, or header if no such key */ + if (NODE_IS_SENTINEL(current)) + { + /* no key <= target exists, cursor is invalid */ + cursor->current = current; + cursor->current_version = NULL; + return 0; + } + + cursor->current = current; + cursor->current_version = NULL; + return 0; +} + +int skip_list_put_with_seq(skip_list_t *list, const uint8_t *key, size_t key_size, + const uint8_t *value, size_t value_size, int64_t ttl, uint64_t seq, + uint8_t flags) +{ + const int is_tombstone = (flags & SKIP_LIST_FLAG_DELETED) != 0; + if (list == NULL || key == NULL || key_size == 0 || (!is_tombstone && value == NULL)) return -1; + + skip_list_node_t *header = atomic_load_explicit(&list->header, memory_order_acquire); + const int max_level = atomic_load_explicit(&list->level, memory_order_acquire); + const skip_list_cmp_type_t cmp_type = list->cmp_type; + + /* we use stack allocation for update array (SKIP_LIST_STACK_UPDATE_SIZE is file-scope) */ + skip_list_node_t *stack_update[SKIP_LIST_STACK_UPDATE_SIZE]; + skip_list_node_t **update; + const int use_stack = (list->max_level < SKIP_LIST_STACK_UPDATE_SIZE); + + if (use_stack) + { + update = stack_update; + } + else + { + update = malloc((list->max_level + 1) * sizeof(skip_list_node_t *)); + if (!update) return -1; + } + + for (int i = 0; i <= list->max_level; i++) + { + update[i] = header; + } + + skip_list_node_t *current = header; + + /* we traverse with prefetching -- prefetch before sentinel check */ + for (int i = max_level; i >= 0; i--) + { + skip_list_node_t *next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + + if 
(SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + + while (next != NULL && !NODE_IS_SENTINEL(next)) + { + int cmp = skip_list_compare_keys_with_type(cmp_type, list, next->key, next->key_size, + key, key_size); + if (cmp >= 0) break; + current = next; + next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + } + update[i] = current; + } + + skip_list_node_t *existing = atomic_load_explicit(¤t->forward[0], memory_order_acquire); + if (existing != NULL && !NODE_IS_SENTINEL(existing)) + { + int cmp = skip_list_compare_keys_with_type(cmp_type, list, existing->key, + existing->key_size, key, key_size); + if (cmp == 0) + { + /* the key exists, we validate sequence and add new version */ + skip_list_version_t *latest = + atomic_load_explicit(&existing->versions, memory_order_acquire); + if (skip_list_validate_sequence(latest, seq) != 0) + { + if (!use_stack) free(update); + return -1; + } + + skip_list_version_t *new_version = + skip_list_create_version(list, value, value_size, ttl, flags, seq); + if (new_version == NULL) + { + if (!use_stack) free(update); + return -1; + } + + if (skip_list_insert_version_cas(&existing->versions, new_version, seq, list, + value_size) != 0) + { + if (!use_stack) free(update); + return -1; + } + + if (!use_stack) free(update); + return 0; + } + } + + skip_list_node_t *recheck = atomic_load_explicit(&update[0]->forward[0], memory_order_acquire); + if (recheck != existing && recheck != NULL && !NODE_IS_SENTINEL(recheck)) + { + int cmp = skip_list_compare_keys_with_type(cmp_type, list, recheck->key, recheck->key_size, + key, key_size); + if (cmp == 0) + { + skip_list_version_t *latest = + atomic_load_explicit(&recheck->versions, memory_order_acquire); + if (skip_list_validate_sequence(latest, seq) != 0) + { + if (!use_stack) free(update); + return -1; + } + + skip_list_version_t 
*new_version = + skip_list_create_version(list, value, value_size, ttl, flags, seq); + if (new_version == NULL) + { + if (!use_stack) free(update); + return -1; + } + + if (skip_list_insert_version_cas(&recheck->versions, new_version, seq, list, + value_size) != 0) + { + if (!use_stack) free(update); + return -1; + } + + if (!use_stack) free(update); + return 0; + } + } + + int new_level = skip_list_random_level(list); + int current_level = atomic_load_explicit(&list->level, memory_order_acquire); + + if (new_level > current_level) + { + for (int i = current_level + 1; i <= new_level; i++) + { + update[i] = header; + } + atomic_store_explicit(&list->level, new_level, memory_order_release); + } + + /* we combine node + pointers + key into single allocation for cache locality */ + const size_t pointers_size = (2 * (new_level + 1)) * sizeof(_Atomic(skip_list_node_t *)); + skip_list_node_t *new_node = + skip_list_alloc(list, sizeof(skip_list_node_t) + pointers_size + key_size); + if (new_node == NULL) + { + if (!use_stack) free(update); + return -1; + } + + new_node->key = (uint8_t *)new_node + sizeof(skip_list_node_t) + pointers_size; + memcpy(new_node->key, key, key_size); + new_node->key_size = key_size; + new_node->level = (uint8_t)new_level; + new_node->node_flags = 0; + + skip_list_version_t *initial_version = + skip_list_create_version(list, value, value_size, ttl, flags, seq); + if (initial_version == NULL) + { + skip_list_dealloc(list, new_node); + if (!use_stack) free(update); + return -1; + } + atomic_init(&new_node->versions, initial_version); + + for (int i = 0; i <= new_level; i++) + { + atomic_init(&new_node->forward[i], NULL); + atomic_init(&BACKWARD_PTR(new_node, i, new_level), NULL); + } + + skip_list_node_t *pred = update[0]; + skip_list_node_t *next_at_0; + int cas_attempts = 0; + + while (1) + { + next_at_0 = atomic_load_explicit(&pred->forward[0], memory_order_acquire); + + if (next_at_0 != NULL && !NODE_IS_SENTINEL(next_at_0)) + { + int cmp = 
skip_list_compare_keys_with_type(cmp_type, list, next_at_0->key, + next_at_0->key_size, key, key_size); + if (cmp == 0) + { + skip_list_version_t *latest = + atomic_load_explicit(&next_at_0->versions, memory_order_acquire); + if (skip_list_validate_sequence(latest, seq) != 0) + { + skip_list_free_node_internal(list, new_node); + if (!use_stack) free(update); + return -1; + } + + skip_list_version_t *new_version = + skip_list_create_version(list, value, value_size, ttl, flags, seq); + if (new_version == NULL) + { + skip_list_free_node_internal(list, new_node); + if (!use_stack) free(update); + return -1; + } + + if (skip_list_insert_version_cas(&next_at_0->versions, new_version, seq, list, + value_size) != 0) + { + skip_list_free_node_internal(list, new_node); + if (!use_stack) free(update); + return -1; + } + + skip_list_free_node_internal(list, new_node); + if (!use_stack) free(update); + return 0; + } + if (cmp < 0) + { + pred = next_at_0; + continue; + } + } + + atomic_store_explicit(&new_node->forward[0], next_at_0, memory_order_relaxed); + if (atomic_compare_exchange_weak_explicit(&pred->forward[0], &next_at_0, new_node, + memory_order_release, memory_order_acquire)) + { + update[0] = pred; + break; + } + + if (next_at_0 != NULL && !NODE_IS_SENTINEL(next_at_0)) + { + int cmp = skip_list_compare_keys_with_type(cmp_type, list, next_at_0->key, + next_at_0->key_size, key, key_size); + if (cmp == 0) + { + skip_list_version_t *latest = + atomic_load_explicit(&next_at_0->versions, memory_order_acquire); + if (skip_list_validate_sequence(latest, seq) != 0) + { + skip_list_free_node_internal(list, new_node); + if (!use_stack) free(update); + return -1; + } + + skip_list_version_t *new_version = + skip_list_create_version(list, value, value_size, ttl, flags, seq); + if (new_version == NULL) + { + skip_list_free_node_internal(list, new_node); + if (!use_stack) free(update); + return -1; + } + + if (skip_list_insert_version_cas(&next_at_0->versions, new_version, seq, 
list, + value_size) != 0) + { + skip_list_free_node_internal(list, new_node); + if (!use_stack) free(update); + return -1; + } + + skip_list_free_node_internal(list, new_node); + if (!use_stack) free(update); + return 0; + } + if (cmp < 0) + { + pred = next_at_0; + continue; + } + } + + cas_attempts++; + if (cas_attempts > SKIP_LIST_MAX_CAS_ATTEMPTS) + { + skip_list_free_node_internal(list, new_node); + if (!use_stack) free(update); + return -1; + } + } + + atomic_store_explicit(&BACKWARD_PTR(new_node, 0, new_level), update[0], memory_order_release); + skip_list_node_t *next_after_insert = + atomic_load_explicit(&new_node->forward[0], memory_order_acquire); + if (next_after_insert != NULL) + { + skip_list_node_t *expected = update[0]; + atomic_compare_exchange_strong_explicit( + &BACKWARD_PTR(next_after_insert, 0, next_after_insert->level), &expected, new_node, + memory_order_release, memory_order_acquire); + } + + for (int i = 1; i <= new_level; i++) + { + skip_list_node_t *next; + do + { + next = atomic_load_explicit(&update[i]->forward[i], memory_order_acquire); + atomic_store_explicit(&new_node->forward[i], next, memory_order_relaxed); + } while (!atomic_compare_exchange_weak_explicit( + &update[i]->forward[i], &next, new_node, memory_order_release, memory_order_acquire)); + + atomic_store_explicit(&BACKWARD_PTR(new_node, i, new_level), update[i], + memory_order_release); + if (next != NULL) + { + skip_list_node_t *expected = update[i]; + atomic_compare_exchange_strong_explicit(&BACKWARD_PTR(next, i, next->level), &expected, + new_node, memory_order_release, + memory_order_acquire); + } + } + + atomic_fetch_add_explicit(&list->total_size, key_size + value_size, memory_order_relaxed); + atomic_fetch_add_explicit(&list->entry_count, 1, memory_order_relaxed); + + if (!use_stack) free(update); + return 0; +} + +int skip_list_put_batch(skip_list_t *list, const skip_list_batch_entry_t *entries, + const size_t count) +{ + if (list == NULL || entries == NULL || count 
== 0) return -1; + + int success_count = 0; + + /* we use a shared update array across batch entries for efficiency + * this avoids repeated allocation/deallocation per entry */ + skip_list_node_t *header = atomic_load_explicit(&list->header, memory_order_acquire); + + skip_list_node_t *stack_update[SKIP_LIST_STACK_UPDATE_SIZE]; + skip_list_node_t **update; + const int use_stack = (list->max_level < SKIP_LIST_STACK_UPDATE_SIZE); + + if (use_stack) + { + update = stack_update; + } + else + { + update = malloc((list->max_level + 1) * sizeof(skip_list_node_t *)); + if (!update) return -1; + } + + const skip_list_cmp_type_t cmp_type = list->cmp_type; + const uint8_t *prev_key = NULL; + size_t prev_key_size = 0; + int prev_max_level = 0; + size_t batch_total_size = 0; + int batch_entry_count = 0; + + /* we initialize update array for the first entry */ + for (int i = 0; i <= list->max_level; i++) + { + update[i] = header; + } + + for (size_t e = 0; e < count; e++) + { + const skip_list_batch_entry_t *entry = &entries[e]; + + if (entry->key == NULL || entry->key_size == 0) continue; + if (!(entry->flags & SKIP_LIST_FLAG_DELETED) && entry->value == NULL) continue; + + const int max_level = atomic_load_explicit(&list->level, memory_order_acquire); + + /* sorted-key hint -- if this key >= previous key, reuse update[] positions + * from previous iteration instead of restarting from header. + * each update[i] has level >= i (set during traversal at that level) + * so accessing update[i]->forward[i] is always safe. 
*/ + int use_hint = 0; + if (prev_key != NULL) + { + int cmp = skip_list_compare_keys_with_type(cmp_type, list, entry->key, entry->key_size, + prev_key, prev_key_size); + use_hint = (cmp >= 0); + } + + skip_list_node_t *current; + if (!use_hint) + { + /* unsorted or first entry -- we reset to header */ + for (int i = 0; i <= list->max_level; i++) + { + update[i] = header; + } + current = header; + } + else + { + /* we init any new levels above prev_max_level to header */ + for (int i = prev_max_level + 1; i <= max_level; i++) + { + update[i] = header; + } + /* we start from the top-level hint, carry-down handles lower levels */ + current = update[max_level]; + } + + /* we traverse with prefetching -- prefetch before sentinel check */ + for (int i = max_level; i >= 0; i--) + { + skip_list_node_t *next = + atomic_load_explicit(¤t->forward[i], memory_order_acquire); + + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + + while (next != NULL && !NODE_IS_SENTINEL(next)) + { + int cmp = skip_list_compare_keys_with_type( + cmp_type, list, next->key, next->key_size, entry->key, entry->key_size); + if (cmp >= 0) break; + current = next; + next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + } + update[i] = current; + } + + prev_key = entry->key; + prev_key_size = entry->key_size; + prev_max_level = max_level; + + /* we check if key exists */ + skip_list_node_t *existing = + atomic_load_explicit(¤t->forward[0], memory_order_acquire); + if (existing != NULL && !NODE_IS_SENTINEL(existing)) + { + int cmp = skip_list_compare_keys_with_type( + cmp_type, list, existing->key, existing->key_size, entry->key, entry->key_size); + if (cmp == 0) + { + /* key exists, we add new version */ + skip_list_version_t *latest = + atomic_load_explicit(&existing->versions, memory_order_acquire); + if (skip_list_validate_sequence(latest, 
entry->seq) != 0) + { + continue; /* skip this entry */ + } + + skip_list_version_t *new_version = skip_list_create_version( + list, entry->value, entry->value_size, entry->ttl, entry->flags, entry->seq); + if (new_version == NULL) + { + continue; + } + + if (skip_list_insert_version_cas(&existing->versions, new_version, entry->seq, list, + entry->value_size) == 0) + { + success_count++; + } + continue; + } + } + + /* we create new node */ + int new_level = skip_list_random_level(list); + int current_level = atomic_load_explicit(&list->level, memory_order_acquire); + + if (new_level > current_level) + { + for (int i = current_level + 1; i <= new_level; i++) + { + update[i] = header; + } + atomic_store_explicit(&list->level, new_level, memory_order_release); + } + + /* we combine node + pointers + key into single allocation for cache locality */ + const size_t batch_ptrs_size = (2 * (new_level + 1)) * sizeof(_Atomic(skip_list_node_t *)); + skip_list_node_t *new_node = + skip_list_alloc(list, sizeof(skip_list_node_t) + batch_ptrs_size + entry->key_size); + if (new_node == NULL) + { + continue; + } + + new_node->key = (uint8_t *)new_node + sizeof(skip_list_node_t) + batch_ptrs_size; + memcpy(new_node->key, entry->key, entry->key_size); + new_node->key_size = entry->key_size; + new_node->level = (uint8_t)new_level; + new_node->node_flags = 0; + + skip_list_version_t *initial_version = skip_list_create_version( + list, entry->value, entry->value_size, entry->ttl, entry->flags, entry->seq); + if (initial_version == NULL) + { + skip_list_dealloc(list, new_node); + continue; + } + atomic_init(&new_node->versions, initial_version); + + for (int i = 0; i <= new_level; i++) + { + atomic_init(&new_node->forward[i], NULL); + atomic_init(&BACKWARD_PTR(new_node, i, new_level), NULL); + } + + /* we insert at level 0 with CAS */ + skip_list_node_t *pred = update[0]; + skip_list_node_t *next_at_0; + int cas_attempts = 0; + int inserted = 0; + + while (1) + { + next_at_0 = 
atomic_load_explicit(&pred->forward[0], memory_order_acquire); + + if (next_at_0 != NULL && !NODE_IS_SENTINEL(next_at_0)) + { + int cmp = skip_list_compare_keys_with_type(cmp_type, list, next_at_0->key, + next_at_0->key_size, entry->key, + entry->key_size); + if (cmp == 0) + { + /* concurrent insert, we add version instead */ + skip_list_version_t *latest = + atomic_load_explicit(&next_at_0->versions, memory_order_acquire); + if (skip_list_validate_sequence(latest, entry->seq) == 0) + { + skip_list_version_t *new_version = + skip_list_create_version(list, entry->value, entry->value_size, + entry->ttl, entry->flags, entry->seq); + if (new_version != NULL) + { + if (skip_list_insert_version_cas(&next_at_0->versions, new_version, + entry->seq, list, + entry->value_size) == 0) + { + success_count++; + } + } + } + skip_list_free_node_internal(list, new_node); + new_node = NULL; /* prevent use-after-free in higher level linking */ + inserted = 1; + break; + } + if (cmp < 0) + { + pred = next_at_0; + continue; + } + } + + atomic_store_explicit(&new_node->forward[0], next_at_0, memory_order_relaxed); + if (atomic_compare_exchange_weak_explicit(&pred->forward[0], &next_at_0, new_node, + memory_order_release, memory_order_acquire)) + { + update[0] = pred; + inserted = 1; + break; + } + + cas_attempts++; + if (cas_attempts > SKIP_LIST_MAX_CAS_ATTEMPTS) + { + skip_list_free_node_internal(list, new_node); + new_node = NULL; /* prevent use-after-free in higher level linking */ + inserted = 1; /* mark as handled to avoid double-free */ + break; + } + } + + if (!inserted) + { + skip_list_free_node_internal(list, new_node); + continue; + } + + if (new_node != NULL && cas_attempts <= SKIP_LIST_MAX_CAS_ATTEMPTS && update[0] == pred) + { + /* we successfully inserted new node, link higher levels */ + atomic_store_explicit(&BACKWARD_PTR(new_node, 0, new_level), update[0], + memory_order_release); + skip_list_node_t *next_after_insert = + atomic_load_explicit(&new_node->forward[0], 
memory_order_acquire); + if (next_after_insert != NULL) + { + skip_list_node_t *expected = update[0]; + atomic_compare_exchange_strong_explicit( + &BACKWARD_PTR(next_after_insert, 0, next_after_insert->level), &expected, + new_node, memory_order_release, memory_order_acquire); + } + + for (int i = 1; i <= new_level; i++) + { + skip_list_node_t *next; + do + { + next = atomic_load_explicit(&update[i]->forward[i], memory_order_acquire); + atomic_store_explicit(&new_node->forward[i], next, memory_order_relaxed); + } while (!atomic_compare_exchange_weak_explicit(&update[i]->forward[i], &next, + new_node, memory_order_release, + memory_order_acquire)); + + atomic_store_explicit(&BACKWARD_PTR(new_node, i, new_level), update[i], + memory_order_release); + if (next != NULL) + { + skip_list_node_t *expected = update[i]; + atomic_compare_exchange_strong_explicit( + &BACKWARD_PTR(next, i, next->level), &expected, new_node, + memory_order_release, memory_order_acquire); + } + } + + batch_total_size += entry->key_size + entry->value_size; + batch_entry_count++; + success_count++; + } + } + + /* we do a single atomic update for the entire batch instead of per-entry */ + if (batch_total_size > 0) + atomic_fetch_add_explicit(&list->total_size, batch_total_size, memory_order_relaxed); + if (batch_entry_count > 0) + atomic_fetch_add_explicit(&list->entry_count, batch_entry_count, memory_order_relaxed); + + if (!use_stack) free(update); + return success_count; +} + +int skip_list_get_max_seq(skip_list_t *list, const uint8_t *key, const size_t key_size, + uint64_t *out_seq) +{ + if (list == NULL || key == NULL || key_size == 0 || out_seq == NULL) return -1; + + *out_seq = 0; + + skip_list_node_t *header = atomic_load_explicit(&list->header, memory_order_acquire); + skip_list_node_t *current = header; + const int max_level = atomic_load_explicit(&list->level, memory_order_acquire); + const skip_list_cmp_type_t cmp_type = list->cmp_type; + + for (int i = max_level; i >= 0; i--) + { + 
skip_list_node_t *next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + + while (next != NULL && !NODE_IS_SENTINEL(next)) + { + int cmp = skip_list_compare_keys_with_type(cmp_type, list, next->key, next->key_size, + key, key_size); + if (cmp >= 0) break; + current = next; + next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + } + } + + skip_list_node_t *target = atomic_load_explicit(¤t->forward[0], memory_order_acquire); + if (target == NULL || NODE_IS_SENTINEL(target)) return -1; + + int cmp = skip_list_compare_keys_with_type(cmp_type, list, target->key, target->key_size, key, + key_size); + if (cmp != 0) return -1; + + skip_list_version_t *version = atomic_load_explicit(&target->versions, memory_order_acquire); + if (version == NULL) return -1; + + *out_seq = atomic_load_explicit(&version->seq, memory_order_acquire); + return 0; +} + +int skip_list_get_with_seq(skip_list_t *list, const uint8_t *key, const size_t key_size, + uint8_t **value, size_t *value_size, int64_t *ttl, uint8_t *deleted, + uint64_t *seq, uint64_t snapshot_seq, + const skip_list_visibility_check_fn visibility_check, + void *visibility_ctx) +{ + if (list == NULL || key == NULL || key_size == 0 || value == NULL || value_size == NULL) + return -1; + + skip_list_node_t *header = atomic_load_explicit(&list->header, memory_order_acquire); + skip_list_node_t *current = header; + const int max_level = + atomic_load_explicit(&list->level, memory_order_acquire); /* cache level */ + const skip_list_cmp_type_t cmp_type = list->cmp_type; + + /* we attempt to find the node */ + for (int i = max_level; i >= 0; i--) + { + skip_list_node_t *next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + 
PREFETCH_READ(next->key); + } + + while (next != NULL && !NODE_IS_SENTINEL(next)) + { + int cmp = skip_list_compare_keys_with_type(cmp_type, list, next->key, next->key_size, + key, key_size); + if (cmp >= 0) break; + current = next; + next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + } + } + + skip_list_node_t *target = atomic_load_explicit(¤t->forward[0], memory_order_acquire); + if (target == NULL || NODE_IS_SENTINEL(target)) return -1; + + int cmp = skip_list_compare_keys_with_type(cmp_type, list, target->key, target->key_size, key, + key_size); + if (cmp != 0) return -1; + + /* we found the key, now we must find the appropriate version */ + skip_list_version_t *version = atomic_load_explicit(&target->versions, memory_order_acquire); + + if (snapshot_seq == UINT64_MAX) + { + if (version == NULL) return -1; + } + else + { + /** + * we find the newest committed version with seq <= snapshot_seq. + * version chain is ordered newest-to-oldest, so we return the first + * version that passes both checks. */ + while (version != NULL) + { + uint64_t version_seq = atomic_load_explicit(&version->seq, memory_order_acquire); + + /* we check if version is within snapshot range */ + if (version_seq <= snapshot_seq) + { + /* if visibility check provided, we verify this version is committed */ + if (visibility_check != NULL) + { + if (visibility_check(visibility_ctx, version_seq)) + { + /* we found the newest committed version within snapshot -- we use it */ + break; + } + /* this version is not committed yet -- thus we check older versions */ + } + else + { + /* no visibility check -- we assume committed (for recovery, etc.) 
*/ + break; + } + } + /* version is too new or not committed -- we check next (older) version */ + version = atomic_load_explicit(&version->next, memory_order_acquire); + } + + if (version == NULL) return -1; /* no visible version */ + } + + /* we always set ttl if provided */ + if (ttl != NULL) *ttl = version->ttl; + + if (version->ttl > 0) + { + if (version->ttl <= skip_list_get_current_time(list)) + { + if (deleted != NULL) *deleted = 1; + *value = NULL; + *value_size = 0; + if (seq != NULL) *seq = atomic_load_explicit(&version->seq, memory_order_acquire); + return 0; /* return success but mark as expired/deleted */ + } + } + + uint8_t is_deleted = VERSION_IS_DELETED(version); + if (deleted != NULL) *deleted = is_deleted; + + /* we return the value (even for tombstones, caller checks deleted flag) */ + if (!is_deleted && version->value != NULL && version->value_size > 0) + { + *value = malloc(version->value_size); + if (*value == NULL) return -1; + memcpy(*value, version->value, version->value_size); + *value_size = version->value_size; + } + else + { + *value = NULL; + *value_size = 0; + } + + if (seq != NULL) *seq = atomic_load_explicit(&version->seq, memory_order_acquire); + + return 0; +} + +int skip_list_get_with_seq_ref(skip_list_t *list, const uint8_t *key, const size_t key_size, + const uint8_t **value, size_t *value_size, int64_t *ttl, + uint8_t *deleted, uint64_t *seq, uint64_t snapshot_seq, + const skip_list_visibility_check_fn visibility_check, + void *visibility_ctx) +{ + if (list == NULL || key == NULL || key_size == 0 || value == NULL || value_size == NULL) + return -1; + + skip_list_node_t *header = atomic_load_explicit(&list->header, memory_order_acquire); + skip_list_node_t *current = header; + const int max_level = atomic_load_explicit(&list->level, memory_order_acquire); + const skip_list_cmp_type_t cmp_type = list->cmp_type; + + for (int i = max_level; i >= 0; i--) + { + skip_list_node_t *next = atomic_load_explicit(¤t->forward[i], 
memory_order_acquire); + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + + while (next != NULL && !NODE_IS_SENTINEL(next)) + { + int cmp = skip_list_compare_keys_with_type(cmp_type, list, next->key, next->key_size, + key, key_size); + if (cmp >= 0) break; + current = next; + next = atomic_load_explicit(¤t->forward[i], memory_order_acquire); + if (SKIP_LIST_LIKELY(next != NULL)) + { + PREFETCH_READ(next); + PREFETCH_READ(next->key); + } + } + } + + skip_list_node_t *target = atomic_load_explicit(¤t->forward[0], memory_order_acquire); + if (target == NULL || NODE_IS_SENTINEL(target)) return -1; + + int cmp = skip_list_compare_keys_with_type(cmp_type, list, target->key, target->key_size, key, + key_size); + if (cmp != 0) return -1; + + skip_list_version_t *version = atomic_load_explicit(&target->versions, memory_order_acquire); + + if (snapshot_seq == UINT64_MAX) + { + if (version == NULL) return -1; + } + else + { + while (version != NULL) + { + uint64_t version_seq = atomic_load_explicit(&version->seq, memory_order_acquire); + + if (version_seq <= snapshot_seq) + { + if (visibility_check != NULL) + { + if (visibility_check(visibility_ctx, version_seq)) + { + break; + } + } + else + { + break; + } + } + version = atomic_load_explicit(&version->next, memory_order_acquire); + } + + if (version == NULL) return -1; + } + + if (ttl != NULL) *ttl = version->ttl; + + if (version->ttl > 0) + { + if (version->ttl <= skip_list_get_current_time(list)) + { + if (deleted != NULL) *deleted = 1; + *value = NULL; + *value_size = 0; + if (seq != NULL) *seq = atomic_load_explicit(&version->seq, memory_order_acquire); + return 0; + } + } + + uint8_t is_deleted = VERSION_IS_DELETED(version); + if (deleted != NULL) *deleted = is_deleted; + + if (!is_deleted && version->value != NULL && version->value_size > 0) + { + *value = version->value; + *value_size = version->value_size; + } + else + { + *value = NULL; + *value_size = 0; + } + + if 
(seq != NULL) *seq = atomic_load_explicit(&version->seq, memory_order_acquire); + + return 0; +} \ No newline at end of file diff --git a/storage/tidesdb/libtidesdb/src/skip_list.h b/storage/tidesdb/libtidesdb/src/skip_list.h new file mode 100644 index 0000000000000..df27b8b11151a --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/skip_list.h @@ -0,0 +1,789 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __SKIP_LIST_H__ +#define __SKIP_LIST_H__ +#include "compat.h" + +/* branch prediction hints for hot paths */ +#if defined(__GNUC__) || defined(__clang__) +#define SKIP_LIST_LIKELY(x) __builtin_expect(!!(x), 1) +#define SKIP_LIST_UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +#define SKIP_LIST_LIKELY(x) (x) +#define SKIP_LIST_UNLIKELY(x) (x) +#endif + +/* forward declarations */ +typedef struct skip_list_node_t skip_list_node_t; +typedef struct skip_list_t skip_list_t; +typedef struct skip_list_version_t skip_list_version_t; +typedef struct skip_list_arena_block_t skip_list_arena_block_t; +typedef struct skip_list_arena_t skip_list_arena_t; + +/* arena alignment for all allocations */ +#define SKIP_LIST_ARENA_ALIGNMENT 8 + +/* maximum number of thread-local blocks for contention-free allocation */ +#define SKIP_LIST_ARENA_MAX_THREADS 64 + +/* default size for thread-local blocks (smaller than shared block to save memory) */ +#define SKIP_LIST_ARENA_TL_BLOCK_SIZE (64 * 1024) + +/** + * skip_list_arena_block_t + * a single contiguous memory block in the arena's linked list + * @param data pointer to the raw memory + * @param used atomic bump pointer (bytes consumed so far) + * @param capacity total bytes available in this block + * @param prev previous block in the chain (for destruction) + */ +struct skip_list_arena_block_t +{ + uint8_t *data; + _Atomic(size_t) used; + size_t capacity; + skip_list_arena_block_t *prev; +}; + +/** + * skip_list_arena_t + * lock-free bump allocator for skip list nodes and versions + * uses thread-local blocks to eliminate atomic contention on the fast path + * each thread gets its own block; only block allocation requires synchronization + * individual frees are no-ops; all memory is reclaimed when the arena is destroyed + * @param current_block atomic pointer to the shared fallback block (rarely used) + * @param block_size default capacity for new blocks + * @param tl_blocks thread-local block pointers indexed by thread 
slot + * @param tl_slot_counter atomic counter for assigning thread slots + * @param all_blocks_head atomic linked list of all blocks for destruction + */ +struct skip_list_arena_t +{ + _Atomic(skip_list_arena_block_t *) current_block; + size_t block_size; + _Atomic(skip_list_arena_block_t *) tl_blocks[SKIP_LIST_ARENA_MAX_THREADS]; + _Atomic(int) tl_slot_counter; + _Atomic(skip_list_arena_block_t *) all_blocks_head; +}; + +/* skip_list_version_t flag bits */ +#define SKIP_LIST_FLAG_DELETED 0x01 /* version is tombstone */ +#define SKIP_LIST_FLAG_SINGLE_DELETE \ + 0x02 /* tombstone subtype, always set together with \ + * SKIP_LIST_FLAG_DELETED. caller promises the key \ + * has been put at most once since the last \ + * single-delete or start, so put+single-delete can \ + * be reaped together at compaction. */ + +/** + * skip_list_cmp_type_t + * comparator type enum + */ +typedef enum +{ + SKIP_LIST_CMP_MEMCMP = 0, /* default memcmp-based comparison */ + SKIP_LIST_CMP_STRING, /* string-based comparison */ + SKIP_LIST_CMP_NUMERIC, /* numeric comparison (8-byte keys) */ + SKIP_LIST_CMP_CUSTOM /* custom comparator function */ +} skip_list_cmp_type_t; + +/* skip_list_node_t flag bits */ +#define SKIP_LIST_NODE_FLAG_SENTINEL 0x01 /* node is a sentinel (header or tail) */ + +#define SKIP_LIST_MAX_CAS_ATTEMPTS 1000 + +/* helper macros for flag access */ +#define VERSION_IS_DELETED(version) \ + (atomic_load_explicit(&(version)->flags, memory_order_acquire) & SKIP_LIST_FLAG_DELETED) + +#define NODE_IS_SENTINEL(node) ((node)->node_flags & SKIP_LIST_NODE_FLAG_SENTINEL) + +/** + * skip_list_version_t + * a single version of a key's value + * @param seq sequence number for MVCC (monotonically increasing) + * @param value value data + * @param value_size size of value + * @param ttl time-to-live + * @param next next older version + * @param flags version flags (deleted, etc) + */ +struct skip_list_version_t +{ + _Atomic(uint64_t) seq; + uint8_t *value; + size_t value_size; + 
int64_t ttl; + _Atomic(skip_list_version_t *) next; + _Atomic(uint8_t) flags; +}; + +/** + * skip_list_comparator_fn + * comparator function type for custom key comparison + * @param key1 first key + * @param key1_size size of first key + * @param key2 second key + * @param key2_size size of second key + * @param ctx context pointer + * @return negative if key1 < key2, 0 if equal, positive if key1 > key2 + */ +typedef int (*skip_list_comparator_fn)(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/* macro to access backward pointers at a specific level */ +#define BACKWARD_PTR(node, lvl, max_level) (node->forward[(max_level) + 1 + (lvl)]) + +/** + * skip_list_node_t + * a key in the skip list with multiple versions + * @param level node level in skip list + * @param node_flags node flags (sentinel, etc) + * @param key key data (NULL for sentinel nodes) + * @param key_size size of key (0 for sentinel nodes) + * @param versions lock-free list of versions (newest first) + * @param forward forward[0..level] forward pointers, forward[level+1..2*level+1] backward pointers + */ +struct skip_list_node_t +{ + uint8_t level; + uint8_t node_flags; + uint8_t *key; + size_t key_size; + _Atomic(skip_list_version_t *) versions; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4200) +#endif + _Atomic(skip_list_node_t *) forward[]; +#ifdef _MSC_VER +#pragma warning(pop) +#endif +}; + +/** + * skip_list_t + * main skip list structure + * @param level current maximum level + * @param max_level maximum allowed level + * @param probability probability for level generation + * @param header sentinel header node (compares less than all keys) + * @param tail sentinel tail node (compares greater than all keys) + * @param total_size total size of all entries + * @param entry_count track entry count atomically to avoid O(n) traversals + * @param cmp_type comparator type enum (memcmp, string, numeric, custom) + * @param 
comparator key comparison function + * @param comparator_ctx context for comparator + * @param cached_time pointer to external cached time (NULL = use time(NULL)) + * @param arena bump allocator for cache-friendly node allocation (NULL = use malloc/free) + */ +typedef struct skip_list_t +{ + _Atomic(int) level; + int max_level; + float probability; + _Atomic(skip_list_node_t *) header; + _Atomic(skip_list_node_t *) tail; + _Atomic(size_t) total_size; + _Atomic(int) entry_count; + skip_list_cmp_type_t cmp_type; + skip_list_comparator_fn comparator; + void *comparator_ctx; + _Atomic(time_t) *cached_time; + skip_list_arena_t *arena; +} skip_list_t; + +/** + * skip_list_cursor_t + * cursor structure for iterating through the skip list + * @param list pointer to the skip list + * @param current current node position + * @param cached_header cached header sentinel for fast boundary checks + * @param cached_tail cached tail sentinel for fast boundary checks + * @param current_version current version on the current node; NULL means use head. 
+ * advanced by skip_list_cursor_advance_in_node and reset on + * every cursor seek/next/prev + */ +typedef struct +{ + skip_list_t *list; + skip_list_node_t *current; + skip_list_node_t *cached_header; + skip_list_node_t *cached_tail; + skip_list_version_t *current_version; +} skip_list_cursor_t; + +/** + * skip_list_comparator_memcmp + * default memcmp-based comparator + * @param key1 first key + * @param key1_size size of first key + * @param key2 second key + * @param key2_size size of second key + * @param ctx context pointer (unused) + * @return negative if key1 < key2, 0 if equal, positive if key1 > key2 + */ +int skip_list_comparator_memcmp(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/** + * skip_list_comparator_string + * string-based comparator + * @param key1 first key + * @param key1_size size of first key + * @param key2 second key + * @param key2_size size of second key + * @param ctx context pointer (unused) + * @return negative if key1 < key2, 0 if equal, positive if key1 > key2 + */ +int skip_list_comparator_string(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/** + * skip_list_comparator_numeric + * numeric comparator + * @param key1 first key + * @param key1_size size of first key + * @param key2 second key + * @param key2_size size of second key + * @param ctx context pointer (unused) + * @return negative if key1 < key2, 0 if equal, positive if key1 > key2 + */ +int skip_list_comparator_numeric(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/** + * skip_list_create_node + * creates a new skip list node + * @param level level of the node + * @param key key data + * @param key_size size of key + * @param value value data + * @param value_size size of value + * @param ttl time-to-live + * @param flags version flags (bitmask of SKIP_LIST_FLAG_*) + * @return pointer to new node, NULL on failure + */ 
+skip_list_node_t *skip_list_create_node(int level, const uint8_t *key, size_t key_size, + const uint8_t *value, size_t value_size, int64_t ttl, + uint8_t flags); + +/** + * skip_list_free_node + * frees a skip list node + * @param node node to free + * @return 0 on success, -1 on failure + */ +int skip_list_free_node(skip_list_node_t *node); + +/** + * skip_list_new + * creates a new skip list with default memcmp comparator + * @param list pointer to skip list pointer + * @param max_level maximum level + * @param probability probability for level generation + * @return 0 on success, -1 on failure + */ +int skip_list_new(skip_list_t **list, int max_level, float probability); + +/** + * skip_list_new_with_comparator + * creates a new skip list with custom comparator + * @param list pointer to skip list pointer + * @param max_level maximum level + * @param probability probability for level generation + * @param comparator custom key comparison function + * @param comparator_ctx context for comparator + * @return 0 on success, -1 on failure + */ +int skip_list_new_with_comparator(skip_list_t **list, int max_level, float probability, + skip_list_comparator_fn comparator, void *comparator_ctx); + +/** + * skip_list_new_with_comparator_and_cached_time + * creates a new skip list with custom comparator and cached time pointer + * @param list pointer to skip list pointer + * @param max_level maximum level + * @param probability probability for level generation + * @param comparator custom key comparison function + * @param comparator_ctx context for comparator + * @param cached_time pointer to external cached time (avoids time() syscalls) + * @return 0 on success, -1 on failure + */ +int skip_list_new_with_comparator_and_cached_time(skip_list_t **list, int max_level, + float probability, + skip_list_comparator_fn comparator, + void *comparator_ctx, + _Atomic(time_t) *cached_time); + +/** + * skip_list_new_with_arena + * creates a new skip list backed by a bump arena for 
cache-friendly node allocation + * all node and version memory is allocated from contiguous blocks, improving spatial + * locality during traversal. individual frees are no-ops; memory is reclaimed when + * the skip list is freed. ideal for memtable skip lists that are filled then freed whole. + * @param list pointer to skip list pointer + * @param max_level maximum level + * @param probability probability for level generation + * @param comparator custom key comparison function + * @param comparator_ctx context for comparator + * @param cached_time pointer to external cached time (avoids time() syscalls) + * @param arena_initial_capacity initial arena block size in bytes (0 = no arena) + * @return 0 on success, -1 on failure + */ +int skip_list_new_with_arena(skip_list_t **list, int max_level, float probability, + skip_list_comparator_fn comparator, void *comparator_ctx, + _Atomic(time_t) *cached_time, size_t arena_initial_capacity); + +/** + * skip_list_random_level + * generates a random level for a new node + * @param list skip list + * @return random level + */ +int skip_list_random_level(const skip_list_t *list); + +/** + * skip_list_compare_keys + * compares two keys using the skip list's comparator + * @param list skip list + * @param key1 first key + * @param key1_size size of first key + * @param key2 second key + * @param key2_size size of second key + * @return negative if key1 < key2, 0 if equal, positive if key1 > key2 + */ +int skip_list_compare_keys(const skip_list_t *list, const uint8_t *key1, size_t key1_size, + const uint8_t *key2, size_t key2_size); + +/** + * skip_list_put_with_seq + * inserts or updates a key-value pair with a specific sequence number + * @param list skip list + * @param key key + * @param key_size key size + * @param value value + * @param value_size value size + * @param ttl time-to-live + * @param seq sequence number for MVCC + * @param flags bitmask of SKIP_LIST_FLAG_*; 0 means a live put, SKIP_LIST_FLAG_DELETED + * means 
a tombstone, optionally OR'd with SKIP_LIST_FLAG_SINGLE_DELETE. + * passing 1 for a regular tombstone remains valid because the value 1 + * equals SKIP_LIST_FLAG_DELETED. + * @return 0 on success, -1 on failure + */ +int skip_list_put_with_seq(skip_list_t *list, const uint8_t *key, size_t key_size, + const uint8_t *value, size_t value_size, int64_t ttl, uint64_t seq, + uint8_t flags); + +/** + * skip_list_delete + * deletes a key (creates tombstone) with a specific sequence number + * @param list skip list + * @param key key data + * @param key_size size of key + * @param seq sequence number for the deletion (must be greater than existing versions) + * @return 0 on success, -1 on failure (including if seq <= existing version seq) + */ +int skip_list_delete(skip_list_t *list, const uint8_t *key, size_t key_size, uint64_t seq); + +/** + * skip_list_batch_entry_t + * entry for batch put operations + * + * flags is a bitmask of SKIP_LIST_FLAG_*. a live put leaves flags = 0; a regular + * tombstone sets SKIP_LIST_FLAG_DELETED; a single-delete tombstone also sets + * SKIP_LIST_FLAG_SINGLE_DELETE on top. callers that previously set deleted = 1 + * continue to work unchanged because the value 1 equals SKIP_LIST_FLAG_DELETED. + */ +typedef struct +{ + const uint8_t *key; + size_t key_size; + const uint8_t *value; + size_t value_size; + uint64_t seq; + int64_t ttl; + uint8_t flags; +} skip_list_batch_entry_t; + +/** + * skip_list_put_batch + * inserts multiple key-value pairs in a batch for better performance + * entries should ideally be sorted by key for optimal performance + * @param list skip list + * @param entries array of batch entries + * @param count number of entries + * @return number of successfully inserted entries; this MAY be less than count when + * individual entries are skipped (e.g. duplicate (key,seq) or a per-entry + * allocation failure) -- compare the result against count to detect a partial + * batch. 
returns -1 only on a critical failure that inserts nothing, NULL list/ + * entries, count == 0, or the update-array allocation failing. + */ +int skip_list_put_batch(skip_list_t *list, const skip_list_batch_entry_t *entries, size_t count); + +/** + * skip_list_get + * retrieves a value by key + * @param list skip list + * @param key key data + * @param key_size size of key + * @param value pointer to value pointer (caller must free) + * @param value_size pointer to value size + * @param ttl pointer to ttl + * @param deleted pointer to deleted flag + * @return 0 on success, -1 on failure + */ +int skip_list_get(skip_list_t *list, const uint8_t *key, size_t key_size, uint8_t **value, + size_t *value_size, int64_t *ttl, uint8_t *deleted); + +/** + * skip_list_get_ref + * zero-copy get that returns a direct pointer into the version data + * the returned pointers are only valid while the caller holds a reference + * to the skip list (e.g. memtable refcount). caller must not free the value. + * @param list skip list + * @param key key data + * @param key_size size of key + * @param value pointer to value pointer (do not free) + * @param value_size pointer to value size + * @param ttl pointer to ttl + * @param deleted pointer to deleted flag + * @return 0 on success, -1 on failure + */ +int skip_list_get_ref(skip_list_t *list, const uint8_t *key, size_t key_size, const uint8_t **value, + size_t *value_size, int64_t *ttl, uint8_t *deleted); + +/** + * skip_list_visibility_check_fn + * Callback function to check if a sequence is visible + * @param opaque_ctx opaque context pointer (e.g., commit_status) + * @param seq sequence number to check + * @return 1 if visible, 0 if not + */ +typedef int (*skip_list_visibility_check_fn)(void *opaque_ctx, uint64_t seq); + +/** + * skip_list_get_with_seq + * retrieves a value by key with sequence number for MVCC snapshot reads + * @param list skip list + * @param key key data + * @param key_size size of key + * @param value pointer to 
value pointer (caller must free) + * @param value_size pointer to value size + * @param ttl pointer to ttl + * @param deleted pointer to deleted flag + * @param seq pointer to sequence number (output) + * @param snapshot_seq snapshot sequence number. UINT64_MAX reads the latest version with no + * snapshot filtering; any other value reads the newest version with seq <= + * snapshot_seq, so 0 matches nothing because sequence numbers start at 1 + * @param visibility_check callback to check if a sequence is committed (NULL = skip check) + * @param visibility_ctx context for visibility check callback + * @return 0 on success, -1 on failure + */ +int skip_list_get_with_seq(skip_list_t *list, const uint8_t *key, size_t key_size, uint8_t **value, + size_t *value_size, int64_t *ttl, uint8_t *deleted, uint64_t *seq, + uint64_t snapshot_seq, skip_list_visibility_check_fn visibility_check, + void *visibility_ctx); + +/** + * skip_list_get_with_seq_ref + * zero-copy MVCC get that returns a direct pointer into the version data + * the returned pointer is only valid while the caller holds a reference + * to the skip list (e.g. memtable refcount). caller must not free the value. 
+ * @param list skip list + * @param key key data + * @param key_size size of key + * @param value pointer to const value pointer (do not free) + * @param value_size pointer to value size + * @param ttl pointer to ttl + * @param deleted pointer to deleted flag + * @param seq pointer to sequence number (output) + * @param snapshot_seq snapshot sequence number (UINT64_MAX = latest; otherwise the newest version + * with seq <= snapshot_seq, and 0 matches nothing since seqs start at 1) + * @param visibility_check callback to check if a sequence is committed + * @param visibility_ctx context for visibility check callback + * @return 0 on success, -1 on failure + */ +int skip_list_get_with_seq_ref(skip_list_t *list, const uint8_t *key, size_t key_size, + const uint8_t **value, size_t *value_size, int64_t *ttl, + uint8_t *deleted, uint64_t *seq, uint64_t snapshot_seq, + skip_list_visibility_check_fn visibility_check, + void *visibility_ctx); + +/** + * skip_list_get_max_seq + * retrieves only the maximum sequence number for a key without allocating value + * optimized for conflict detection where only seq comparison is needed + * @param list skip list + * @param key key data + * @param key_size size of key + * @param out_seq output parameter for sequence number (set to 0 if not found) + * @return 0 if key found, -1 if not found or error + */ +int skip_list_get_max_seq(skip_list_t *list, const uint8_t *key, size_t key_size, + uint64_t *out_seq); + +/** + * skip_list_cursor_init + * initializes a new cursor + * @param cursor pointer to cursor pointer + * @param list skip list + * @return 0 on success, -1 on failure + */ +int skip_list_cursor_init(skip_list_cursor_t **cursor, skip_list_t *list); + +/** + * skip_list_cursor_next + * moves cursor to next entry + * @param cursor cursor + * @return 0 on success, -1 on failure + */ +int skip_list_cursor_next(skip_list_cursor_t *cursor); + +/** + * skip_list_cursor_prev + * moves cursor to previous entry + * @param cursor cursor + 
* @return 0 on success, -1 on failure + */ +int skip_list_cursor_prev(skip_list_cursor_t *cursor); + +/** + * skip_list_cursor_get + * gets key-value at current cursor position + * @param cursor cursor + * @param key pointer to key pointer + * @param key_size pointer to key size + * @param value pointer to value pointer + * @param value_size pointer to value size + * @param ttl pointer to ttl + * @param deleted pointer to deleted flag + * @return 0 on success, -1 on failure + */ +int skip_list_cursor_get(skip_list_cursor_t *cursor, uint8_t **key, size_t *key_size, + uint8_t **value, size_t *value_size, int64_t *ttl, uint8_t *deleted); + +/** + * skip_list_cursor_next_get + * fused next + get in a single call, avoiding redundant sentinel checks + * and enabling better prefetching. returns zero-copy pointers. + * @param cursor cursor + * @param key pointer to key pointer (do not free) + * @param key_size pointer to key size + * @param value pointer to value pointer (do not free) + * @param value_size pointer to value size + * @param ttl pointer to ttl + * @param deleted pointer to deleted flag + * @return 0 on success, -1 on failure (end of list) + */ +int skip_list_cursor_next_get(skip_list_cursor_t *cursor, uint8_t **key, size_t *key_size, + uint8_t **value, size_t *value_size, int64_t *ttl, uint8_t *deleted); + +/** + * skip_list_cursor_get_with_seq + * get key-value pair at cursor position with sequence number + * @param cursor cursor + * @param key pointer to key + * @param key_size pointer to key size + * @param value pointer to value + * @param value_size pointer to value size + * @param ttl pointer to TTL + * @param deleted pointer to deleted flag + * @param seq pointer to sequence number + * @return 0 on success, -1 on failure + */ +int skip_list_cursor_get_with_seq(skip_list_cursor_t *cursor, uint8_t **key, size_t *key_size, + uint8_t **value, size_t *value_size, int64_t *ttl, + uint8_t *deleted, uint64_t *seq); + +/** + * skip_list_cursor_advance_in_node + 
* advance the cursor to the next-older version on the current node without moving + * to the next key. used by mvcc readers and flushers that need every version still + * visible to an active snapshot, not just the latest. resets to head on the next + * cursor seek/next/prev. + * @param cursor cursor + * @return 0 on success, -1 when the version chain on the current node is exhausted + */ +int skip_list_cursor_advance_in_node(skip_list_cursor_t *cursor); + +/** + * skip_list_cursor_free + * frees a cursor + * @param cursor cursor to free + */ +void skip_list_cursor_free(skip_list_cursor_t *cursor); + +/** + * skip_list_cursor_at_start + * checks if cursor is at start + * @param cursor cursor + * @return 1 if at start, 0 if not, -1 on error + */ +int skip_list_cursor_at_start(skip_list_cursor_t *cursor); + +/** + * skip_list_cursor_at_end + * checks if cursor is at end + * @param cursor cursor + * @return 1 if at end, 0 if not, -1 on error + */ +int skip_list_cursor_at_end(const skip_list_cursor_t *cursor); + +/** + * skip_list_cursor_has_next + * checks if cursor has next entry + * @param cursor cursor + * @return 1 if has next, 0 if not + */ +int skip_list_cursor_has_next(skip_list_cursor_t *cursor); + +/** + * skip_list_cursor_has_prev + * checks if cursor has previous entry + * @param cursor cursor + * @return 1 if has prev, 0 if not + */ +int skip_list_cursor_has_prev(skip_list_cursor_t *cursor); + +/** + * skip_list_cursor_goto_last + * moves cursor to last entry + * @param cursor cursor + * @return 0 on success, -1 on failure + */ +int skip_list_cursor_goto_last(skip_list_cursor_t *cursor); + +/** + * skip_list_cursor_goto_first + * moves cursor to first entry + * @param cursor cursor + * @return 0 on success, -1 on failure + */ +int skip_list_cursor_goto_first(skip_list_cursor_t *cursor); + +/** + * skip_list_cursor_seek + * positions cursor at the node before the first key >= target + * @param cursor cursor to position + * @param key target key + * @param 
key_size size of target key + * @return 0 on success, -1 on failure + * + * after calling this function, cursor->current points to the predecessor node. + * callers must call skip_list_cursor_next() to access the actual first key >= target. + * this behavior allows efficient insertion and supports both exact matches and range queries. + */ +int skip_list_cursor_seek(skip_list_cursor_t *cursor, const uint8_t *key, size_t key_size); + +/** + * skip_list_cursor_seek_ge + * seeks cursor directly to the first key >= target, positioning cursor->current on it. + * unlike skip_list_cursor_seek (which parks on the predecessor and requires a separate + * skip_list_cursor_next), this folds the advance in and re-reads forward[0] so a concurrent + * skip_list_put that splices a node < target into the predecessor's forward[0] between the + * descent and the advance cannot leave the cursor on a key below target. + * @param cursor cursor + * @param key target key + * @param key_size size of target key + * @return 0 if positioned on a key >= target, -1 if no such key exists (cursor at end) + */ +int skip_list_cursor_seek_ge(skip_list_cursor_t *cursor, const uint8_t *key, size_t key_size); + +/** + * skip_list_cursor_seek_for_prev + * seeks cursor to last key <= target + * @param cursor cursor + * @param key target key + * @param key_size size of target key + * @return 0 on success, -1 on failure + */ +int skip_list_cursor_seek_for_prev(skip_list_cursor_t *cursor, const uint8_t *key, size_t key_size); + +/** + * skip_list_cursor_valid + * checks if cursor is at a valid position (not at sentinel) + * @param cursor cursor + * @return 1 if valid, 0 if not, -1 on error + */ +int skip_list_cursor_valid(const skip_list_cursor_t *cursor); + +/** + * skip_list_clear + * clears all entries from the skip list + * @param list skip list + * @return 0 on success, -1 on failure + */ +int skip_list_clear(skip_list_t *list); + +/** + * skip_list_free + * frees the skip list and all its nodes + * 
@param list skip list + */ +void skip_list_free(skip_list_t *list); + +/** + * skip_list_check_and_update_ttl + * checks and updates TTL for a node + * @param list skip list + * @param node node to check + * @return 0 on success, -1 on failure + */ +int skip_list_check_and_update_ttl(const skip_list_t *list, skip_list_node_t *node); + +/** + * skip_list_get_size + * gets total size of all entries + * @param list skip list + * @return total size in bytes + */ +size_t skip_list_get_size(skip_list_t *list); + +/** + * skip_list_count_entries + * counts number of entries in skip list + * @param list skip list + * @return number of entries + */ +int skip_list_count_entries(skip_list_t *list); + +/** + * skip_list_get_min_key + * gets the minimum key in the skip list + * @param list skip list + * @param key pointer to key pointer + * @param key_size pointer to key size + * @return 0 on success, -1 on failure + */ +int skip_list_get_min_key(skip_list_t *list, uint8_t **key, size_t *key_size); + +/** + * skip_list_get_max_key + * gets the maximum key in the skip list + * @param list skip list + * @param key pointer to key pointer + * @param key_size pointer to key size + * @return 0 on success, -1 on failure + */ +int skip_list_get_max_key(skip_list_t *list, uint8_t **key, size_t *key_size); + +#endif /* __SKIP_LIST_H__ */ \ No newline at end of file diff --git a/storage/tidesdb/libtidesdb/src/tidesdb.c b/storage/tidesdb/libtidesdb/src/tidesdb.c new file mode 100644 index 0000000000000..a9e679a461d17 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/tidesdb.c @@ -0,0 +1,35576 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tidesdb.h" + +#include +#include +#ifndef _WIN32 +#include +#endif + +#include "xxhash.h" + +/* read profiling macros */ +#ifdef TDB_ENABLE_READ_PROFILING +#define PROFILE_INC(db, field) atomic_fetch_add(&(db)->read_stats.field, 1) +#define PROFILE_ADD(db, field, val) atomic_fetch_add(&(db)->read_stats.field, val) +#else +#define PROFILE_INC(db, field) ((void)0) +#define PROFILE_ADD(db, field, val) ((void)0) +#endif + +/* global log level definition */ +_Atomic(int) _tidesdb_log_level = TDB_LOG_DEBUG; + +/* global log file pointer (NULL = stderr, non-NULL = file) */ +FILE *_tidesdb_log_file = NULL; + +/* global log truncation threshold (0 = no truncation) */ +size_t _tidesdb_log_truncate = 0; + +/* global log file path for truncation */ +char _tidesdb_log_path[MAX_FILE_PATH_LENGTH] = {0}; + +/* mutex to protect log file access during truncation */ +static pthread_mutex_t tidesdb_log_mutex = PTHREAD_MUTEX_INITIALIZER; + +typedef struct tidesdb_flush_work_t tidesdb_flush_work_t; +typedef struct tidesdb_compaction_work_t tidesdb_compaction_work_t; +typedef struct tidesdb_unified_flush_barrier_t tidesdb_unified_flush_barrier_t; +typedef tidesdb_memtable_t tidesdb_immutable_memtable_t; + +/* kv pair flags -- one uint8_t carrying two disjoint groups. + * + * PERSISTENT (0x01..0x10) describe the entry's data and are the ONLY bits that + * may reach disk; the klog serializer masks the byte with + * TDB_KV_FLAG_PERSISTENT_MASK so the transient group below can never leak. 
*/ +#define TDB_KV_FLAG_TOMBSTONE 0x01 +#define TDB_KV_FLAG_HAS_TTL 0x02 +#define TDB_KV_FLAG_HAS_VLOG 0x04 +#define TDB_KV_FLAG_DELTA_SEQ \ + 0x08 /* serialization-only: seq is delta-encoded, stripped on read \ + */ +#define TDB_KV_FLAG_SINGLE_DELETE \ + 0x10 /* tombstone subtype -- caller promises the key was put \ + * at most once since the last single-delete or start, \ + * so put+single-delete can be dropped together at any \ + * compaction that sees both in the same merge input. \ + * always set alongside TDB_KV_FLAG_TOMBSTONE so every \ + * existing tombstone check keeps working unchanged. */ + +/* TRANSIENT (0x20..0x80) -- in-memory memory-ownership bookkeeping for + * tidesdb_kv_pair_free, never written to disk. kv_pair_create sets ARENA on + * every kv it builds (including compaction output), so these MUST be masked off + * before serialization. */ +#define TDB_KV_FLAG_POP_BUF 0x20 /* lives in reusable pop buffer, do not free */ +#define TDB_KV_FLAG_BORROWED 0x40 /* points into block data, do not free */ +#define TDB_KV_FLAG_ARENA 0x80 /* single struct+key+value allocation */ + +/* the kv entry-flag bits that describe tombstone-ness and must flow through + * every copy / materialisation path (pop_buf, inline_kv for sstable and + * memtable sources, kv_pair_create from an sstable entry, etc). using this + * single mask prevents forgetting the single-delete bit at a site that only + * remembered the plain tombstone bit. */ +#define TDB_KV_TOMBSTONE_FLAG_MASK (TDB_KV_FLAG_TOMBSTONE | TDB_KV_FLAG_SINGLE_DELETE) + +/* the persistent bits -- the only flags permitted onto disk. the klog serializer + * masks each entry's flag byte with this so the transient (in-memory) bits above + * cannot leak into the on-disk format (notably ARENA, which kv_pair_create sets + * on every compaction-written kv). 
*/ +#define TDB_KV_FLAG_PERSISTENT_MASK \ + (TDB_KV_FLAG_TOMBSTONE | TDB_KV_FLAG_HAS_TTL | TDB_KV_FLAG_HAS_VLOG | TDB_KV_FLAG_DELTA_SEQ | \ + TDB_KV_FLAG_SINGLE_DELETE) + +/* the in-memory-only group as a mask. the deserialize path strips these (along + * with DELTA_SEQ) so a stray transient bit serialized by an OLDER build -- before + * the write path masked them -- cannot survive into an in-memory entry. */ +#define TDB_KV_FLAG_TRANSIENT_MASK (TDB_KV_FLAG_POP_BUF | TDB_KV_FLAG_BORROWED | TDB_KV_FLAG_ARENA) + +#define TDB_LOG_FILE "LOG" +#define TDB_WAL_PREFIX "wal_" +#define TDB_WAL_EXT ".log" +#define TDB_UNIFIED_WAL_PREFIX "uwal_" +#define TDB_UNIFIED_WAL_MAGIC 0x55AAU +#define TDB_UNIFIED_WAL_MAGIC_SIZE sizeof(uint16_t) +#define TDB_UNIFIED_CF_PREFIX_SIZE 4 +#define TDB_UNIFIED_SPLITS_INITIAL_CAP 8 +#define TDB_UNIFIED_CF_INDEX_MAP_FILE "UNIMAP" +#define TDB_UNIFIED_CF_INDEX_MAP_TMP "UNIMAP.tmp" +#define TDB_UNIFIED_CF_INDEX_MAP_INITIAL_CAP 8 +#define TDB_UNIFIED_CF_INDEX_MAP_LINE_MAX (TDB_MAX_CF_NAME_LEN + 32) +#define TDB_REPLICA_WAL_TMP "replica_wal_tmp.log" +#define TDB_REPLICA_MANIFEST_TMP "MANIFEST.replica_tmp" +#define TDB_PREFIXED_KEY_STACK_MAX 256 +#define TDB_BUP_CPY_FILE_SRC_MODE "rb" +#define TDB_BUP_CPY_FILE_DST_MODE "wb" + +#define TDB_CNF_FILE_MODE "w" + +/* stack-with-heap-fallback for prefixed keys -- declares stack_buf and points name at it when total_size <= TDB_PREFIXED_KEY_STACK_MAX, otherwise at a malloc(total_size) buffer. the macro does not check that malloc result, so callers must NULL-check name; release with TDB_PREFIXED_KEY_FREE, which frees only the heap case. */ +#define TDB_PREFIXED_KEY_ALLOC(name, total_size, stack_buf) \ + uint8_t stack_buf[TDB_PREFIXED_KEY_STACK_MAX]; \ + uint8_t *name = \ + ((total_size) <= TDB_PREFIXED_KEY_STACK_MAX) ? 
stack_buf : (uint8_t *)malloc(total_size) + +#define TDB_PREFIXED_KEY_FREE(name, stack_buf) \ + do \ + { \ + if ((name) != (stack_buf)) free(name); \ + } while (0) + +#define TDB_COLUMN_FAMILY_CONFIG_NAME "config" +#define TDB_COLUMN_FAMILY_MANIFEST_NAME "MANIFEST" +#define TDB_COLUMN_FAMILY_CONFIG_EXT ".ini" +#define TDB_LEVEL_PREFIX "L" +#define TDB_LEVEL_PARTITION_PREFIX "P" +#define TDB_SSTABLE_KLOG_EXT ".klog" +#define TDB_SSTABLE_VLOG_EXT ".vlog" +#define TDB_LOCK_FILE "LOCK" +#define TDB_CACHE_KEY_SIZE 64 +#define TDB_KLOG_BLOCK_STACK_ENTRIES 256 /* stack buffer size for small klog block index */ +#define TDB_BLOCK_INDEX_MAGIC 0x4B494459 /* "KIDY" -- indexed block cache header */ +#define TDB_BLOCK_INDEX_HDR_BASE 12 /* magic(4) + header_size(4) + num_entries(4) */ +#define TDB_BLOCK_INDEX_ENTRY_STRIDE \ + 20 /* entry_off(4) + key_off(4) + key_size(4) + seq_lo(4) + seq_hi(4) */ +#define TDB_BLOCK_IDX_ENTRY_OFF 0 /* offset of entry_off within index entry */ +#define TDB_BLOCK_IDX_KEY_OFF 4 /* offset of key_off within index entry */ +#define TDB_BLOCK_IDX_KEY_SIZE 8 /* offset of key_size within index entry */ +#define TDB_BLOCK_IDX_SEQ_LO 12 /* offset of abs_seq low 32 bits */ +#define TDB_BLOCK_IDX_SEQ_HI 16 /* offset of abs_seq high 32 bits */ +#define TDB_SSTABLE_METADATA_MAGIC 0x5353544D /* "SSTM" */ +#define TDB_SSTABLE_METADATA_HEADER_SIZE 84 +#define TDB_SSTABLE_METADATA_CHECKSUM_SIZE 8 +#define TDB_SSTABLE_METADATA_TOMBSTONE_SIZE 8 +/* btree-only metadata appended after max_key when SSTABLE_FLAG_BTREE is set + * btree_root_offset(8) + btree_first_leaf(8) + btree_last_leaf(8) + + * btree_node_count(8) + btree_height(4) */ +#define TDB_SSTABLE_METADATA_BTREE_SIZE 36 +/* chunked-aux descriptor appended after tombstone_count when SSTABLE_FLAG_CHUNKED_AUX + * is set, bloom_blob_offset(8) + bloom_blob_size(8) + index_blob_offset(8) + + * index_blob_size(8) */ +#define TDB_SSTABLE_METADATA_CHUNKED_AUX_SIZE 32 +#define TDB_SSTABLE_METADATA_FIXED_SIZE \ + 
(TDB_SSTABLE_METADATA_HEADER_SIZE + TDB_SSTABLE_METADATA_CHECKSUM_SIZE) +/* the largest payload a single block can frame -- the block manager's on-disk + * size field is a uint32, so block_manager_block_create rejects anything larger. + * a bloom-filter or block-index footer blob at or below this is written as ONE + * block, no chunking, SSTABLE_FLAG_CHUNKED_AUX stays clear, and the footer is + * byte-identical to (and readable by) older binaries. every bloom that can exist + * (m <= UINT32_MAX, ~900MB serialized) and every block index is well under this, + * so chunking is dormant for all real data today -- it only splits a blob that + * genuinely cannot fit one block, reachable only if bloom m is ever widened past + * 32-bit. chunking at any smaller size would needlessly fragment real footers and + * flip those sstables to the forward-incompatible chunked format for no benefit. */ +#define TDB_AUX_BLOCK_CHUNK_MAX ((uint64_t)UINT32_MAX) +/* sentinel for tidesdb_sstable_t.tombstone_count when the footer was written before + * SSTABLE_FLAG_TOMBSTONE_COUNT existed. trigger and stats code skip such sstables. */ +#define TDB_TOMBSTONE_COUNT_UNKNOWN UINT64_MAX +#define TDB_KLOG_BLOCK_SIZE (64 * 1024) +#define TDB_STACK_SSTS 64 +#define TDB_ITER_STACK_KEY_SIZE 256 +#define TDB_BACKUP_COPY_BUFFER_SIZE (256 * 1024) + +/* shift used to combine two uint32_t halves (sq_hi, sq_lo) back into the original uint64_t + * abs_seq stored in the block index. inverse of the (uint32_t)val / (uint32_t)(val >> 32) + * split at write time. */ +#define TDB_U64_HI_LO_SHIFT 32 + +/* initial capacity (and grow floor) for the iterator's double-buffered pop arena that + * lets merge_heap_pop materialise borrowed kvs without malloc. capacity grows to fit + * larger kvs but never shrinks below this. 
*/ +#define TDB_MERGE_POP_BUF_INITIAL_CAP 256 + +/* extra room reserved at the end of a path buffer for the per-sstable suffix + * "_<id>.{klog,vlog}" or the cf-level "/MANIFEST" / "/config.ini" suffixes appended by + * tidesdb_sstable_create and tdb_cold_start_download_worker. an unsigned 64-bit id + * decimal-encodes to 20 chars plus underscore, dot, and the extension. */ +#define TDB_PATH_SUFFIX_RESERVE 32 + +/* block cache key encoding -- the key is "cf_name:klog_filename:<position>" where + * the position is rendered as exactly 16 lowercase hex chars (2 nibbles per byte of a + * uint64_t) for fast appending without snprintf. NOTE(review): placeholder layout restored from a garbled comment -- confirm against the key builder. */ +#define TDB_CACHE_KEY_SEPARATOR ':' +#define TDB_CACHE_KEY_HEX_DIGITS 16 + +/* initial capacity values for dynamic arrays */ +#define TDB_INITIAL_MERGE_HEAP_CAPACITY 16 +#define TDB_INITIAL_CF_CAPACITY 16 +#define TDB_INITIAL_COMPARATOR_CAPACITY 8 +#define TDB_INITIAL_TXN_OPS_CAPACITY 16 +#define TDB_INITIAL_TXN_READ_SET_CAPACITY 16 +#define TDB_INITIAL_TXN_CF_CAPACITY 4 +#define TDB_INITIAL_TXN_SAVEPOINT_CAPACITY 4 + +/* stack buffer sizes for hot-path allocations */ +#define TDB_STACK_IMM_SNAPSHOT 16 /* stack slots for unified immutable snapshot */ +#define TDB_RECOVER_IMM_SCAN_STACK 64 /* stack slots for the recovery max-seq immutable scan */ +#define TDB_STACK_COMMIT_HOOK_OPS 16 /* stack slots for commit hook operations */ +#define TDB_STACK_ITER_SOURCES 16 /* stack slots for iterator temp_sources */ + +/* default column family config values */ +#define TDB_INITIAL_BLOCK_INDEX_CAPACITY 16 + +/* worst-case bytes for a LEB128 varint encoding of a uint64_t -- 7 data bits per byte plus + * a continuation bit, ceil(64/7) = 10 */ +#define TDB_VARINT_MAX_BYTES 10 + +/* number of independent block-cache instances kept in step on memory pressure (block + * cache + btree node cache) and the fraction of resolved_memory_limit they're allowed + * to use together when clamping. the rest is left for memtables, bloom filters, and + * write ops. 
*/ +#define TDB_BLOCK_CACHE_INSTANCES 2 +#define TDB_BLOCK_CACHE_MEM_FRACTION 0.30 + +/* create write set hash table at this many ops */ +#define TDB_TXN_WRITE_HASH_THRESHOLD 64 +/* create read set hash table at this many reads */ +#define TDB_TXN_READ_HASH_THRESHOLD 64 +/* scan last N ops for small txns */ +#define TDB_TXN_SMALL_SCAN_LIMIT 64 +/* grow read set by this amount */ +#define TDB_TXN_READ_SET_BATCH_GROW 256 +/* arena size for read key allocation (4KB) */ +#define TDB_TXN_READ_KEY_ARENA_SIZE 4096 +/* initial arena array capacity */ +#define TDB_TXN_READ_KEY_ARENA_INITIAL_CAPACITY 4 +/* batch transaction-memory publishes to db->txn_memory_bytes in chunks this large so the + * per-op write/read paths never hit the shared atomic; the global counter stays accurate to + * within roughly this much per large in-flight transaction */ +#define TDB_TXN_MEM_PUBLISH_THRESHOLD (256 * 1024) +/* initial capacity for active txn list */ +#define TDB_ACTIVE_TXN_INITIAL_CAPACITY 1024 +/* hash table capacity for write set (power of 2) */ +#define TDB_WRITE_SET_HASH_CAPACITY 2048 +/* hash table capacity for read set (power of 2) */ +#define TDB_READ_SET_HASH_CAPACITY 2048 +/* empty slot marker for write set hash */ +#define TDB_WRITE_SET_HASH_EMPTY (-1) +/* empty slot marker for read set hash */ +#define TDB_READ_SET_HASH_EMPTY (-1) +/* xxhash seed for transaction hash tables */ +#define TDB_TXN_HASH_SEED 0x9e3779b9 +/* max linear probe attempts before giving up */ +#define TDB_TXN_MAX_PROBE_LENGTH 32 + +#define TDB_TXN_DEDUP_SKIP_THRESHOLD 8 /* skip dedup hash for txns with fewer ops */ +#define TDB_TXN_DEDUP_MIN_HASH_SIZE 64 /* minimum hash size when dedup is used */ +#define TDB_TXN_DEDUP_HASH_MULTIPLIER 2 /* hash size = num_ops * multiplier */ +#define TDB_TXN_DEDUP_MAX_TRACKED 1024 /* max slots to track for fast iteration */ +#define TDB_MAX_TXN_OPS_BEFORE_BATCH 10 /* use batch methods when ops exceed this threshold */ + +/* flush and close retry configuration */ 
+#define TDB_FLUSH_ENQUEUE_MAX_ATTEMPTS 100 +#define TDB_FLUSH_ENQUEUE_BACKOFF_US 10000 +#define TDB_FLUSH_RETRY_DELAY_US 100000 +#define TDB_CLOSE_FLUSH_WAIT_MAX_ATTEMPTS 100 +#define TDB_CLOSE_FLUSH_WAIT_SLEEP_US 10000 +#define TDB_CLOSE_TXN_WAIT_SLEEP_US 1000 +#define TDB_COMPACTION_FLUSH_WAIT_SLEEP_US 10000 +#define TDB_CANCEL_BG_POLL_US 5000 /* 5ms poll while draining cancel */ +#define TDB_CANCEL_BG_MAX_WAIT_MS 30000 /* cap so a stuck merge can't hang */ +#define TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS 100 +#define TDB_CHECKPOINT_COMPACTION_WAIT_MAX_ATTEMPTS 200 +#define TDB_CHECKPOINT_COMPACTION_WAIT_SLEEP_US 50000 +#define TDB_OPENING_WAIT_MAX_MS 100 +#define TDB_MAX_FFLUSH_RETRY_ATTEMPTS 5 +#define TDB_FLUSH_RETRY_BACKOFF_US 100000 +#define TDB_SHUTDOWN_BROADCAST_ATTEMPTS 10 +#define TDB_SHUTDOWN_BROADCAST_INTERVAL_US 5000 + +/* thread name prefix for all tidesdb background threads (15 char limit on posix) */ +#define TDB_THREAD_PREFIX "tdb." +#define TDB_THREAD_NAME_LEN 16 + +/* sstable reaper thread configuration */ +#define TDB_SSTABLE_REAPER_SLEEP_US 100000 +/* how many cfs the reaper retries deferred flushes for in a single cycle */ +#define TDB_REAPER_DEFERRED_FLUSH_BATCH 64 +#define TDB_SSTABLE_REAPER_EVICT_RATIO 0.25 + +/* replica sync thread configuration */ +#define TDB_REPLICA_SYNC_DEFAULT_INTERVAL_US 5000000 +#define TDB_REPLICA_SYNC_SLEEP_SLICE_US 100000 + +/* default interval for unified WAL fsync escalation when the unified memtable + * is in TDB_SYNC_INTERVAL mode and unified_memtable_sync_interval_us is 0 */ +#define TDB_UNIFIED_WAL_SYNC_DEFAULT_INTERVAL_US 1000000 + +#define TDB_WAL_STACK_BUFFER_SIZE 512 + +/* deferred free configuration for retired sstable arrays + * when a level's sstable array is swapped (flush/compaction), the old array cannot be freed + * until all concurrent readers have finished. instead of spinning unboundedly, we try a brief + * spin and then defer the free to the reaper thread which sweeps periodically. 
*/ +#define TDB_DEFERRED_FREE_SPIN_ATTEMPTS 64 /* brief spin before deferring */ + +/* immutable memtable cleanup configuration + * cleanup runs frequently to prevent memory exhaustion from old immutables + * only flushed immutables with no active readers are removed (safe cleanup) */ +#define TDB_IMMUTABLE_CLEANUP_THRESHOLD 2 /* check every 2 flushes */ +#define TDB_IMMUTABLE_MAX_QUEUE_SIZE 4 /* trigger cleanup when queue > 4 */ +#define TDB_IMMUTABLE_FORCE_CLEANUP_SIZE 8 /* run a cleanup pass once the queue reaches this */ +/* the immutable queue is bounded by the per-CF l0_queue_stall_threshold via writer + * backpressure. this headroom sits above that threshold as a last-resort hard cap -- + * a backstop only for the freeze/recovery paths that bypass backpressure. it scales + * WITH the configured threshold (see tdb_cf_immutable_hard_cap), so raising the + * threshold raises the cap in lockstep instead of being silently clamped. the + * lock-free snapshot array grows to match, so there is no hidden ceiling. 
*/ +#define TDB_IMM_QUEUE_HEADROOM 6 +#define TDB_IMMUTABLE_HARD_CAP_WAIT_US \ + 1000 /* 1ms poll -- resume the blocked freeze promptly so \ + * the flush pipeline does not stall under load */ +#define TDB_IMMUTABLE_HARD_CAP_MAX_WAIT 5000 /* max 5s wait (5000 iterations * 1ms) */ + +/* refcount drain configuration for flush worker + * used when waiting for in-flight writers to finish before flushing memtable */ +#define TDB_REFCOUNT_DRAIN_SPIN_THRESHOLD 64 /* spin with cpu_pause up to this count */ +#define TDB_REFCOUNT_DRAIN_YIELD_THRESHOLD 1024 /* yield up to this count, then sleep */ +#define TDB_REFCOUNT_DRAIN_SLEEP_US 10 /* sleep interval after yield threshold */ +#define TDB_REFCOUNT_DRAIN_LOG_INTERVAL 0xFFFF /* log warning every ~64K iterations */ +#define TDB_REFCOUNT_DRAIN_BASELINE 2 /* baseline refcount -- 1 original + 1 work ref */ +#define TDB_ACTIVE_REF_MAX_ATTEMPTS \ + 16 /* bound on load+try_ref+revalidate retries when active is rotating */ + +/* default L0/L1 management configuration */ +#define TDB_DEFAULT_L1_FILE_COUNT_TRIGGER 4 +#define TDB_DEFAULT_L0_QUEUE_STALL_THRESHOLD 10 + +/* default tombstone density trigger configuration -- 0.0 disables the check, an sstable + * must hold at least TDB_DEFAULT_TOMBSTONE_DENSITY_MIN_ENTRIES entries before its density + * counts toward the trigger so tiny sstables can't cause spurious compactions */ +#define TDB_DEFAULT_TOMBSTONE_DENSITY_TRIGGER 0.0 +#define TDB_DEFAULT_TOMBSTONE_DENSITY_MIN_ENTRIES 1024 + +/* backpressure timing configuration + * */ +#define TDB_BACKPRESSURE_STALL_CHECK_INTERVAL_US 10000 /* 10ms between stall checks */ +#define TDB_BACKPRESSURE_STALL_MAX_ITERATIONS \ + 1000 /* ~10s poll budget at STALL_CHECK_INTERVAL_US; L0 stall counts consecutive no-progress \ + polls, memory-pressure stall counts total */ +#define TDB_BACKPRESSURE_HIGH_DELAY_US 2000 /* 2ms for high pressure */ +#define TDB_BACKPRESSURE_ELEVATED_DELAY_US 200 /* 0.2ms yield for elevated memory pressure */ +#define 
TDB_BACKPRESSURE_MODERATE_DELAY_US 500 /* 0.5ms for moderate pressure */ +#define TDB_BACKPRESSURE_HIGH_THRESHOLD_RATIO 0.8 /* 80% of stall threshold */ +#define TDB_BACKPRESSURE_MODERATE_THRESHOLD_RATIO 0.5 /* 50% of stall threshold */ +#define TDB_BACKPRESSURE_L1_HIGH_MULTIPLIER 4 /* 4x L1 trigger = high */ +#define TDB_BACKPRESSURE_L1_MODERATE_MULTIPLIER 3 /* 3x L1 trigger = moderate */ +/* active memtable hard ceiling as a multiple of write_buffer_size. the + * commit-time threshold check allows up to 1.5x for batching headroom; this + * leaves a small overshoot margin above that before apply_backpressure + * stalls the writer until rotation completes */ +#define TDB_BACKPRESSURE_ACTIVE_MT_CEILING_MULT 2 + +/* backpressure stall warnings (ceiling stall, immutable-queue-critical) are emitted from the + * write/flush hot paths -- a per-event log floods under sustained backpressure (every stalling + * writer, every flush completion). throttle each to at most one line per CF per this many seconds + * so the condition stays visible without drowning the log. see tdb_log_throttle. 
*/ +#define TDB_BACKPRESSURE_STALL_LOG_INTERVAL_SEC 1 + +/* global memory pressure configuration (computed by reaper, consumed by write path) + * graduated response based on ratio of used memory to resolved_memory_limit */ +#define TDB_MEMORY_PRESSURE_NORMAL 0 /* < 60% -- no action */ +#define TDB_MEMORY_PRESSURE_ELEVATED 1 /* 60-75% -- reduce flush headroom to 0 */ +#define TDB_MEMORY_PRESSURE_HIGH 2 /* 75-95% -- force flush + write delay */ +#define TDB_MEMORY_PRESSURE_CRITICAL 3 /* >= 95% -- block writes, emergency flush */ +#define TDB_MEMORY_PRESSURE_ELEVATED_RATIO 0.60 /* ratio threshold for elevated */ +#define TDB_MEMORY_PRESSURE_HIGH_RATIO 0.75 /* ratio threshold for high */ +#define TDB_MEMORY_PRESSURE_CRITICAL_RATIO 0.95 /* ratio threshold for critical */ +#define TDB_MEMORY_AUTO_LIMIT_RATIO 0.50 /* auto limit = 50% of total memory */ +#define TDB_MEMORY_MIN_LIMIT_RATIO 0.05 /* minimum limit = 5% of total memory */ +#define TDB_MEMORY_OS_CHECK_INTERVAL 50 +#define TDB_MEMORY_OS_CRITICAL_RATIO 0.05 /* OS critically low if < N% free */ + +/* a single block read (e.g. a bloom-filter footer block) is refused if its + * payload would exceed this fraction of resolved_memory_limit -- one block must + * never claim half the database's whole memory budget. pushed into the block + * manager via block_manager_set_max_safe_block_bytes. */ +#define TDB_MEMORY_MAX_BLOCK_FRACTION_DENOM 2 + +/* lock-free immutable snapshot configuration */ +#define TDB_IMM_SNAP_ACQUIRE_SPIN_LIMIT 4 /* spins before yielding in snapshot acquire */ + +/* sstable retry backoff configuration (exponential backoff for retry_level) */ +#define TDB_SST_RETRY_INITIAL_SPINS 1 /* initial cpu_pause count on first retry */ +#define TDB_SST_RETRY_MAX_SPINS 16 /* maximum cpu_pause count per retry */ +#define TDB_SST_RETRY_MAX_LEVEL_RETRIES 4 /* max full level restarts before skipping dead ssts */ + +/* file-open descriptor-pressure handling. 
block_manager_open can fail with EMFILE/ENFILE when the + * process is momentarily at its open-fd ceiling (many sstables open under heavy flush+compaction). + * tidesdb_bm_open treats that as transient backpressure, it wakes the reaper to close idle sstables + * and retries a bounded number of times before failing, so an fd spike does not permanently wedge + * flush or compaction. all other errors (and success) return immediately. */ +#define TDB_BM_OPEN_EMFILE_MAX_RETRIES 5 +#define TDB_BM_OPEN_EMFILE_BACKOFF_US \ + 20000 /* 20ms between retries -- reaper evicts idle ssts on wake */ + +/* sstable fd budget. each open sstable holds two descriptors (klog + vlog). at open we bound + * max_open_sstables so 2*cap fits under the process open-file limit, reserving headroom for WALs, + * the manifest, object-store handles, and stdio. the floor keeps the db usable even on a tiny + * descriptor limit (the EMFILE retry in tidesdb_bm_open then absorbs transient overshoot). */ +#define TDB_FDS_PER_SSTABLE 2 +#define TDB_FD_RESERVE_NON_SSTABLE 64 /* descriptors reserved for WALs/manifest/objstore/stdio */ +#define TDB_MIN_OPEN_SSTABLES 4 /* never clamp the sstable budget below this */ + +/* reader fd reservation. point reads and iterators may open NEW sstables only while + * num_open_sstables stays under max_open_sstables minus this reserve, which is held for the + * flush / compaction / commit-conflict-check paths -- those MUST make progress to relieve fd + * pressure, whereas a read can degrade to a retryable error (see the scan / iter open-failure + * paths). this bounds reader-induced opens so writes never starve, preventing the fd wedge. */ +#define TDB_FD_READER_RESERVE_DIVISOR 8 /* reserve = max_open_sstables / this ... */ +#define TDB_FD_READER_RESERVE_MIN 16 /* ... but at least this many sstables ... */ +#define TDB_FD_READER_RESERVE_MAX_DIVISOR \ + 2 /* ... 
and never more than max_open_sstables / this, so reads keep at least half the \ + * fd budget even when max_open_sstables is smaller than the reserve floor */ + +/* sstable reaper eviction sentinel -- set on refcount during block manager close + * to prevent concurrent try_ref from acquiring a reference on an evicting sstable */ +#define TDB_REFCOUNT_EVICTING (-1) +#define TDB_EVICT_WAIT_MAX \ + 8192 /* max escalating-backoff iters waiting out a transient reaper \ + * eviction before giving up. a normal block_manager_close \ + * clears in microseconds; the >YIELD_THRESHOLD tail sleeps \ + * ~10us each, capping the wait near 70ms -- far longer than any \ + * close, so a live evicting sstable is always waited out. */ + +/* time conversion constants for pthread_cond_timedwait */ +#define TDB_MICROSECONDS_PER_SECOND 1000000 +#define TDB_NANOSECONDS_PER_SECOND 1000000000 +#define TDB_NANOSECONDS_PER_MICROSECOND 1000 + +#define TDB_MAX_TXN_CFS 256 +#define TDB_MAX_CF_DISCOVERY 256 +#define TDB_MAX_PATH_LEN 4096 +#define TDB_MAX_TXN_OPS INT_MAX +#define TDB_MEMORY_PERCENTAGE 0.6 +#define TDB_MIN_KEY_VALUE_SIZE (1024 * 1024) +#define TDB_MIN_LEVEL_SSTABLES_INITIAL_CAPACITY 32 +#define TDB_DISK_SPACE_CHECK_INTERVAL_SECONDS 60 +#define TDB_NO_CF_SYNC_SLEEP_US 100000 + +/* object store retry constants */ +#define TDB_UPLOAD_MAX_RETRIES 3 +#define TDB_UPLOAD_INITIAL_BACKOFF_US 100000 /* 100ms */ +#define TDB_UPLOAD_BACKOFF_MULTIPLIER 4 /* 100ms -> 400ms -> 1600ms */ +#define TDB_DOWNLOAD_MAX_RETRIES 3 +#define TDB_DOWNLOAD_INITIAL_BACKOFF_US 50000 /* 50ms */ +#define TDB_DOWNLOAD_BACKOFF_MULTIPLIER 4 /* 50ms -> 200ms -> 800ms */ +#define TDB_LIST_MAX_RETRIES 4 +#define TDB_LIST_INITIAL_BACKOFF_US 50000 /* 50ms */ + +/* klog block configuration */ +#define TDB_KLOG_BLOCK_INITIAL_CAPACITY 512 + +/* block index validation */ +#define TDB_BLOCK_INDEX_PREFIX_MIN 4 +#define TDB_BLOCK_INDEX_PREFIX_MAX 256 +#define TDB_BLOCK_INDEX_MAX_COUNT INT_MAX + +/* empty block index placeholder 
( 4 byte LE count (0) followed by 1 byte prefix_len ) */ +#define TDB_EMPTY_BLOCK_INDEX_SIZE 5 + +/* merge and serialization configuration */ +#define TDB_MERGE_MIN_ESTIMATED_ENTRIES 100 +#define TDB_KLOG_DELTA_SEQ_MAX_DIFF 1000000 + +/* range cost estimation model weights (relative, used by tidesdb_range_cost) */ +#define TDB_RANGE_COST_COMPRESSION_WEIGHT 1.5 /* block read plus decompress multiplier */ +#define TDB_RANGE_COST_PER_ENTRY_WEIGHT 0.01 /* per sstable entry processing cost */ +#define TDB_RANGE_COST_PER_SOURCE_WEIGHT 0.5 /* merge heap overhead per overlapping source */ +#define TDB_RANGE_COST_MEMTABLE_WEIGHT 0.001 /* per active memtable entry cost */ + +/* tidesdb_get_stats average entry size split between key and value */ +#define TDB_STATS_AVG_KEY_FRACTION 0.3 +#define TDB_STATS_AVG_VALUE_FRACTION 0.7 + +/* iterator seek configuration */ +/* max blocks to scan during seek */ +#define TDB_ITER_SEEK_MAX_BLOCKS_SCAN 100000 + +#define TDB_COMMIT_STATUS_BUFFER_SIZE 65536 + +/* uint32_t max value */ +#define TDB_MAX_KEY_VALUE_SIZE UINT32_MAX + +/** + * tidesdb_deferred_free_node_t + * node in lock-free singly-linked list for deferred reclamation of retired sstable arrays + * pushed by flush/compaction workers, swept by reaper thread + * @param ptr pointer to the retired array to free + * @param level level whose array_readers must reach 0 before freeing + * @param sst_unrefs optional array of sstable pointers to unref when freed + * @param sst_unrefs_count number of entries in sst_unrefs + * @param db database handle needed for sstable_unref (only when sst_unrefs_count > 0) + * @param next pointer to next node in the deferred free list + */ +struct tidesdb_deferred_free_node_t +{ + void *ptr; + tidesdb_level_t *level; + tidesdb_sstable_t **sst_unrefs; + int sst_unrefs_count; + const tidesdb_t *db; + struct tidesdb_deferred_free_node_t *next; +}; + +/** + * tidesdb_klog_entry_t + * entry in klog block + * @param flags entry flags (tombstone, ttl, vlog, 
delta_seq) + * @param key_size size of key in bytes + * @param value_size size of value in bytes + * @param ttl time-to-live timestamp + * @param seq sequence number + * @param vlog_offset offset in vlog file (0 if inline) + */ +typedef struct +{ + uint8_t flags; + uint32_t key_size; + uint32_t value_size; + int64_t ttl; + uint64_t seq; + uint64_t vlog_offset; +} tidesdb_klog_entry_t; + +/** + * tidesdb_cached_entry_t + * cached entry structure for lock-free block cache + * stores deserialized, decompressed entry with key and value/vlog_offset + * @param flags entry flags (tombstone, ttl, vlog, delta_seq) + * @param key_size size of key in bytes + * @param value_size size of value in bytes (actual value size, not inline size) + * @param ttl time-to-live timestamp + * @param seq sequence number + * @param vlog_offset offset in vlog file (0 if inline, >0 if in vlog) + * @param data flexible array [key_data][value_data if inline] + */ +typedef struct +{ + uint8_t flags; + uint32_t key_size; + uint32_t value_size; + int64_t ttl; + uint64_t seq; + uint64_t vlog_offset; +#ifdef _MSC_VER + uint8_t data[1]; /* MSVC requires size 1 */ +#else + uint8_t data[]; /* flexible array */ +#endif +} tidesdb_cached_entry_t; + +/** + * tidesdb_multi_cf_txn_metadata_t + * metadata for multi-cf transaction entries + * written before klog_entry when entry has multi-cf flag + * @param num_participant_cfs number of column families in transaction + * @param checksum xxhash64 checksum of num_participant_cfs + cf_names + * followed by char cf_names[num_participant_cfs][TDB_MAX_CF_NAME_LEN] (null-terminated cf names) + */ +#pragma pack(push, 1) +typedef struct +{ + uint8_t num_participant_cfs; + uint64_t checksum; +} tidesdb_multi_cf_txn_metadata_t; +#pragma pack(pop) + +/* tidesdb_kv_arena_t + * bump arena for a klog block's per-entry key and value copies on the write path. 
+ * allocations come from reusable chunks that are reset between blocks, so filling a + * block needs no per-entry malloc or free. chunks are never moved once allocated, so + * pointers handed out stay valid until the arena is reset or destroyed. + * @param chunks chunk base pointers + * @param sizes per-chunk capacity + * @param count number of allocated chunks + * @param cap capacity of the chunks/sizes arrays + * @param cur current chunk being filled + * @param off bump offset within the current chunk + */ +#define TDB_KLOG_ARENA_CHUNK (128 * 1024) /* default chunk size */ +#define TDB_KLOG_ARENA_ALIGN 8 /* allocation alignment */ +#define TDB_KLOG_ARENA_INIT_CHUNKS 4 /* initial chunks/sizes array capacity */ +typedef struct +{ + uint8_t **chunks; + size_t *sizes; + int count; + int cap; + int cur; + size_t off; +} tidesdb_kv_arena_t; + +/** + * tidesdb_klog_block_t + * a block in the klog containing multiple key entries + * @param num_entries number of entries in this block + * @param block_size total size of this block + * @param capacity allocated capacity for arrays + * @param is_arena_allocated 1 if arena-allocated (deserialized), 0 if separate mallocs (created) + * @param is_zero_copy 1 if keys/values point into external buffer (no copy during deserialize) + * @param entries array of entries + * @param keys array of key data + * @param inline_values array of inline values (null if in vlog) + * @param max_key maximum key in this block + * @param max_key_size size of maximum key + * @param data_ref owned reference to external data buffer (freed on block_free if non-NULL) + * @param kv_arena bump arena holding the per-entry key/value copies (write path only) + */ +typedef struct +{ + uint32_t num_entries; + uint32_t block_size; + uint32_t capacity; + uint8_t is_arena_allocated; + uint8_t is_zero_copy; + tidesdb_klog_entry_t *entries; + uint8_t **keys; + uint8_t **inline_values; + uint8_t *max_key; + size_t max_key_size; + uint8_t *data_ref; + 
tidesdb_kv_arena_t kv_arena; +} tidesdb_klog_block_t; + +/** + * tidesdb_block_index_t + * compact block index for fast key lookups + * stores min/max key prefixes and file positions for each block + * @param min_key_prefixes array of minimum key prefixes + * @param max_key_prefixes array of maximum key prefixes + * @param file_positions array of file positions for each block + * @param count number of blocks indexed + * @param capacity capacity of arrays + * @param prefix_len length of key prefix stored + * @param comparator comparator function for key ordering + * @param comparator_ctx comparator context + */ +struct tidesdb_block_index_t +{ + uint8_t *min_key_prefixes; + uint8_t *max_key_prefixes; + uint64_t *file_positions; + uint32_t count; + uint32_t capacity; + uint8_t prefix_len; + tidesdb_comparator_fn comparator; + void *comparator_ctx; +}; + +/** + * tidesdb_vlog_block_t + * a block in the vlog containing multiple values + * @param num_values number of values in this block + * @param block_size total size of this block + * @param value_sizes array of value sizes + * @param values array of value data + */ +typedef struct +{ + uint32_t num_values; + uint32_t block_size; + uint32_t *value_sizes; + uint8_t **values; +} tidesdb_vlog_block_t; + +/** + * tidesdb_kv_pair_t + * key-value pair + * @param entry klog entry + * @param key key data + * @param value value data + */ +struct tidesdb_kv_pair_t +{ + tidesdb_klog_entry_t entry; + uint8_t *key; + uint8_t *value; +}; + +#define TDB_COMMIT_STATUS_IN_PROGRESS 0 +#define TDB_COMMIT_STATUS_COMMITTED 1 + +/** + * tidesdb_commit_status_t + * @param status array of commit statuses (0=in-progress, 1=committed) + * @param min_seq minimum sequence number tracked in this buffer + * @param max_seq maximum sequence number tracked in this buffer + * @param capacity size of the status array + */ +struct tidesdb_commit_status_t +{ + _Atomic(uint8_t) *status; + _Atomic(uint64_t) min_seq; + _Atomic(uint64_t) max_seq; + size_t 
capacity; +}; + +/** + * tidesdb_flush_work_t + * work item for flush thread pool + * @param cf column family + * @param imm immutable memtable wrapper (holds refcount) + * @param sst_id sstable id + * @param unified_sl the shared unified immutable skip list for a unified split task. when set + * (alongside unified_barrier) the worker writes this cf's prefix segment of it + * straight to an sstable. borrowed, NOT freed by the worker -- the immutable owns + * it and the barrier's last finisher releases it. + * @param unified_cf_index the cf_index prefix identifying this cf's run in unified_sl + * @param unified_entry_count node count of the run, for sizing the sstable bloom/index + * @param unified_barrier shared barrier across sibling per cf split tasks of + * a single unified memtable flush. last finisher closes + * the unified wal and frees the barrier. + */ +struct tidesdb_flush_work_t +{ + tidesdb_column_family_t *cf; + tidesdb_immutable_memtable_t *imm; + uint64_t sst_id; + skip_list_t *unified_sl; + uint32_t unified_cf_index; + int unified_entry_count; + tidesdb_unified_flush_barrier_t *unified_barrier; +}; + +/** + * tidesdb_unified_flush_barrier_t + * shared completion state for a unified memtable flush split into per cf tasks. + * the dispatcher initialises remaining to the number of per cf tasks it enqueues. + * each task does its own sstable write then decrements remaining. the task that + * brings remaining to zero owns the unified wal cleanup and the barrier free. 
+ * @param remaining per cf tasks still in flight + * @param overall_result first non-success error reported by any task + * @param umt_imm unified immutable memtable being flushed + * @param db database instance + */ +struct tidesdb_unified_flush_barrier_t +{ + atomic_int remaining; + atomic_int overall_result; + tidesdb_memtable_t *umt_imm; + tidesdb_t *db; +}; + +/** + * tidesdb_compaction_work_t + * work item for compaction thread pool + * @param cf column family + * @param start_level starting level + * @param target_level target level + * @param steer_to_bottom when set, the worker runs a targeted merge of + * [steer_min_key, steer_max_key] into the largest level + * instead of the geometry-driven spooky compaction -- + * used to push a tombstone-dense sstable down to where + * regular tombstones can finally drop + * @param full_compaction when set, the worker merges every level into the + * largest level (a true manual full compaction) instead + * of one geometry-driven spooky round -- reclaims all + * tombstones and single-delete pairs regardless of + * whether any level is over capacity + * @param steer_min_key malloc'd copy of the dense sstable's min key (worker frees) + * @param steer_min_key_size size of steer_min_key + * @param steer_max_key malloc'd copy of the dense sstable's max key (worker frees) + * @param steer_max_key_size size of steer_max_key + * @param done_mu when non-NULL, the worker signals done_flag + broadcasts done_cv under done_mu on + * every exit path that consumes this work item, so a blocking caller can park on the signal until + * the work is serviced or discarded + * @param done_cv paired with done_mu + * @param done_flag paired with done_mu + */ +struct tidesdb_compaction_work_t +{ + tidesdb_column_family_t *cf; + int start_level; + int target_level; + int steer_to_bottom; + int full_compaction; + uint8_t *steer_min_key; + size_t steer_min_key_size; + uint8_t *steer_max_key; + size_t steer_max_key_size; + pthread_mutex_t 
*done_mu; + pthread_cond_t *done_cv; + _Atomic(int) *done_flag; +}; + +/** + * tidesdb_txn_op_t + * operation structure for transactions + * @param key key + * @param key_size key size + * @param value value + * @param value_size value size + * @param ttl time-to-live + * @param is_delete delete flag (set for both regular and single-delete tombstones) + * @param is_single_delete single-delete flag (implies is_delete) + * @param cf column family (for multi-cf transactions) + */ +struct tidesdb_txn_op_t +{ + uint8_t *key; + size_t key_size; + uint8_t *value; + size_t value_size; + time_t ttl; + int is_delete; + int is_single_delete; + tidesdb_column_family_t *cf; +}; + +/* forward declaration for ref-counted block type */ +typedef struct tidesdb_ref_counted_block_t tidesdb_ref_counted_block_t; + +/** + * tidesdb_merge_source_t + * is a source for merging (memtable, sstable, or transaction write buffer) + * @param type type of source (memtable, sstable, btree, or txn_ops) + * @param source union of source-specific state + * @param current_kv current key-value pair + * @param config column family configuration + * @param is_cached if 1, dont free when popped from heap (for iterators) + */ +typedef struct +{ + enum + { + MERGE_SOURCE_MEMTABLE, + MERGE_SOURCE_SSTABLE, + MERGE_SOURCE_BTREE, + MERGE_SOURCE_TXN_OPS, + MERGE_SOURCE_UNIFIED_MEMTABLE + } type; + + union + { + struct + { + skip_list_cursor_t *cursor; + tidesdb_immutable_memtable_t *imm; + } memtable; + + struct + { + tidesdb_t *db; + tidesdb_sstable_t *sst; + block_manager_cursor_t *klog_cursor; + block_manager_cursor_t *vlog_cursor; + tidesdb_klog_block_t *current_block; + block_manager_block_t *current_block_data; + tidesdb_ref_counted_block_t *current_rc_block; + uint8_t *decompressed_data; + clock_cache_entry_t *cache_pin; /* zero-copy cache pin (holds reader ref) */ + int current_entry_idx; + /* 2-slot deserialized block stash -- this avoids re-parsing varint headers + * when alternating between 2 blocks 
(A-B-A-B pattern). + * slots are written round-robin, checked linearly. */ + struct + { + tidesdb_klog_block_t *block; + clock_cache_entry_t *pin; + uint64_t position; + } block_stash[2]; + /* lazy block -- raw bytes not yet deserialized. + * seek uses O(log N) binary search on raw bytes instead of + * O(N) full varint deserialization. full deserialize is + * deferred to first next()/prev() call. + * data may be owned by a cache pin (cache hit) or by + * bmblock/decompressed (disk read). */ + struct + { + const uint8_t *data; /* raw data pointer */ + size_t size; /* raw data size */ + clock_cache_entry_t *pin; /* cache pin keeping data alive */ + const uint8_t *block_data; /* data past block index header */ + size_t block_data_size; + const uint8_t *idx_base; /* block index entries */ + uint32_t idx_count; /* number of index entries */ + int entry_idx; /* found entry index for next/prev */ + block_manager_block_t *bmblock; /* disk-read block ownership */ + uint8_t *decompressed; /* decompressed buffer ownership */ + } lazy; + } sstable; + + struct + { + tidesdb_t *db; + tidesdb_sstable_t *sst; + btree_cursor_t *cursor; + block_manager_cursor_t *vlog_cursor; + } btree; + + /* transaction write buffer source for read-your-own-writes + * sorted_indices is an array of indices into txn->ops, sorted by key + * and deduplicated (last write per key wins) */ + struct + { + tidesdb_txn_t *txn; + tidesdb_column_family_t *cf; + int *sorted_indices; + int count; + int pos; + } txn_ops; + + /* unified memtable source with CF-prefix filtering. + * the unified skip list has keys prefixed with 4-byte BE CF index. + * this source filters to only the target CF and strips the prefix + * when returning keys to the iterator. 
*/ + struct + { + skip_list_cursor_t *cursor; + tidesdb_immutable_memtable_t *imm; + uint32_t cf_index; + uint8_t prefix[4]; /* TDB_UNIFIED_CF_PREFIX_SIZE */ + } unified; + } source; + + tidesdb_kv_pair_t *current_kv; + tidesdb_kv_pair_t inline_kv; /* embedded kv for zero-copy borrowed mode */ + tidesdb_column_family_config_t *config; + int is_cached; +} tidesdb_merge_source_t; + +/** + * tidesdb_merge_heap_t + * min-heap for efficient multi-way merge + * @param sources array of merge sources + * @param num_sources number of sources + * @param capacity capacity of sources array + * @param comparator comparator function for sorting + * @param comparator_ctx comparator context + */ +struct tidesdb_merge_heap_t +{ + tidesdb_merge_source_t **sources; + int num_sources; + int capacity; + skip_list_comparator_fn comparator; + void *comparator_ctx; + uint8_t *pop_buf[2]; /* double-buffered arena for borrowed KV materialization */ + size_t pop_buf_cap[2]; + int pop_buf_slot; /* active slot (toggled by iterator between next/prev calls) */ +}; + +/** + * tidesdb_log_write + * writes a log message to the log file or stderr + * handles truncation if configured + * @param level log level + * @param file source file name + * @param line source line number + * @param fmt format string + * @param ... format arguments + */ +void tidesdb_log_write(const int level, const char *file, const int line, const char *fmt, ...) +{ + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + + const time_t sec = ts.tv_sec; + struct tm tm_info; + tdb_gmtime_r(&sec, &tm_info); + + const char *level_str = (level == TDB_LOG_DEBUG) ? "DEBUG" + : (level == TDB_LOG_INFO) ? "INFO" + : (level == TDB_LOG_WARN) ? "WARN" + : (level == TDB_LOG_ERROR) ? "ERROR" + : "FATAL"; + + pthread_mutex_lock(&tidesdb_log_mutex); + + FILE *log_out = _tidesdb_log_file ? 
_tidesdb_log_file : stderr; + + fprintf(log_out, "[%04d-%02d-%02dT%02d:%02d:%02d.%03dZ] [%s] %s:%d: ", tm_info.tm_year + 1900, + tm_info.tm_mon + 1, tm_info.tm_mday, tm_info.tm_hour, tm_info.tm_min, tm_info.tm_sec, + (int)(ts.tv_nsec / 1000000), level_str, file, line); + + va_list args; + va_start(args, fmt); + if (fmt) vfprintf(log_out, fmt, args); + va_end(args); + + fprintf(log_out, "\n"); + + if (_tidesdb_log_file) + { + fflush(_tidesdb_log_file); + + if (_tidesdb_log_truncate > 0 && _tidesdb_log_path[0] != '\0') + { + const long current_pos = ftell(_tidesdb_log_file); + if (current_pos > 0 && (size_t)current_pos >= _tidesdb_log_truncate) + { + fclose(_tidesdb_log_file); + _tidesdb_log_file = fopen(_tidesdb_log_path, TDB_CNF_FILE_MODE); + if (_tidesdb_log_file) + { + tdb_setlinebuf(_tidesdb_log_file); + fprintf(_tidesdb_log_file, "[LOG TRUNCATED - exceeded %zu bytes]\n", + _tidesdb_log_truncate); + fflush(_tidesdb_log_file); + } + } + } + } + + pthread_mutex_unlock(&tidesdb_log_mutex); +} + +/** + * tdb_log_throttle + * rate-limit a hot-path log line. returns 1 at most once per interval_sec for a given + * last_log_sec slot, 0 otherwise. a CAS picks a single winner among concurrent callers so the + * line is emitted once per window even under many simultaneous writers/flushers. + * uses db->cached_current_time (maintained by the reaper) to avoid a clock syscall on the hot path. 
+ * @param db database (source of cached time) + * @param last_log_sec per-CF / per-mode atomic holding the last emit time in seconds + * @param interval_sec minimum seconds between emissions + * @return 1 if the caller should emit now, 0 to suppress + */ +static int tdb_log_throttle(tidesdb_t *db, _Atomic(time_t) *last_log_sec, int interval_sec) +{ + const time_t now = atomic_load_explicit(&db->cached_current_time, memory_order_relaxed); + time_t last = atomic_load_explicit(last_log_sec, memory_order_relaxed); + if (now - last < interval_sec) return 0; + return atomic_compare_exchange_strong_explicit(last_log_sec, &last, now, memory_order_relaxed, + memory_order_relaxed); +} + +/** + * tidesdb_wake_reaper + * nudge the sstable reaper to run its eviction pass now so it closes idle (unreferenced) sstable + * block managers and reclaims their file descriptors. uses trylock -- if the reaper mutex is held + * (reaper mid-cycle, or we are on the reaper thread itself) the signal is skipped, which is safe: + * the reaper runs on its own 100ms timer regardless. never blocks the caller. + * @param db database instance + */ +static void tidesdb_wake_reaper(tidesdb_t *db) +{ + if (pthread_mutex_trylock(&db->reaper_thread_mutex) == 0) + { + pthread_cond_signal(&db->reaper_thread_cond); + pthread_mutex_unlock(&db->reaper_thread_mutex); + } +} + +/** + * tidesdb_bm_open + * open a block manager, treating file-descriptor exhaustion (EMFILE/ENFILE) as transient + * backpressure rather than a hard failure, wake the reaper to close idle sstables and retry a + * bounded number of times. every other errno (and success) returns immediately. errno is preserved + * by block_manager_open across its own cleanup, so the EMFILE/ENFILE check sees the real cause. 
+ * @param db database instance (for waking the reaper)
+ * @param bm out-- opened block manager
+ * @param path file path
+ * @param sync_mode block-manager sync mode (already converted)
+ * @return 0 on success, -1 on failure (errno set)
+ */
+static int tidesdb_bm_open(tidesdb_t *db, block_manager_t **bm, const char *path, int sync_mode)
+{
+    for (int attempt = 0;; attempt++)
+    {
+        if (block_manager_open(bm, path, sync_mode) == 0) return 0;
+        if ((errno != EMFILE && errno != ENFILE) || attempt >= TDB_BM_OPEN_EMFILE_MAX_RETRIES)
+            return -1; /* a non-fd error, or the retry allowance is used up. nothing runs between
+                        * the failed block_manager_open call and this return, so errno is still
+                        * the value that call left behind */
+        /* fd table is full but idle sstables can usually be closed -- wake the reaper and give it
+         * a moment to reclaim descriptors before retrying. the open is attempted at most
+         * TDB_BM_OPEN_EMFILE_MAX_RETRIES + 1 times in total, with one
+         * TDB_BM_OPEN_EMFILE_BACKOFF_US sleep before each retry. */
+        tidesdb_wake_reaper(db);
+        usleep(TDB_BM_OPEN_EMFILE_BACKOFF_US);
+    }
+}
+
+/**
+ * tidesdb_sstable_open_budget
+ * the descriptor budget for resident open sstables, max_open_sstables minus the reserve held for
+ * flush/compaction. both the reader admission check and the reaper's eviction trigger use this, so
+ * the reaper keeps num_open_sstables at or below the budget the readers stop at -- otherwise reads
+ * would back off in the [budget, max_open) gap while the reaper (triggering only at max_open) frees
+ * nothing, starving reads with no relief on fd-constrained hosts.
+ */
+static int tidesdb_sstable_open_budget(const tidesdb_t *db)
+{
+    const int max_open = (int)db->config.max_open_sstables;
+    int reserve = max_open / TDB_FD_READER_RESERVE_DIVISOR; /* proportional reserve ... */
+    if (reserve < TDB_FD_READER_RESERVE_MIN) reserve = TDB_FD_READER_RESERVE_MIN; /* ... floored */
+    /* cap the reserve so it never starves reads when max_open_sstables is below the floor.
+     * the cap is applied after the floor, so on small max_open_sstables the cap wins */
+    const int reserve_cap = max_open / TDB_FD_READER_RESERVE_MAX_DIVISOR;
+    if (reserve > reserve_cap) reserve = reserve_cap;
+    int budget = max_open - reserve;
+    if (budget < 1) budget = 1; /* the returned budget is never below 1 */
+    return budget;
+}
+
+/**
+ * tidesdb_reader_fd_budget_ok
+ * gate a reader (point-get / iterator) about to open a NOT-yet-open sstable against the hard cap
+ * max_open_sstables. note this is the full cap, not the smaller open budget
+ * (max_open_sstables - reserve) that the reaper evicts down to -- the comment inside the body
+ * explains why readers are given that burst headroom. an already-open sstable (klog_bm set)
+ * needs no new descriptor, so re-reads are never blocked. when at the cap, wake the reaper to
+ * reclaim idle sstables, sleep TDB_BACKPRESSURE_STALL_CHECK_INTERVAL_US once and recheck; if
+ * still at the cap, the caller fails the read with a retryable error rather than starving the
+ * write path or returning wrong data (the scan/iter open-failure paths surface it).
+ * @param db database instance
+ * @param sst sstable the reader is about to open
+ * @return 1 if ok to open (or already open), 0 if num_open_sstables is still at or above
+ *         max_open_sstables after the single recheck
+ */
+static int tidesdb_reader_fd_budget_ok(tidesdb_t *db, tidesdb_sstable_t *sst)
+{
+    /* already counted -- num_open_sstables is keyed on the klog, so a klog-open sstable
+     * needs no new tracked descriptor and is never blocked (the lazy vlog rides along) */
+    if (atomic_load_explicit(&sst->klog_bm, memory_order_acquire)) return 1;
+
+    /* reads may open up to max_open_sstables -- the open-file clamp keeps that descriptor-safe, and
+     * respecting it (rather than opening every source unbounded) is what prevents the original
+     * full-scan fd-exhaustion wedge. the reaper evicts IDLE sstables down to the smaller
+     * open-budget (max_open - reserve), so [open_budget, max_open) is burst headroom for active
+     * reads and compaction; a read only backs off with a retryable error at the hard cap. a
+     * k-way-merge iterator needs its whole source set open at once, so it must use this full cap
+     * too -- a smaller per-read reserve would make any scan over more than (budget) sstables
+     * impossible. */
+    const int max_open = (int)db->config.max_open_sstables;
+
+    if (atomic_load_explicit(&db->num_open_sstables, memory_order_relaxed) < max_open) return 1;
+
+    /* at the cap for a new open -- give the reaper a chance to reclaim idle sstables, then recheck
+     * exactly once; there is no retry loop here */
+    tidesdb_wake_reaper(db);
+    usleep(TDB_BACKPRESSURE_STALL_CHECK_INTERVAL_US);
+    return atomic_load_explicit(&db->num_open_sstables, memory_order_relaxed) < max_open;
+}
+
+/**
+ * tidesdb_txn_op_sl_flags
+ * compute the skip-list version flag bitmask for a txn op.
+ * a live put is 0, a regular delete is SKIP_LIST_FLAG_DELETED, and a
+ * single-delete is both bits together so SKIP_LIST_FLAG_DELETED checks
+ * keep treating it as a tombstone.
+ * is_single_delete is tested first, so it decides the result when both op fields are set.
+ * @param op transaction operation to classify
+ * @return skip-list flag bitmask for the op (0, SKIP_LIST_FLAG_DELETED, or
+ *         SKIP_LIST_FLAG_DELETED | SKIP_LIST_FLAG_SINGLE_DELETE)
+ */
+static inline uint8_t tidesdb_txn_op_sl_flags(const tidesdb_txn_op_t *op)
+{
+    if (op->is_single_delete) return SKIP_LIST_FLAG_DELETED | SKIP_LIST_FLAG_SINGLE_DELETE;
+    if (op->is_delete) return SKIP_LIST_FLAG_DELETED;
+    return 0;
+}
+
+/**
+ * tidesdb_sl_flags_to_kv_flags
+ * translate skip-list version flag bits into tidesdb kv_pair entry flags.
+ * the two namespaces overlap on the tombstone bit (both are 0x01) but the
+ * single-delete bit sits in different positions (0x02 on the skip list,
+ * 0x10 in the kv_pair flag byte) because kv_pair flags are persisted on
+ * disk and share the byte with serialization-time markers.
+ */ +static inline uint8_t tidesdb_sl_flags_to_kv_flags(uint8_t sl_flags) +{ + uint8_t kv = 0; + if (sl_flags & SKIP_LIST_FLAG_DELETED) kv |= TDB_KV_FLAG_TOMBSTONE; + if (sl_flags & SKIP_LIST_FLAG_SINGLE_DELETE) kv |= TDB_KV_FLAG_SINGLE_DELETE; + return kv; +} + +/** + * tidesdb_txn_op_kv_flags + * compute the tidesdb kv_pair tombstone flag bits for a txn op. + * used when materialising a txn op as a kv_pair for a merge source. + */ +static inline uint8_t tidesdb_txn_op_kv_flags(const tidesdb_txn_op_t *op) +{ + if (op->is_single_delete) return TDB_KV_FLAG_TOMBSTONE | TDB_KV_FLAG_SINGLE_DELETE; + if (op->is_delete) return TDB_KV_FLAG_TOMBSTONE; + return 0; +} + +/** + * tidesdb_commit_status_create + * creates a new commit status tracker + * @return commit status tracker or NULL on error + */ +static tidesdb_commit_status_t *tidesdb_commit_status_create() +{ + tidesdb_commit_status_t *cs = malloc(sizeof(tidesdb_commit_status_t)); + if (!cs) return NULL; + + cs->status = malloc(TDB_COMMIT_STATUS_BUFFER_SIZE * sizeof(_Atomic(uint8_t))); + if (!cs->status) + { + free(cs); + return NULL; + } + + /* we init all slots as in-progress (will be updated as txns complete) */ + for (size_t i = 0; i < TDB_COMMIT_STATUS_BUFFER_SIZE; i++) + { + atomic_init(&cs->status[i], TDB_COMMIT_STATUS_IN_PROGRESS); + } + + atomic_init(&cs->min_seq, 1); + atomic_init(&cs->max_seq, 0); + cs->capacity = TDB_COMMIT_STATUS_BUFFER_SIZE; + + return cs; +} + +/** + * tidesdb_commit_status_destroy + * destroys a commit status tracker + * @param cs commit status tracker + */ +static void tidesdb_commit_status_destroy(tidesdb_commit_status_t *cs) +{ + if (!cs) return; + free((void *)cs->status); + free(cs); +} + +/** + * tidesdb_commit_status_mark + * marks a sequence as committed + * @param cs commit status tracker + * @param seq sequence number + * @param status TDB_COMMIT_STATUS_COMMITTED or TDB_COMMIT_STATUS_IN_PROGRESS + */ +static void tidesdb_commit_status_mark(tidesdb_commit_status_t *cs, 
uint64_t seq, uint8_t status) +{ + if (!cs || seq == 0) return; + + uint64_t current_max = atomic_load_explicit(&cs->max_seq, memory_order_acquire); + while (seq > current_max) + { + if (atomic_compare_exchange_weak_explicit(&cs->max_seq, ¤t_max, seq, + memory_order_release, memory_order_acquire)) + { + break; /* successfully updated */ + } + /* CAS failed, current_max was updated by atomic_compare_exchange_weak, retry */ + } + + size_t idx = seq % cs->capacity; + atomic_store_explicit(&cs->status[idx], status, memory_order_release); +} + +/** + * tidesdb_visibility_check_callback + * callback for skip list to check if a sequence is committed + * used by skip_list_get_with_seq for visibility determination + * @param opaque_ctx commit_status pointer (cast from void*) + * @param seq sequence number to check + * @return 1 if committed, 0 otherwise + */ +static int tidesdb_visibility_check_callback(void *opaque_ctx, const uint64_t seq) +{ + if (!opaque_ctx || seq == 0) return 0; + + tidesdb_commit_status_t *cs = (tidesdb_commit_status_t *)opaque_ctx; + + /* we map seq to circular buffer index */ + const size_t idx = seq % cs->capacity; + uint8_t status = atomic_load_explicit(&cs->status[idx], memory_order_acquire); + + /* only COMMITTED versions are visible */ + return (status == TDB_COMMIT_STATUS_COMMITTED); +} + +/** + * encode_varint + * encode uint64_t as varint (1-10 bytes) + * @param buf output buffer (must have at least 10 bytes) + * @param value value to encode + * @return number of bytes written + */ +static inline int encode_varint(uint8_t *buf, uint64_t value) +{ + int pos = 0; + while (value >= 0x80) + { + buf[pos++] = (uint8_t)(value | 0x80); + value >>= 7; + } + buf[pos++] = (uint8_t)value; + return pos; +} + +/** + * decode_varint + * decode varint to uint64_t + * @param buf input buffer + * @param value output value + * @param max_bytes maximum bytes to read (bounds check) + * @return number of bytes read, or -1 on error + */ +static inline int 
decode_varint(const uint8_t *buf, uint64_t *value, const int max_bytes) +{ + if (TDB_UNLIKELY(max_bytes <= 0)) return -1; + + /* fast path for 1-byte varints (values < 128) -- most common case */ + if (TDB_LIKELY(!(buf[0] & 0x80))) + { + *value = buf[0]; + return 1; + } + + /* slow path for multi-byte varints */ + *value = (uint64_t)(buf[0] & 0x7F); + int shift = 7; + int pos = 1; + + while (pos < max_bytes) + { + const uint8_t byte = buf[pos++]; + *value |= (uint64_t)(byte & 0x7F) << shift; + + if ((byte & 0x80) == 0) + { + return pos; /* success */ + } + + shift += 7; + if (shift >= 64) + { + return -1; /* oflow */ + } + } + + return -1; /* incomplete varint */ +} + +static inline void tdb_encode_be32(const uint32_t val, uint8_t *out) +{ + out[0] = (uint8_t)(val >> 24); + out[1] = (uint8_t)(val >> 16); + out[2] = (uint8_t)(val >> 8); + out[3] = (uint8_t)(val); +} + +static inline uint32_t tdb_decode_be32(const uint8_t *p) +{ + return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) | ((uint32_t)p[2] << 8) | (uint32_t)p[3]; +} + +static inline size_t tdb_build_prefixed_key(const uint32_t cf_index, const uint8_t *key, + const size_t key_size, uint8_t *out) +{ + tdb_encode_be32(cf_index, out); + memcpy(out + TDB_UNIFIED_CF_PREFIX_SIZE, key, key_size); + return TDB_UNIFIED_CF_PREFIX_SIZE + key_size; +} + +/** + * tdb_parse_wal_id + * parse WAL ID from filename like "wal_12345.log" + * @param filename the filename to parse + * @param id output WAL ID + * @return 1 on success, 0 on failure + */ +static int tdb_parse_wal_id(const char *filename, uint64_t *id) +{ + if (!filename || !id) return 0; + + const size_t prefix_len = strlen(TDB_WAL_PREFIX); + if (strncmp(filename, TDB_WAL_PREFIX, prefix_len) != 0) return 0; + + const char *p = filename + prefix_len; + char *endptr; + + const unsigned long long val = strtoull(p, &endptr, 10); + if (endptr == p) return 0; + + if (strcmp(endptr, TDB_WAL_EXT) != 0) return 0; + + *id = (uint64_t)val; + return 1; +} + +/** + * 
tdb_parse_unified_wal_gen
+ * parse unified WAL generation from filename like "uwal_12345.log"
+ * @param filename the filename to parse
+ * @param gen output WAL generation
+ * @return 1 on success, 0 on failure
+ */
+static int tdb_parse_unified_wal_gen(const char *filename, uint64_t *gen)
+{
+    if (!filename || !gen) return 0;
+
+    const size_t prefix_len = strlen(TDB_UNIFIED_WAL_PREFIX);
+    if (strncmp(filename, TDB_UNIFIED_WAL_PREFIX, prefix_len) != 0) return 0;
+
+    const char *p = filename + prefix_len;
+    char *endptr;
+
+    const unsigned long long val = strtoull(p, &endptr, 10); /* NOTE(review): strtoull skips
+                                                              * leading whitespace, accepts a sign
+                                                              * and saturates on overflow; none of
+                                                              * that is rejected here */
+    if (endptr == p) return 0; /* no digits after the prefix */
+
+    if (strcmp(endptr, TDB_WAL_EXT) != 0) return 0; /* the rest must be exactly the extension */
+
+    *gen = (uint64_t)val;
+    return 1;
+}
+
+/**
+ * tdb_parse_level_num
+ * parse level number from filename like "L5_..."
+ * only the prefix and the number are looked at: whatever follows the digits is not validated.
+ * NOTE(review): strtol skips leading whitespace and accepts a sign, and the long result is
+ * narrowed to int without a range check.
+ * @param filename the filename to parse
+ * @param level_num output level number (written only on success)
+ * @return 1 on success, 0 on failure
+ */
+static int tdb_parse_level_num(const char *filename, int *level_num)
+{
+    if (!filename || !level_num) return 0;
+
+    const size_t prefix_len = strlen(TDB_LEVEL_PREFIX);
+    if (strncmp(filename, TDB_LEVEL_PREFIX, prefix_len) != 0) return 0;
+
+    const char *p = filename + prefix_len;
+    char *endptr;
+    const long val = strtol(p, &endptr, 10);
+    if (endptr == p) return 0;
+
+    *level_num = (int)val;
+    return 1;
+}
+
+/**
+ * tdb_parse_sstable_non_partitioned
+ * parse non-partitioned sstable filename like "L5_12345.klog": TDB_LEVEL_PREFIX, the level
+ * number, '_', the sstable id, then exactly TDB_SSTABLE_KLOG_EXT.
+ * NOTE(review): same strtol / strtoull leniency as tdb_parse_level_num (leading whitespace,
+ * sign, no range check before the level is narrowed to int).
+ * @param filename the filename to parse
+ * @param level_num output level number
+ * @param sst_id output sstable id
+ * @return 1 on success, 0 on failure (outputs are written only on success)
+ */
+static int tdb_parse_sstable_non_partitioned(const char *filename, int *level_num,
+                                             unsigned long long *sst_id)
+{
+    if (!filename || !level_num || !sst_id) return 0;
+
+    const size_t prefix_len = strlen(TDB_LEVEL_PREFIX);
+    if (strncmp(filename, TDB_LEVEL_PREFIX, prefix_len) != 0) return 0;
+
+    const char *p = filename + prefix_len;
+    char *endptr;
+    const long level = strtol(p, &endptr, 10);
+    if (endptr == p || *endptr != '_') return 0;
+
+    p = endptr + 1;
+    const unsigned long long id = strtoull(p, &endptr, 10);
+    if (endptr == p) return 0;
+
+    if (strcmp(endptr, TDB_SSTABLE_KLOG_EXT) != 0) return 0;
+
+    *level_num = (int)level;
+    *sst_id = id;
+    return 1;
+}
+
+/**
+ * tdb_parse_sstable_partitioned
+ * parse partitioned sstable filename like "L5P2_12345.klog": TDB_LEVEL_PREFIX, the level
+ * number, TDB_LEVEL_PARTITION_PREFIX, the partition number, '_', the sstable id, then exactly
+ * TDB_SSTABLE_KLOG_EXT.
+ * NOTE(review): same strtol / strtoull leniency as tdb_parse_level_num; level and partition
+ * are both narrowed from long to int without a range check.
+ * @param filename the filename to parse
+ * @param level_num output level number
+ * @param partition_num output partition number
+ * @param sst_id output sstable id
+ * @return 1 on success, 0 on failure (outputs are written only on success)
+ */
+static int tdb_parse_sstable_partitioned(const char *filename, int *level_num, int *partition_num,
+                                         unsigned long long *sst_id)
+{
+    if (!filename || !level_num || !partition_num || !sst_id) return 0;
+
+    const size_t level_prefix_len = strlen(TDB_LEVEL_PREFIX);
+    if (strncmp(filename, TDB_LEVEL_PREFIX, level_prefix_len) != 0) return 0;
+
+    const char *p = filename + level_prefix_len;
+    char *endptr;
+    const long level = strtol(p, &endptr, 10);
+    if (endptr == p) return 0;
+
+    const size_t partition_prefix_len = strlen(TDB_LEVEL_PARTITION_PREFIX);
+    if (strncmp(endptr, TDB_LEVEL_PARTITION_PREFIX, partition_prefix_len) != 0) return 0;
+
+    p = endptr + partition_prefix_len;
+    const long partition = strtol(p, &endptr, 10);
+    if (endptr == p || *endptr != '_') return 0;
+
+    p = endptr + 1;
+    const unsigned long long id = strtoull(p, &endptr, 10);
+    if (endptr == p) return 0;
+
+    if (strcmp(endptr, TDB_SSTABLE_KLOG_EXT) != 0) return 0;
+
+    *level_num = (int)level;
+    *partition_num = (int)partition;
+    *sst_id = id;
+    return 1;
+}
+
+static tidesdb_klog_block_t *tidesdb_klog_block_create(void);
+static void tidesdb_klog_block_free(tidesdb_klog_block_t *block);
+static int tidesdb_klog_block_add_entry(tidesdb_klog_block_t *block, const tidesdb_kv_pair_t *kv,
+                                        const tidesdb_column_family_config_t *config,
+                                        skip_list_comparator_fn comparator_fn,
+                                        void *comparator_ctx);
+static int 
tidesdb_klog_block_is_full(const tidesdb_klog_block_t *block, size_t max_size);
+static int tidesdb_klog_block_serialize(tidesdb_klog_block_t *block, uint8_t **out,
+                                        size_t *out_size);
+static int tidesdb_klog_block_seek_raw(const uint8_t *data, size_t data_size,
+                                       const uint8_t *target_key, size_t target_key_size,
+                                       skip_list_comparator_fn comparator_fn, void *comparator_ctx,
+                                       tidesdb_klog_entry_t *out_entry, const uint8_t **out_key,
+                                       const uint8_t **out_value, int *out_idx,
+                                       uint32_t *out_num_entries);
+static int tidesdb_klog_block_deserialize(const uint8_t *data, size_t data_size,
+                                          tidesdb_klog_block_t **block, int zero_copy);
+
+/**
+ * tidesdb_block_managers_t
+ * temporary structure to hold block manager pointers retrieved from cache.
+ * passed by pointer to tidesdb_sstable_get_block_managers (prototype below), which presumably
+ * fills it in -- confirm against that function's definition.
+ * @member klog_bm klog block manager
+ * @member vlog_bm value log block manager
+ */
+typedef struct
+{
+    block_manager_t *klog_bm;
+    block_manager_t *vlog_bm;
+} tidesdb_block_managers_t;
+
+static int tidesdb_sstable_get_block_managers(const tidesdb_t *db, tidesdb_sstable_t *sst,
+                                              tidesdb_block_managers_t *bms);
+static int tidesdb_vlog_read_value(const tidesdb_t *db, tidesdb_sstable_t *sst,
+                                   uint64_t vlog_offset, size_t value_size, uint8_t **value);
+static tidesdb_sstable_t *tidesdb_sstable_create(tidesdb_t *db, const char *base_path, uint64_t id,
+                                                 const tidesdb_column_family_config_t *config);
+static void tidesdb_sstable_free(tidesdb_sstable_t *sst);
+
+static void compact_block_index_free(tidesdb_block_index_t *index);
+static int compact_block_index_find_predecessor(const tidesdb_block_index_t *index,
+                                                const uint8_t *key, size_t key_len,
+                                                uint64_t *file_position);
+static int compact_block_index_find_slot(const tidesdb_block_index_t *index, const uint8_t *key,
+                                         size_t key_len, int64_t *slot);
+static uint32_t compact_block_index_run_length(const tidesdb_block_index_t *index,
+                                               const uint8_t *key, size_t key_len,
+                                               int64_t start_slot);
+static int compact_block_index_add(tidesdb_block_index_t *index, const 
uint8_t *min_key, + size_t min_key_len, const uint8_t *max_key, size_t max_key_len, + uint64_t file_position); +static tidesdb_block_index_t *compact_block_index_create(uint32_t initial_capacity, + uint8_t prefix_len, + tidesdb_comparator_fn comparator, + void *comparator_ctx); +static uint8_t *compact_block_index_serialize(const tidesdb_block_index_t *index, size_t *out_size); +static tidesdb_block_index_t *compact_block_index_deserialize(const uint8_t *data, + size_t data_size); +static void tidesdb_sstable_ref(tidesdb_sstable_t *sst); +static int tidesdb_sstable_try_ref(tidesdb_sstable_t *sst); +static void tidesdb_sstable_unref(const tidesdb_t *db, tidesdb_sstable_t *sst); +static uint64_t tidesdb_min_active_snapshot_seq(tidesdb_t *db); +static int tidesdb_sstable_write_from_memtable(tidesdb_t *db, tidesdb_column_family_t *cf, + tidesdb_sstable_t *sst, skip_list_t *memtable); +static int tidesdb_sstable_get(tidesdb_t *db, tidesdb_sstable_t *sst, const uint8_t *key, + size_t key_size, uint64_t seq_ceiling, tidesdb_kv_pair_t **kv, + int skip_bloom); +static int tidesdb_sstable_get_seq(tidesdb_t *db, tidesdb_sstable_t *sst, const uint8_t *key, + size_t key_size, uint64_t *out_seq); +static int tidesdb_sstable_load(tidesdb_t *db, tidesdb_sstable_t *sst); +static tidesdb_level_t *tidesdb_level_create(int level_num, size_t capacity); +static void tidesdb_level_free(const tidesdb_t *db, tidesdb_level_t *level); +static int64_t tidesdb_sstable_aux_memory_bytes(const tidesdb_sstable_t *sst); +static int tidesdb_level_add_sstable(tidesdb_level_t *level, tidesdb_sstable_t *sst); +static int tidesdb_level_remove_sstable(const tidesdb_t *db, tidesdb_level_t *level, + tidesdb_sstable_t *sst); +static int tidesdb_level_update_boundaries(tidesdb_level_t *level, tidesdb_level_t *largest_level); +static int tidesdb_level_sort_by_min_key(tidesdb_t *db, tidesdb_level_t *level, + skip_list_comparator_fn cmp, void *cmp_ctx); +static tidesdb_merge_heap_t 
*tidesdb_merge_heap_create(skip_list_comparator_fn comparator, + void *comparator_ctx); +static void tidesdb_merge_heap_free(tidesdb_merge_heap_t *heap); +static int tidesdb_merge_heap_add_source(tidesdb_merge_heap_t *heap, + tidesdb_merge_source_t *source); +static tidesdb_kv_pair_t *tidesdb_merge_heap_pop(tidesdb_merge_heap_t *heap, + tidesdb_sstable_t **corrupted_sst); +static int tidesdb_merge_heap_empty(const tidesdb_merge_heap_t *heap); +static tidesdb_merge_source_t *tidesdb_merge_source_from_memtable( + skip_list_t *memtable, tidesdb_column_family_config_t *config, + tidesdb_immutable_memtable_t *imm); +static tidesdb_merge_source_t *tidesdb_merge_source_from_sstable_klog(tidesdb_t *db, + tidesdb_sstable_t *sst); +static tidesdb_merge_source_t *tidesdb_merge_source_from_btree(tidesdb_t *db, + tidesdb_sstable_t *sst); +static int tidesdb_btree_read_vlog_value(block_manager_cursor_t *vlog_cursor, uint64_t vlog_offset, + const tidesdb_column_family_config_t *config, + uint8_t **value_out, size_t *value_size_out, + size_t expected_value_size); +static void tidesdb_iter_clear_block_stash(tidesdb_merge_source_t *source); +static void tidesdb_iter_clear_lazy(tidesdb_merge_source_t *source); +static tidesdb_column_family_t *tidesdb_get_column_family_internal(tidesdb_t *db, const char *name); +static void tdb_replica_discover_new_cfs(tidesdb_t *db); +static tidesdb_merge_source_t *tidesdb_merge_source_from_sstable(tidesdb_t *db, + tidesdb_sstable_t *sst); +static void tidesdb_merge_source_free(tidesdb_merge_source_t *source); +static int tidesdb_merge_source_advance(tidesdb_merge_source_t *source); +static int tidesdb_merge_source_retreat(tidesdb_merge_source_t *source); +static int tidesdb_full_preemptive_merge(tidesdb_column_family_t *cf, int start_level, + int target_level, int output_level); +static int tidesdb_dividing_merge(tidesdb_column_family_t *cf, int target_level); +static int tidesdb_partitioned_merge(tidesdb_column_family_t *cf, const int start_level, 
+ const int end_level); +static int tidesdb_targeted_merge(tidesdb_column_family_t *cf, tidesdb_sstable_t **inputs, + int input_count, int min_input_level, int max_input_level, + int target_level); +static int tidesdb_compact_range_internal(tidesdb_column_family_t *cf, const uint8_t *start_key, + size_t start_key_size, const uint8_t *end_key, + size_t end_key_size, int target_level_override); +static int tidesdb_compact_steer_to_bottom(tidesdb_column_family_t *cf, uint8_t *min_key, + size_t min_key_size, uint8_t *max_key, + size_t max_key_size); +static int tdb_partitioned_merge_finalize_sst(tidesdb_column_family_t *cf, tidesdb_sstable_t *sst, + block_manager_t *klog_bm, block_manager_t *vlog_bm, + bloom_filter_t *bloom, + tidesdb_block_index_t *block_indexes, + uint64_t entry_count, uint64_t tombstone_count, + uint64_t klog_block_num, uint64_t vlog_block_num, + uint64_t max_seq, int end_level, int partition); +static int tidesdb_sstable_write_from_heap_btree(tidesdb_column_family_t *cf, + tidesdb_sstable_t *sst, tidesdb_merge_heap_t *heap, + block_manager_t *klog_bm, block_manager_t *vlog_bm, + bloom_filter_t *bloom, queue_t *sstables_to_delete, + int is_largest_level); +static int tidesdb_trigger_compaction(tidesdb_column_family_t *cf, int full_compaction); +static int tidesdb_enqueue_compaction(tidesdb_column_family_t *cf, int full_compaction); +static int tidesdb_compact_internal(tidesdb_column_family_t *cf, int full_compaction, int blocking); +static int tidesdb_wal_recover(tidesdb_column_family_t *cf, const char *wal_path, + skip_list_t **memtable); +static int tidesdb_wal_replay_into(tidesdb_column_family_t *cf, block_manager_t *wal, + skip_list_t *target); +static size_t tidesdb_calculate_level_capacity(int level_num, size_t base_capacity, size_t ratio); + +static int tidesdb_add_level(tidesdb_column_family_t *cf); +static int tidesdb_remove_level(tidesdb_column_family_t *cf); +static int tidesdb_apply_dca(tidesdb_column_family_t *cf); +static int 
tidesdb_recover_database(tidesdb_t *db);
+static int tidesdb_recover_column_family(tidesdb_column_family_t *cf);
+static void tidesdb_column_family_free(tidesdb_column_family_t *cf);
+static int tidesdb_unimap_load(tidesdb_t *db);
+static int tidesdb_unimap_persist(tidesdb_t *db);
+static void tidesdb_unimap_objstore_pull(tidesdb_t *db, int overwrite);
+static void tidesdb_unimap_resolve(tidesdb_t *db, const char *name, uint32_t *out_index,
+                                   int *out_is_new);
+static void tidesdb_unimap_remove(tidesdb_t *db, const char *name);
+static void tidesdb_unimap_rename(tidesdb_t *db, const char *old_name, const char *new_name);
+static void tidesdb_unimap_free(tidesdb_t *db);
+
+/**
+ * tidesdb_worker_thread_arg_t
+ * thread argument for pooled workers (flush/compaction) to pass db handle and thread index.
+ * the worker entry points declared below take a void *arg; presumably it points at one of
+ * these -- confirm against the thread-creation code.
+ * @member db database handle
+ * @member index thread index within the pool
+ */
+typedef struct
+{
+    tidesdb_t *db;
+    int index;
+} tidesdb_worker_thread_arg_t;
+
+static void *tidesdb_flush_worker_thread(void *arg);
+static int tidesdb_unified_flush_immutable(tidesdb_t *db, tidesdb_memtable_t *umt_imm);
+static int tidesdb_unified_write_cf_sstable(tidesdb_t *db, tidesdb_column_family_t *cf,
+                                            skip_list_t *unified_sl, uint32_t cf_index,
+                                            int entry_count);
+static void tidesdb_unified_flush_barrier_finish(tidesdb_unified_flush_barrier_t *barrier);
+static void tidesdb_immutable_memtable_unref(tidesdb_immutable_memtable_t *imm);
+static int tidesdb_unified_memtable_rotate(tidesdb_t *db);
+static void *tidesdb_compaction_worker_thread(void *arg);
+static void tidesdb_ensure_btree_node_cache(tidesdb_t *db);
+static void *tidesdb_sync_worker_thread(void *arg);
+static void *tidesdb_reaper_thread(void *arg);
+static void *tidesdb_replica_sync_thread(void *arg);
+static tidesdb_kv_pair_t *tidesdb_kv_pair_create(const uint8_t *key, size_t key_size,
+                                                 const uint8_t *value, size_t value_size,
+                                                 time_t ttl, uint64_t seq, uint8_t tombstone_flags);
+static void 
tidesdb_kv_pair_free(tidesdb_kv_pair_t *kv);
+static int tidesdb_iter_kv_visible(tidesdb_iter_t *iter, tidesdb_kv_pair_t *kv);
+static int tidesdb_sstable_ensure_open(tidesdb_t *db, tidesdb_sstable_t *sst);
+static int tidesdb_sstable_ensure_klog_open(tidesdb_t *db, tidesdb_sstable_t *sst);
+static int tidesdb_sstable_ensure_vlog_open(tidesdb_t *db, tidesdb_sstable_t *sst);
+static int wait_for_open(tidesdb_t *db);
+
+/**
+ * tidesdb_cf_abort_requested
+ * in-flight COMPACTIONS call this at per-key (or per-partition) checkpoints so they
+ * bail without finishing the merge -- used by both drop_column_family (CF going away)
+ * and tidesdb_cancel_background_work (db-wide compaction cancel for a fast shutdown).
+ * either way the merge discards its uncommitted output and leaves inputs intact, so
+ * abort is safe. acquire pairs with the release stores in
+ * tidesdb_drop_column_family_internal and tidesdb_cancel_background_work.
+ * NOTE: flush checkpoints deliberately do not use this -- they check
+ * marked_for_deletion directly so a cancel never aborts an in-flight flush (flushes
+ * are the durability path and always complete).
+ * the per-CF flag is tested first; the db-wide flag is only read when it is clear.
+ * @param cf column family; dereferenced unconditionally, so it must not be NULL. cf->db may
+ *           be NULL, in which case only the per-CF flag is consulted
+ * @return 1 if this CF's compaction should abort, 0 otherwise
+ */
+static inline int tidesdb_cf_abort_requested(const tidesdb_column_family_t *cf)
+{
+    if (atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire) != 0) return 1;
+    return (cf->db && atomic_load_explicit(&cf->db->cancel_compaction, memory_order_acquire) != 0);
+}
+
+/**
+ * tdb_cf_effective_l1_trigger
+ * file-count threshold for L1 compaction. object-store mode with lazy compaction wants
+ * to absorb more L1 files before triggering to amortise remote I/O, so the threshold
+ * doubles in that case. callers must use this everywhere they previously compared
+ * against l1_file_count_trigger directly, otherwise backpressure and compaction logic
+ * drift out of agreement (e.g. 
backpressure would throttle the writer before the lazy
+ * compaction even queues).
+ * @param cf column family
+ * @return effective L1 file-count trigger
+ */
+static inline int tdb_cf_effective_l1_trigger(const tidesdb_column_family_t *cf)
+{
+    int trigger = cf->config.l1_file_count_trigger;
+    /* doubled only when all three hold: a db is attached, it has an object store, and this CF
+     * has object_lazy_compaction set */
+    if (cf->db && cf->db->object_store && cf->config.object_lazy_compaction) trigger *= 2;
+    return trigger;
+}
+
+/**
+ * tdb_cf_effective_stall
+ * L0 stall threshold scaled for multi-CF deployments. the configured value assumes
+ * single-CF usage; with N CFs sharing the global memory budget the per-CF cap is
+ * memory_limit / (N * write_buffer_size). when that cap is below the configured threshold
+ * the cap is used instead, clamped to a minimum of 2. callers use
+ * this in the apply_backpressure ladder and the adaptive-flush threshold so both
+ * sites see the same scaled value.
+ * the scaling is skipped -- and the configured value, floored at 1, returned as is -- when
+ * cf->db is NULL, unified memtable mode is enabled, num_column_families <= 1, or either
+ * resolved_memory_limit or write_buffer_size is 0.
+ * @param cf column family
+ * @return effective stall threshold, always >= 1 (>= 2 whenever the multi-CF cap is applied)
+ */
+static inline size_t tdb_cf_effective_stall(const tidesdb_column_family_t *cf)
+{
+    size_t stall = (size_t)cf->config.l0_queue_stall_threshold;
+    /* floor at 1 -- a configured 0 would make `depth >= stall` true even on an empty
+     * queue, stalling every commit until the BUSY timeout. matches the floor in
+     * tdb_cf_immutable_hard_cap. the multi-CF branch below applies its own (>=2) floor. */
+    if (stall < 1) stall = 1;
+    if (cf->db)
+    {
+        /* unified mode has a single shared immutable queue, so the per-CF
+         * memory-budget split below does not apply -- use the configured value */
+        if (cf->db->unified_mt.enabled) return stall;
+
+        const int num_cfs = cf->db->num_column_families;
+        if (num_cfs > 1)
+        {
+            const size_t mem_limit =
+                atomic_load_explicit(&cf->db->resolved_memory_limit, memory_order_relaxed);
+            const size_t arena_size = cf->config.write_buffer_size;
+            if (mem_limit > 0 && arena_size > 0)
+            {
+                const size_t per_cf_budget = mem_limit / ((size_t)num_cfs * arena_size);
+                if (per_cf_budget < stall) stall = per_cf_budget < 2 ? 2 : per_cf_budget;
+            }
+        }
+    }
+    return stall;
+}
+
+/**
+ * tdb_cf_immutable_hard_cap
+ * last-resort ceiling on the immutable-queue depth, the configured L0 stall
+ * threshold plus headroom for in-flight freezes. derived from config so that
+ * raising l0_queue_stall_threshold raises this in lockstep -- there is no hidden
+ * constant ceiling that silently clamps a larger configured threshold. the
+ * lock-free snapshot array (tidesdb_imm_snap_publish_locked) grows to fit, so
+ * the queue is bounded by the operator's threshold, never by a fixed array size.
+ * the base is the configured threshold floored at 1, not the multi-CF scaled value from
+ * tdb_cf_effective_stall.
+ * @param cf column family
+ * @return hard-cap depth: the floored threshold plus TDB_IMM_QUEUE_HEADROOM. greater than the
+ *         configured threshold provided TDB_IMM_QUEUE_HEADROOM is positive (its value is not
+ *         visible from here)
+ */
+static inline size_t tdb_cf_immutable_hard_cap(const tidesdb_column_family_t *cf)
+{
+    size_t stall = (size_t)cf->config.l0_queue_stall_threshold;
+    if (stall < 1) stall = 1;
+    return stall + TDB_IMM_QUEUE_HEADROOM;
+}
+
+/**
+ * tidesdb_txn_mem_publish
+ * reflect a transaction's accumulated op + arena memory into the database-wide
+ * txn_memory_bytes counter, but only in coarse threshold-sized batches so the
+ * per-op write/read paths never touch the shared atomic. mem_bytes is mutated
+ * only by the owning thread (a txn is single-threaded), so it needs no atomic;
+ * the global counter is updated by the net delta when it crosses the threshold.
+ * the full published amount is reconciled back at txn free/reset, so this can
+ * never drift the global counter even if a per-op delta is mis-estimated.
+ * @param txn the transaction whose memory delta to (maybe) publish
+ */
+static inline void tidesdb_txn_mem_publish(tidesdb_txn_t *txn)
+{
+    const int64_t delta = txn->mem_bytes - txn->mem_published; /* change not yet reflected in
+                                                                * the global counter; negative
+                                                                * when the txn has shrunk */
+    if (delta >= TDB_TXN_MEM_PUBLISH_THRESHOLD || delta <= -TDB_TXN_MEM_PUBLISH_THRESHOLD)
+    {
+        atomic_fetch_add_explicit(&txn->db->txn_memory_bytes, delta, memory_order_relaxed);
+        txn->mem_published = txn->mem_bytes; /* the whole delta is now accounted for */
+    }
+}
+
+/**
+ * tdb_unified_dispatch_skip_segment
+ * advance cursor past every remaining entry whose 4-byte cf_index prefix matches cf_index.
+ * used by the unified flush dispatcher when the resolved CF is gone (transition lookup
+ * failed) or when the CF was marked for deletion mid-segment. cheaper than letting the
+ * outer dispatcher loop iterate the segment one entry at a time with its full branching.
+ * the cursor is advanced before the first comparison, so the entry it is positioned on when
+ * this is called is never examined.
+ * @param cursor cursor positioned somewhere inside the segment to skip
+ * @param cf_index CF index identifying the segment; compared against the big-endian 4-byte
+ *                 prefix decoded from each key
+ * @return 1 if cursor now points at the first entry with a different prefix (caller
+ *         should reprocess it), 0 if the cursor exhausted the skip list, if reading the
+ *         current entry failed, or if a key shorter than TDB_UNIFIED_CF_PREFIX_SIZE was met
+ */
+static int tdb_unified_dispatch_skip_segment(skip_list_cursor_t *cursor, uint32_t cf_index)
+{
+    uint8_t *raw_key, *value;
+    size_t raw_key_size, value_size;
+    int64_t ttl;
+    uint8_t deleted;
+    uint64_t seq;
+
+    while (skip_list_cursor_next(cursor) == 0)
+    {
+        if (skip_list_cursor_get_with_seq(cursor, &raw_key, &raw_key_size, &value, &value_size,
+                                          &ttl, &deleted, &seq) != 0)
+            return 0;
+        if (raw_key_size < TDB_UNIFIED_CF_PREFIX_SIZE) return 0; /* too short to carry a prefix */
+        if (tdb_decode_be32(raw_key) != cf_index) return 1;
+    }
+    return 0;
+}
+
+/**
+ * tdb_cf_flush_match
+ * queue_remove_if predicate matching flush work items that target the given CF.
+ * unified umt-imm dispatch items have work->cf == NULL and are never matched.
+ */ +static int tdb_cf_flush_match(void *data, void *context) +{ + const tidesdb_flush_work_t *work = (const tidesdb_flush_work_t *)data; + return work && work->cf == (const tidesdb_column_family_t *)context; +} + +/** + * tdb_cf_flush_release + * queue_remove_if on_remove handler for swept flush work. mirrors the worker's + * marked-for-deletion skip path so counters stay balanced. + */ +static void tdb_cf_flush_release(void *data, void *context) +{ + (void)context; + tidesdb_flush_work_t *work = (tidesdb_flush_work_t *)data; + if (!work) return; + + tidesdb_column_family_t *cf = work->cf; + tidesdb_t *db = cf ? cf->db : NULL; + + if (work->unified_barrier) + { + /* unified_sl is borrowed by the work item (the immutable owns it) -- just drop our share + * of the barrier so the last finisher can still close the unified wal */ + tidesdb_unified_flush_barrier_finish(work->unified_barrier); + } + else if (work->imm) + { + tidesdb_immutable_memtable_unref(work->imm); + if (db) atomic_fetch_sub_explicit(&db->active_flushes, 1, memory_order_release); + } + + if (db) atomic_fetch_sub_explicit(&db->flush_pending_count, 1, memory_order_release); + if (cf) atomic_fetch_sub_explicit(&cf->flush_pending_count, 1, memory_order_release); + free(work); +} + +/** + * tdb_cf_compaction_match + * queue_remove_if predicate matching compaction work items that target the given CF. + */ +static int tdb_cf_compaction_match(void *data, void *context) +{ + const tidesdb_compaction_work_t *work = (const tidesdb_compaction_work_t *)data; + return work && work->cf == (const tidesdb_column_family_t *)context; +} + +/** + * tdb_cf_compaction_release + * queue_remove_if on_remove handler for swept compaction work. mirrors the worker's + * marked-for-deletion skip path. 
+ */ +static void tdb_cf_compaction_release(void *data, void *context) +{ + (void)context; + tidesdb_compaction_work_t *work = (tidesdb_compaction_work_t *)data; + if (!work) return; + if (work->cf) + atomic_fetch_sub_explicit(&work->cf->compaction_pending_count, 1, memory_order_release); + free(work->steer_min_key); + free(work->steer_max_key); + free(work); +} + +/** + * tidesdb_ref_counted_block_t + * reference-counted wrapper for deserialized blocks (thread-safe shared access) + * @member block pointer to deserialized block + * @member ref_count number of active references + * @member block_memory memory footprint for accounting + */ +struct tidesdb_ref_counted_block_t +{ + tidesdb_klog_block_t *block; + atomic_int ref_count; + size_t block_memory; +}; + +/** + * tidesdb_block_release + * decrement reference count and free if no more references + * @param rc_block block to release + */ +static void tidesdb_block_release(tidesdb_ref_counted_block_t *rc_block) +{ + if (!rc_block) return; + + int old_count = atomic_fetch_sub_explicit(&rc_block->ref_count, 1, memory_order_release); + if (old_count == 1) + { + /* last reference released, its safe to free */ + atomic_thread_fence(memory_order_acquire); + if (rc_block->block) + { + tidesdb_klog_block_free(rc_block->block); + } + free(rc_block); + } +} + +/** + * tidesdb_cache_evict_block + * eviction callback for block cache -- no-op since we now cache raw bytes + * which are stored inline in the clock_cache entry and freed automatically. + * kept as a named function for documentation and future extensibility. 
+ * @param payload pointer to raw block bytes being evicted + * @param payload_len size of payload + */ +static void tidesdb_cache_evict_block(void *payload, const size_t payload_len) +{ + (void)payload; + (void)payload_len; +} + +/** + * tidesdb_block_cache_key + * generate a cache key for a block + * @param cf_name column family name + * @param klog_filename filename portion of klog path (past last separator) + * @param block_position position of block in klog + * @param key_buffer buffer to store the cache key + * @param buffer_size size of key_buffer + * @return length of the generated key, 0 on error + * + * format "cf_name:filename:block_position" + * example "users:L2P3_1336.klog:0", "users:L2P3_1337.klog:65536" + * eses filename instead of full path for shorter cache keys + */ +static size_t tidesdb_block_cache_key(const char *cf_name, const char *klog_filename, + const uint64_t block_position, char *key_buffer, + const size_t buffer_size) +{ + if (!cf_name || !klog_filename || !key_buffer || buffer_size == 0) return 0; + + const char *filename = klog_filename; + + /* fast path -- memcpy + hex encode instead of snprintf + * format is "cf_namefilenameXXXXXXXXXXXXXXXX" (TDB_CACHE_KEY_HEX_DIGITS chars for + * uint64) */ + const size_t cf_len = strlen(cf_name); + const size_t fn_len = strlen(filename); + const size_t needed = cf_len + 1 + fn_len + 1 + TDB_CACHE_KEY_HEX_DIGITS; + if (needed >= buffer_size) return 0; + + char *p = key_buffer; + memcpy(p, cf_name, cf_len); + p += cf_len; + *p++ = TDB_CACHE_KEY_SEPARATOR; + memcpy(p, filename, fn_len); + p += fn_len; + *p++ = TDB_CACHE_KEY_SEPARATOR; + + /* we encode block_position as TDB_CACHE_KEY_HEX_DIGITS hex chars (avoids costly integer + * formatting) */ + static const char hex_chars[] = "0123456789abcdef"; + uint64_t pos = block_position; + for (int i = TDB_CACHE_KEY_HEX_DIGITS - 1; i >= 0; i--) + { + p[i] = hex_chars[pos & 0xF]; + pos >>= 4; + } + p += TDB_CACHE_KEY_HEX_DIGITS; + *p = '\0'; + + return 
(size_t)(p - key_buffer); +} + +/** + * tidesdb_cache_raw_block_put + * caches raw block bytes (compressed or uncompressed) directly in the clock cache. + * raw bytes are stored inline -- no ref counting needed, no deserialization overhead. + * @param db the database + * @param cf_name column family name + * @param klog_filename filename portion of klog path (past last separator) + * @param block_position position of block in file + * @param block_data raw block bytes (from pread, before or after decompression) + * @param block_size size of block data + * @return 0 on success, -1 on failure + */ +static int tidesdb_cache_raw_block_put(tidesdb_t *db, const char *cf_name, + const char *klog_filename, const uint64_t block_position, + const void *block_data, const size_t block_size) +{ + if (!db || !db->clock_cache || !cf_name || !klog_filename || !block_data || block_size == 0) + return -1; + + char cache_key[TDB_CACHE_KEY_SIZE]; + const size_t key_len = tidesdb_block_cache_key(cf_name, klog_filename, block_position, + cache_key, sizeof(cache_key)); + if (key_len == 0) return -1; + + /* we cache the raw bytes directly -- clock_cache copies them inline. + * use put_new since we just did a cache miss lookup (key is absent). */ + return clock_cache_put_new(db->clock_cache, cache_key, key_len, block_data, block_size, 0); +} + +/** + * tidesdb_cache_raw_block_get_pinned + * zero-copy cache access -- returns a direct pointer into the cache entry's payload + * without malloc or memcpy. the cache entry is pinned (reader ref held) so it + * cannot be evicted while the caller uses the data. caller must call + * clock_cache_release(*pin_out) when done with the returned pointer. 
+ * @param db the database + * @param cf_name column family name + * @param klog_filename filename portion of klog path + * @param block_position position of block in file + * @param out_size output parameter for the size of the returned data + * @param pin_out output parameter for cache entry handle (caller must release) + * @return const pointer into cache payload, or NULL on miss + */ +static const uint8_t *tidesdb_cache_raw_block_get_pinned(tidesdb_t *db, const char *cf_name, + const char *klog_filename, + const uint64_t block_position, + size_t *out_size, + clock_cache_entry_t **pin_out) +{ + if (!db || !db->clock_cache || !cf_name || !klog_filename || !out_size || !pin_out) return NULL; + + char cache_key[TDB_CACHE_KEY_SIZE]; + const size_t key_len = tidesdb_block_cache_key(cf_name, klog_filename, block_position, + cache_key, sizeof(cache_key)); + if (key_len == 0) return NULL; + + size_t payload_len = 0; + clock_cache_entry_t *entry = NULL; + const uint8_t *data = + clock_cache_get_zero_copy(db->clock_cache, cache_key, key_len, &payload_len, &entry); + if (!data || payload_len == 0) + { + return NULL; + } + + *out_size = payload_len; + *pin_out = entry; + return data; +} + +/** + * tidesdb_get_cf_name_from_path + * extracts column family name from sstable path + * @param path the sstable path (e.g., "/path/to/cf_name/L2P3_1337.klog") + * @param cf_name_out buffer to store CF name (must be at least TDB_CACHE_KEY_SIZE bytes) + * @return 0 on success, -1 on failure + * + * this method handles both '/' and '\\' separators for cross-platform portability. + * a database created on linux (using '/') must be readable on windows (using '\\') and vice versa. 
+ */ +static int tidesdb_get_cf_name_from_path(const char *path, char *cf_name_out) +{ + if (!path || !cf_name_out) return -1; + const char sep_unix = '/'; + const char sep_windows = '\\'; + + /* we find the last directory separator (we check both types for portability) */ + const char *last_slash = strrchr(path, sep_unix); + const char *last_backslash = strrchr(path, sep_windows); + const char *last_sep = (last_slash > last_backslash) ? last_slash : last_backslash; + if (!last_sep) return -1; + + /* we find the second-to-last directory separator */ + const char *second_last_sep = last_sep - 1; + while (second_last_sep > path && *second_last_sep != sep_unix && + *second_last_sep != sep_windows) + { + second_last_sep--; + } + + if (*second_last_sep != sep_unix && *second_last_sep != sep_windows) return -1; + size_t cf_name_len = last_sep - second_last_sep - 1; + if (cf_name_len >= TDB_CACHE_KEY_SIZE) cf_name_len = TDB_CACHE_KEY_SIZE - 1; + + memcpy(cf_name_out, second_last_sep + 1, cf_name_len); + cf_name_out[cf_name_len] = '\0'; + + return 0; +} + +/** + * tidesdb_read_block + * reads and decompresses a block from disk + * @param db the database + * @param sst the sstable (for compression config) + * @param cursor the block manager cursor + * @return the decompressed block if successful, NULL otherwise + */ +static block_manager_block_t *tidesdb_read_block(tidesdb_t *db, tidesdb_sstable_t *sst, + block_manager_cursor_t *cursor) +{ + if (!db || !sst || !cursor) return NULL; + + block_manager_block_t *block = block_manager_cursor_read(cursor); + if (!block) return NULL; + + if (sst->config && sst->config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t decompressed_size; + uint8_t *decompressed = decompress_data(block->data, block->size, &decompressed_size, + sst->config->compression_algorithm); + if (decompressed) + { + /* we replace compressed data with decompressed data in the block. + * skip free if data is inline-allocated with the block struct. 
*/ + if (!block->inline_data) free(block->data); + block->data = decompressed; + block->size = decompressed_size; + block->inline_data = 0; + } + else + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Decompression failed for SSTable %s (id=%" PRIu64 + ") " + "compression=%u block_size=%zu", + sst->klog_path ? sst->klog_path : "unknown", sst->id, + (unsigned int)sst->config->compression_algorithm, (size_t)block->size); + block_manager_block_release(block); + return NULL; + } + } + + return block; +} + +/** + * tidesdb_read_block_and_advance + * reads, decompresses a block from disk, and advances cursor in one operation + * more efficient than tidesdb_read_block + cursor_next as it avoids redundant pread + * @param db the database + * @param sst the sstable (for compression config) + * @param cursor the block manager cursor (will be advanced) + * @return the decompressed block if successful, NULL otherwise + */ +static block_manager_block_t *tidesdb_read_block_and_advance(tidesdb_t *db, tidesdb_sstable_t *sst, + block_manager_cursor_t *cursor) +{ + if (!db || !sst || !cursor) return NULL; + + block_manager_block_t *block = block_manager_cursor_read_and_advance(cursor); + if (!block) return NULL; + + if (sst->config && sst->config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t decompressed_size; + uint8_t *decompressed = decompress_data(block->data, block->size, &decompressed_size, + sst->config->compression_algorithm); + if (decompressed) + { + if (!block->inline_data) free(block->data); + block->data = decompressed; + block->size = decompressed_size; + block->inline_data = 0; + } + else + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Decompression failed for SSTable %s (id=%" PRIu64 + ") compression=%u block_size=%zu", + sst->klog_path ? 
sst->klog_path : "unknown", sst->id, + (unsigned int)sst->config->compression_algorithm, (size_t)block->size); + block_manager_block_release(block); + return NULL; + } + } + + return block; +} + +/** + * tidesdb_check_disk_space + * check if theres enough free disk space using cached value + * refreshes cache every DISK_SPACE_CHECK_INTERVAL_SECONDS seconds to avoid expensive statvfs calls + * @param db database handle + * @param path directory path to check + * @param min_required minimum required free space in bytes + * @return 1 if enough space, 0 if not enough, -1 on error + */ +static int tidesdb_check_disk_space(tidesdb_t *db, const char *path, uint64_t min_required) +{ + if (!db) return -1; + + time_t now = atomic_load_explicit(&db->cached_current_time, memory_order_relaxed); + time_t last_check = atomic_load_explicit(&db->last_disk_space_check, memory_order_relaxed); + + if (now - last_check >= TDB_DISK_SPACE_CHECK_INTERVAL_SECONDS) + { + uint64_t available; + if (tdb_get_available_disk_space(path, &available) == 0) + { + atomic_store_explicit(&db->cached_available_disk_space, available, + memory_order_relaxed); + atomic_store_explicit(&db->last_disk_space_check, now, memory_order_relaxed); + } + else + { + return -1; + } + } + + uint64_t available = + atomic_load_explicit(&db->cached_available_disk_space, memory_order_relaxed); + return (available >= min_required) ? 1 : 0; +} + +/** + * tidesdb_validate_kv_size + * validates that a key-value pair size does not exceed memory limits + * maximum allowed size is max(resolved_memory_limit * TDB_MEMORY_PERCENTAGE, + * TDB_MIN_KEY_VALUE_SIZE) + * @param db database handle + * @param key_size size of key in bytes + * @param value_size size of value in bytes + * @return 0 if valid, TDB_ERR_MEMORY_LIMIT if too large + */ +static int tidesdb_validate_kv_size(tidesdb_t *db, const size_t key_size, const size_t value_size) +{ + if (!db) return TDB_ERR_INVALID_ARGS; + + /* we enforce architectural limit! 
all sizes are uint32_t */ + if (key_size > TDB_MAX_KEY_VALUE_SIZE) + { + TDB_DEBUG_LOG(TDB_LOG_FATAL, "Key size (%zu bytes) exceeds TDB_MAX_KEY_VALUE_SIZE", + key_size); + return TDB_ERR_INVALID_ARGS; + } + if (value_size > TDB_MAX_KEY_VALUE_SIZE) + { + TDB_DEBUG_LOG(TDB_LOG_FATAL, "Value size (%zu bytes) exceeds TDB_MAX_KEY_VALUE_SIZE", + value_size); + return TDB_ERR_INVALID_ARGS; + } + + /* we check for overflow before doing addition */ + if (key_size > TDB_MAX_KEY_VALUE_SIZE - value_size) + { + TDB_DEBUG_LOG(TDB_LOG_FATAL, + "Total key+value size (key: %zu + value: %zu) exceeds TDB_MAX_KEY_VALUE_SIZE", + key_size, value_size); + return TDB_ERR_INVALID_ARGS; + } + + const size_t total_size = key_size + value_size; + + /* we use resolved_memory_limit (stable, periodically enforced by reaper) instead of + * available_memory which is a stale snapshot from open time and drifts over the DB lifetime */ + const size_t mem_limit = atomic_load_explicit(&db->resolved_memory_limit, memory_order_relaxed); + const uint64_t memory_based_limit = (uint64_t)((double)mem_limit * TDB_MEMORY_PERCENTAGE); + const uint64_t max_allowed_size = + memory_based_limit > TDB_MIN_KEY_VALUE_SIZE ? memory_based_limit : TDB_MIN_KEY_VALUE_SIZE; + + if (total_size > max_allowed_size) + { + TDB_DEBUG_LOG(TDB_LOG_FATAL, + "Key-value pair size (%zu bytes) exceeds memory limit (%" PRIu64 + " bytes, based on resolved memory limit: %zu bytes)", + total_size, max_allowed_size, mem_limit); + return TDB_ERR_MEMORY_LIMIT; + } + + return 0; +} + +/* the on-disk sstable metadata header is serialized/deserialized field-by-field with the + * encode_*_le_compat helpers in sstable_metadata_serialize / sstable_metadata_deserialize -- + * see those functions (and design/tidesdb_sstable_format.md S7.4) for the authoritative wire + * layout. a stale `sstable_metadata_header_t` struct used to live here but was never + * referenced and omitted two serialized fields (klog_data_end_offset, max_seq); removed. 
*/ + +/* sstable metadata flags */ +#define SSTABLE_FLAG_BTREE 0x01 /* sstable uses btree format instead of klog blocks */ +#define SSTABLE_FLAG_TOMBSTONE_COUNT \ + 0x02 /* footer carries an 8-byte tombstone_count after the \ + * btree section (or after max_key when use_btree=0) \ + * and before the trailing checksum */ +#define SSTABLE_FLAG_CHUNKED_AUX \ + 0x04 /* footer carries a 32-byte chunked-aux descriptor (bloom blob \ + * offset+size, index blob offset+size) after tombstone_count. \ + * present when a bloom/index blob spans multiple blocks; absent \ + * sstables locate bloom/index by trailing-block navigation */ + +/** + * sstable_metadata_serialize + * @param sst sstable to serialize + * @param out_data output data + * @param out_size output size + * @return 0 on success, -1 on failure + */ +static int sstable_metadata_serialize(tidesdb_sstable_t *sst, uint8_t **out_data, size_t *out_size) +{ + if (!sst || !out_data || !out_size) return -1; + + /* we calculate size -- header + keys + btree metadata (if applicable) + tombstone count + * + checksum */ + const size_t header_size = TDB_SSTABLE_METADATA_HEADER_SIZE; + const size_t checksum_size = TDB_SSTABLE_METADATA_CHECKSUM_SIZE; + + size_t btree_meta_size = 0; + if (sst->use_btree) + { + btree_meta_size = TDB_SSTABLE_METADATA_BTREE_SIZE; + } + + const size_t tombstone_meta_size = TDB_SSTABLE_METADATA_TOMBSTONE_SIZE; + + const size_t chunked_aux_size = sst->aux_chunked ? 
TDB_SSTABLE_METADATA_CHUNKED_AUX_SIZE : 0; + + const size_t total_size = header_size + sst->min_key_size + sst->max_key_size + + btree_meta_size + tombstone_meta_size + chunked_aux_size + + checksum_size; + + uint8_t *data = malloc(total_size); + if (!data) return -1; + + uint8_t *ptr = data; + + /* we serialize fields with explicit little-endian encoding */ + encode_uint32_le_compat(ptr, TDB_SSTABLE_METADATA_MAGIC); + ptr += 4; + encode_uint64_le_compat(ptr, sst->num_entries); + ptr += 8; + encode_uint64_le_compat(ptr, sst->num_klog_blocks); + ptr += 8; + encode_uint64_le_compat(ptr, sst->num_vlog_blocks); + ptr += 8; + encode_uint64_le_compat(ptr, sst->klog_data_end_offset); + ptr += 8; + encode_uint64_le_compat(ptr, sst->klog_size); + ptr += 8; + encode_uint64_le_compat(ptr, sst->vlog_size); + ptr += 8; + encode_uint64_le_compat(ptr, sst->min_key_size); + ptr += 8; + encode_uint64_le_compat(ptr, sst->max_key_size); + ptr += 8; + encode_uint64_le_compat(ptr, sst->max_seq); /* maximum sequence number */ + ptr += 8; + encode_uint32_le_compat(ptr, sst->config->compression_algorithm); + ptr += 4; + + /* flags field -- we set SSTABLE_FLAG_BTREE if using btree, and always set + * SSTABLE_FLAG_TOMBSTONE_COUNT for sstables produced by this build */ + uint32_t flags = SSTABLE_FLAG_TOMBSTONE_COUNT; + if (sst->use_btree) + { + flags |= SSTABLE_FLAG_BTREE; + } + if (sst->aux_chunked) + { + flags |= SSTABLE_FLAG_CHUNKED_AUX; + } + encode_uint32_le_compat(ptr, flags); + ptr += 4; + + if (sst->min_key && sst->min_key_size > 0) + { + memcpy(ptr, sst->min_key, sst->min_key_size); + ptr += sst->min_key_size; + } + if (sst->max_key && sst->max_key_size > 0) + { + memcpy(ptr, sst->max_key, sst->max_key_size); + ptr += sst->max_key_size; + } + + /* btree metadata (if applicable) */ + if (sst->use_btree) + { + encode_int64_le_compat(ptr, sst->btree_root_offset); + ptr += 8; + encode_int64_le_compat(ptr, sst->btree_first_leaf); + ptr += 8; + encode_int64_le_compat(ptr, 
sst->btree_last_leaf); + ptr += 8; + encode_uint64_le_compat(ptr, sst->btree_node_count); + ptr += 8; + encode_uint32_le_compat(ptr, sst->btree_height); + ptr += 4; + } + + encode_uint64_le_compat(ptr, sst->tombstone_count); + ptr += 8; + + /* chunked-aux descriptor (only when SSTABLE_FLAG_CHUNKED_AUX is set) */ + if (sst->aux_chunked) + { + encode_uint64_le_compat(ptr, sst->bloom_blob_offset); + ptr += 8; + encode_uint64_le_compat(ptr, sst->bloom_blob_size); + ptr += 8; + encode_uint64_le_compat(ptr, sst->index_blob_offset); + ptr += 8; + encode_uint64_le_compat(ptr, sst->index_blob_size); + ptr += 8; + } + + /* we compute and append checksum over everything except the checksum field itself */ + const size_t checksum_data_size = total_size - checksum_size; + const uint64_t checksum = XXH64(data, checksum_data_size, 0); + encode_uint64_le_compat(ptr, checksum); + + *out_data = data; + *out_size = total_size; + return 0; +} + +/** + * sstable_metadata_deserialize + * deserialize sstable metadata + * @param data data to deserialize + * @param data_size data size + * @param sst sstable to deserialize + * @return 0 on success, -1 on failure + */ +static int sstable_metadata_deserialize(const uint8_t *data, const size_t data_size, + tidesdb_sstable_t *sst) +{ + if (!data || !sst || data_size < TDB_SSTABLE_METADATA_FIXED_SIZE) return -1; + + const uint8_t *ptr = data; + + const uint32_t magic = decode_uint32_le_compat(ptr); + ptr += 4; + + if (magic != TDB_SSTABLE_METADATA_MAGIC) + { + TDB_DEBUG_LOG(TDB_LOG_FATAL, + "SSTable metadata has an invalid magic 0x%08x (expected 0x%08x)", magic, + TDB_SSTABLE_METADATA_MAGIC); + return -1; + } + + const uint64_t num_entries = decode_uint64_le_compat(ptr); + ptr += 8; + const uint64_t num_klog_blocks = decode_uint64_le_compat(ptr); + ptr += 8; + const uint64_t num_vlog_blocks = decode_uint64_le_compat(ptr); + ptr += 8; + const uint64_t klog_data_end_offset = decode_uint64_le_compat(ptr); + ptr += 8; + const uint64_t klog_size = 
decode_uint64_le_compat(ptr); + ptr += 8; + const uint64_t vlog_size = decode_uint64_le_compat(ptr); + ptr += 8; + const uint64_t min_key_size = decode_uint64_le_compat(ptr); + ptr += 8; + const uint64_t max_key_size = decode_uint64_le_compat(ptr); + ptr += 8; + + const uint64_t max_seq = decode_uint64_le_compat(ptr); + ptr += 8; + + const uint32_t compression_algorithm = decode_uint32_le_compat(ptr); + ptr += 4; + + const uint32_t flags = decode_uint32_le_compat(ptr); + ptr += 4; + + const int use_btree = (flags & SSTABLE_FLAG_BTREE) ? 1 : 0; + const int has_tombstone_count = (flags & SSTABLE_FLAG_TOMBSTONE_COUNT) ? 1 : 0; + const int has_chunked_aux = (flags & SSTABLE_FLAG_CHUNKED_AUX) ? 1 : 0; + + /* we calculate expected size based on which optional sections the flags promise */ + size_t btree_meta_size = 0; + + if (use_btree) + { + btree_meta_size = TDB_SSTABLE_METADATA_BTREE_SIZE; + } + + const size_t tombstone_meta_size = + has_tombstone_count ? TDB_SSTABLE_METADATA_TOMBSTONE_SIZE : 0; + + const size_t chunked_aux_size = has_chunked_aux ? 
TDB_SSTABLE_METADATA_CHUNKED_AUX_SIZE : 0; + + const size_t expected_size = TDB_SSTABLE_METADATA_FIXED_SIZE + min_key_size + max_key_size + + btree_meta_size + tombstone_meta_size + chunked_aux_size; + if (data_size != expected_size) + { + TDB_DEBUG_LOG(TDB_LOG_FATAL, "SSTable metadata size mismatch (expected: %zu, got: %zu)", + expected_size, data_size); + return -1; + } + + /* we verify checksum over everything except checksum field */ + const size_t checksum_data_size = data_size - TDB_SSTABLE_METADATA_CHECKSUM_SIZE; + const uint64_t computed_checksum = XXH64(data, checksum_data_size, 0); + + /* we checksum is at the end of the data */ + const uint8_t *checksum_ptr = data + data_size - TDB_SSTABLE_METADATA_CHECKSUM_SIZE; + const uint64_t stored_checksum = decode_uint64_le_compat(checksum_ptr); + + if (computed_checksum != stored_checksum) + { + TDB_DEBUG_LOG(TDB_LOG_FATAL, + "SSTable metadata checksum mismatch (expected: %" PRIu64 ", got: %" PRIu64 + ")", + stored_checksum, computed_checksum); + return -1; + } + + sst->num_entries = num_entries; + sst->num_klog_blocks = num_klog_blocks; + sst->num_vlog_blocks = num_vlog_blocks; + sst->klog_data_end_offset = klog_data_end_offset; + sst->klog_size = klog_size; + sst->vlog_size = vlog_size; + sst->max_seq = max_seq; /* assign recovered max sequence number */ + sst->use_btree = use_btree; + + /* we restore compression algorithm from metadata */ + if (sst->config) + { + /* we validate compression algorithm value */ + if (compression_algorithm != TDB_COMPRESS_NONE && +#ifndef __sun + compression_algorithm != TDB_COMPRESS_SNAPPY && +#endif + compression_algorithm != TDB_COMPRESS_LZ4 && + compression_algorithm != TDB_COMPRESS_LZ4_FAST && + compression_algorithm != TDB_COMPRESS_ZSTD) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable metadata has invalid compression_algorithm: %u", + compression_algorithm); + return -1; + } + sst->config->compression_algorithm = compression_algorithm; + } + + /* we read keys */ + if 
(min_key_size > 0) + { + sst->min_key = malloc(min_key_size); + if (!sst->min_key) return -1; + memcpy(sst->min_key, ptr, min_key_size); + sst->min_key_size = min_key_size; + ptr += min_key_size; + } + + if (max_key_size > 0) + { + sst->max_key = malloc(max_key_size); + if (!sst->max_key) + { + free(sst->min_key); + sst->min_key = NULL; + sst->min_key_size = 0; + return -1; + } + memcpy(sst->max_key, ptr, max_key_size); + sst->max_key_size = max_key_size; + ptr += max_key_size; + } + + /* we read btree metadata if present */ + if (use_btree) + { + sst->btree_root_offset = decode_int64_le_compat(ptr); + ptr += 8; + sst->btree_first_leaf = decode_int64_le_compat(ptr); + ptr += 8; + sst->btree_last_leaf = decode_int64_le_compat(ptr); + ptr += 8; + sst->btree_node_count = decode_uint64_le_compat(ptr); + ptr += 8; + sst->btree_height = decode_uint32_le_compat(ptr); + ptr += 4; + } + + if (has_tombstone_count) + { + sst->tombstone_count = decode_uint64_le_compat(ptr); + ptr += 8; + } + else + { + sst->tombstone_count = TDB_TOMBSTONE_COUNT_UNKNOWN; + } + + if (has_chunked_aux) + { + sst->aux_chunked = 1; + sst->bloom_blob_offset = decode_uint64_le_compat(ptr); + ptr += 8; + sst->bloom_blob_size = decode_uint64_le_compat(ptr); + ptr += 8; + sst->index_blob_offset = decode_uint64_le_compat(ptr); + ptr += 8; + sst->index_blob_size = decode_uint64_le_compat(ptr); + ptr += 8; + } + else + { + sst->aux_chunked = 0; + } + + return 0; +} + +/** + * tidesdb_resolve_comparator + * resolves a comparator function and context from config using the registry + * @param db database handle + * @param config column family config + * @param fn output parameter for comparator function + * @param ctx output parameter for comparator context + * @return 0 on success, -1 if comparator not found + */ +static int tidesdb_resolve_comparator(tidesdb_t *db, const tidesdb_column_family_config_t *config, + skip_list_comparator_fn *fn, void **ctx) +{ + if (!db || !config || !fn) return -1; + + if 
(config->comparator_fn_cached) + { + *fn = config->comparator_fn_cached; + if (ctx) *ctx = config->comparator_ctx_cached; + return 0; + } + + /* if we reach here, cached comparator is NULL but we need to resolve it */ + const int has_custom_comparator = + (config->comparator_name[0] != '\0' && strcmp(config->comparator_name, "memcmp") != 0); + + if (tidesdb_get_comparator(db, config->comparator_name, fn, ctx) != TDB_SUCCESS) + { + if (has_custom_comparator) + { + /* custom comparator specified but not in registry and not cached! + * this should never happen if CF creation validated properly. + * */ + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Comparator '%s' not found in registry and not cached", + config->comparator_name); + return -1; + } + + /* no comparator specified or explicitly requested memcmp, we use default */ + *fn = skip_list_comparator_memcmp; + if (ctx) *ctx = NULL; + return 0; + } + + return 0; +} + +int tidesdb_comparator_memcmp(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx) +{ + (void)ctx; + /* fast path -- equal size keys (most common case) */ + if (TDB_LIKELY(key1_size == key2_size)) + { + return memcmp(key1, key2, key1_size); + } + + /* slow path -- different size keys */ + const size_t min_size = key1_size < key2_size ? key1_size : key2_size; + const int cmp = memcmp(key1, key2, min_size); + if (cmp != 0) return cmp; + return (key1_size < key2_size) ? 
/**
 * tidesdb_comparator_memcmp
 * byte-wise comparison; when one key is a prefix of the other the shorter key sorts first
 * @param key1 first key
 * @param key1_size size of first key
 * @param key2 second key
 * @param key2_size size of second key
 * @param ctx unused
 * @return <0, 0 or >0 (memcmp's value when the common prefix differs or the sizes are
 *         equal, otherwise -1 / 1 by length)
 */
int tidesdb_comparator_memcmp(const uint8_t *key1, size_t key1_size, const uint8_t *key2,
                              size_t key2_size, void *ctx)
{
    (void)ctx;
    const size_t shared = key1_size < key2_size ? key1_size : key2_size;
    const int cmp = memcmp(key1, key2, shared);

    /* for equal sizes the common prefix is the whole key, so cmp is already the answer */
    if (cmp != 0 || key1_size == key2_size) return cmp;
    return (key1_size < key2_size) ? -1 : 1;
}

/**
 * tidesdb_comparator_lexicographic
 * compares the keys as C strings, with the end of each key buffer acting as an implicit
 * terminator. for NUL-terminated keys the sign of the result matches strcmp; keys without
 * a terminator are no longer read past their size.
 * @param key1 first key
 * @param key1_size size of first key
 * @param key2 second key
 * @param key2_size size of second key
 * @param ctx unused
 * @return <0, 0 or >0
 */
int tidesdb_comparator_lexicographic(const uint8_t *key1, size_t key1_size, const uint8_t *key2,
                                     size_t key2_size, void *ctx)
{
    (void)ctx;
    const size_t shared = key1_size < key2_size ? key1_size : key2_size;

    if (shared > 0)
    {
        const int cmp = strncmp((const char *)key1, (const char *)key2, shared);
        if (cmp != 0) return cmp;

        /* identical up to a terminator inside the shared prefix -- the strings are equal no
         * matter what follows it */
        if (memchr(key1, '\0', shared) != NULL) return 0;
    }

    if (key1_size == key2_size) return 0;

    /* one key ended at its buffer boundary; the other one is only "longer" as a string if
     * its next byte is not a terminator */
    if (key1_size < key2_size) return key2[shared] == '\0' ? 0 : -1;
    return key1[shared] == '\0' ? 0 : 1;
}

/**
 * tidesdb_comparator_uint64
 * compares two 8-byte keys as host-endian uint64_t values; keys of any other size fall
 * back to tidesdb_comparator_memcmp
 * @return -1, 0 or 1 for 8-byte keys
 */
int tidesdb_comparator_uint64(const uint8_t *key1, size_t key1_size, const uint8_t *key2,
                              size_t key2_size, void *ctx)
{
    (void)ctx;
    if (key1_size != sizeof(uint64_t) || key2_size != sizeof(uint64_t))
    {
        return tidesdb_comparator_memcmp(key1, key1_size, key2, key2_size, NULL);
    }

    uint64_t lhs, rhs;
    memcpy(&lhs, key1, sizeof(lhs));
    memcpy(&rhs, key2, sizeof(rhs));
    return (lhs > rhs) - (lhs < rhs);
}

/**
 * tidesdb_comparator_int64
 * compares two 8-byte keys as host-endian int64_t values; keys of any other size fall
 * back to tidesdb_comparator_memcmp
 * @return -1, 0 or 1 for 8-byte keys
 */
int tidesdb_comparator_int64(const uint8_t *key1, size_t key1_size, const uint8_t *key2,
                             size_t key2_size, void *ctx)
{
    (void)ctx;
    if (key1_size != sizeof(int64_t) || key2_size != sizeof(int64_t))
    {
        return tidesdb_comparator_memcmp(key1, key1_size, key2, key2_size, NULL);
    }

    int64_t lhs, rhs;
    memcpy(&lhs, key1, sizeof(lhs));
    memcpy(&rhs, key2, sizeof(rhs));
    return (lhs > rhs) - (lhs < rhs);
}

/**
 * tidesdb_comparator_reverse_memcmp
 * tidesdb_comparator_memcmp with the sign of the result flipped
 */
int tidesdb_comparator_reverse_memcmp(const uint8_t *key1, size_t key1_size, const uint8_t *key2,
                                      size_t key2_size, void *ctx)
{
    return -tidesdb_comparator_memcmp(key1, key1_size, key2, key2_size, ctx);
}

/**
 * tidesdb_comparator_case_insensitive
 * byte-wise comparison that folds ASCII 'A'..'Z' to lowercase first; when one key is a
 * prefix of the other the shorter key sorts first
 * @return -1, 0 or 1
 */
int tidesdb_comparator_case_insensitive(const uint8_t *key1, size_t key1_size, const uint8_t *key2,
                                        size_t key2_size, void *ctx)
{
    (void)ctx;
    const size_t shared = key1_size < key2_size ? key1_size : key2_size;

    for (size_t i = 0; i < shared; i++)
    {
        unsigned char a = key1[i];
        unsigned char b = key2[i];

        /* ASCII-only folding; other bytes compare as-is */
        if (a >= 'A' && a <= 'Z') a = (unsigned char)(a + ('a' - 'A'));
        if (b >= 'A' && b <= 'Z') b = (unsigned char)(b + ('a' - 'A'));

        if (a != b) return (a < b) ? -1 : 1;
    }

    return (key1_size > key2_size) - (key1_size < key2_size);
}
key1_size : key2_size; + + for (size_t i = 0; i < min_size; i++) + { + unsigned char c1 = key1[i]; + unsigned char c2 = key2[i]; + + /* we convert to lowercase for ASCII characters */ + if (c1 >= 'A' && c1 <= 'Z') c1 = c1 + ('a' - 'A'); + if (c2 >= 'A' && c2 <= 'Z') c2 = c2 + ('a' - 'A'); + + if (c1 < c2) return -1; + if (c1 > c2) return 1; + } + + if (key1_size < key2_size) return -1; + if (key1_size > key2_size) return 1; + return 0; +} + +tidesdb_column_family_config_t tidesdb_default_column_family_config(void) +{ + return (tidesdb_column_family_config_t){ + .write_buffer_size = TDB_DEFAULT_WRITE_BUFFER_SIZE, + .level_size_ratio = TDB_DEFAULT_LEVEL_SIZE_RATIO, + .min_levels = TDB_DEFAULT_MIN_LEVELS, + .dividing_level_offset = TDB_DEFAULT_DIVIDING_LEVEL_OFFSET, + .klog_value_threshold = TDB_DEFAULT_KLOG_VALUE_THRESHOLD, + .compression_algorithm = TDB_COMPRESS_LZ4, + .enable_bloom_filter = 1, + .bloom_fpr = TDB_DEFAULT_BLOOM_FPR, + .enable_block_indexes = 1, + .index_sample_ratio = TDB_DEFAULT_INDEX_SAMPLE_RATIO, + .block_index_prefix_len = TDB_DEFAULT_BLOCK_INDEX_PREFIX_LEN, + .sync_mode = TDB_SYNC_NONE, + .sync_interval_us = TDB_DEFAULT_SYNC_INTERVAL_US, + .comparator_fn_cached = NULL, + .comparator_ctx_cached = NULL, + .skip_list_max_level = TDB_SKIP_LIST_MAX_LEVEL, + .skip_list_probability = TDB_SKIP_LIST_PROBABILITY, + .default_isolation_level = TDB_ISOLATION_READ_COMMITTED, + .min_disk_space = TDB_DEFAULT_MIN_DISK_SPACE, + .l1_file_count_trigger = TDB_DEFAULT_L1_FILE_COUNT_TRIGGER, + .l0_queue_stall_threshold = TDB_DEFAULT_L0_QUEUE_STALL_THRESHOLD, + .tombstone_density_trigger = TDB_DEFAULT_TOMBSTONE_DENSITY_TRIGGER, + .tombstone_density_min_entries = TDB_DEFAULT_TOMBSTONE_DENSITY_MIN_ENTRIES, + .use_btree = 0, + .commit_hook_fn = NULL, + .commit_hook_ctx = NULL, + .object_target_file_size = 0, /* reserved, not used */ + .object_lazy_compaction = 0, + .object_prefetch_compaction = 1}; +} + +tidesdb_config_t tidesdb_default_config(void) +{ + return 
(tidesdb_config_t){.db_path = "./tidesdb", + .log_level = TDB_LOG_INFO, + .num_flush_threads = TDB_DEFAULT_FLUSH_THREAD_POOL_SIZE, + .num_compaction_threads = TDB_DEFAULT_COMPACTION_THREAD_POOL_SIZE, + .block_cache_size = TDB_DEFAULT_BLOCK_CACHE_SIZE, + .max_open_sstables = TDB_DEFAULT_MAX_OPEN_SSTABLES, + .log_to_file = 0, + .log_truncation_at = TDB_DEFAULT_LOG_FILE_TRUNCATION, + .max_memory_usage = 0, + .unified_memtable = 0, + .unified_memtable_write_buffer_size = 0, + .unified_memtable_skip_list_max_level = 0, + .unified_memtable_skip_list_probability = 0, + .unified_memtable_sync_mode = 0, + .unified_memtable_sync_interval_us = 0, + .object_store = NULL, + .object_store_config = NULL, + .max_concurrent_flushes = TDB_DEFAULT_MAX_CONCURRENT_FLUSHES}; +} + +/** + * create a new KV pair + * @param key key + * @param key_size key size + * @param value value + * @param value_size value size + * @param ttl time to live + * @param seq sequence number + * @param tombstone_flags bitmask of tombstone-related kv flags to set on the + * entry (TDB_KV_FLAG_TOMBSTONE, TDB_KV_FLAG_SINGLE_DELETE). + * bits outside that mask are ignored. passing 0 or 1 + * continues to behave as the previous bool-like argument. + * @return new KV pair + */ +static tidesdb_kv_pair_t *tidesdb_kv_pair_create(const uint8_t *key, const size_t key_size, + const uint8_t *value, const size_t value_size, + const time_t ttl, const uint64_t seq, + const uint8_t tombstone_flags) +{ + /* arena allocation -- single malloc for struct + key + value + * [tidesdb_kv_pair_t][key_data][value_data] + * this reduces malloc calls from 3 to 1, improves cache locality! */ + const size_t value_alloc = (value_size > 0 && value) ? 
value_size : 0; + const size_t arena_size = sizeof(tidesdb_kv_pair_t) + key_size + value_alloc; + + uint8_t *arena = malloc(arena_size); + if (!arena) return NULL; + + tidesdb_kv_pair_t *kv = (tidesdb_kv_pair_t *)arena; + + kv->entry.flags = + (tombstone_flags & (TDB_KV_FLAG_TOMBSTONE | TDB_KV_FLAG_SINGLE_DELETE)) | TDB_KV_FLAG_ARENA; + kv->entry.key_size = (uint32_t)key_size; + kv->entry.value_size = (uint32_t)value_size; + kv->entry.ttl = ttl; + kv->entry.seq = seq; + kv->entry.vlog_offset = 0; + kv->value = NULL; + + /* key immediately follows struct */ + kv->key = arena + sizeof(tidesdb_kv_pair_t); + memcpy(kv->key, key, key_size); + + /* value follows key */ + if (value_alloc > 0) + { + kv->value = kv->key + key_size; + memcpy(kv->value, value, value_size); + } + + return kv; +} + +/** + * tidesdb_kv_pair_free + * free a KV pair + * @param kv KV pair to free + */ +static void tidesdb_kv_pair_free(tidesdb_kv_pair_t *kv) +{ + if (!kv) return; + + /* borrowed kv pairs point into block data -- nothing to free */ + if (kv->entry.flags & TDB_KV_FLAG_BORROWED) return; + + /* pop buffer kv pairs live in reusable heap arena -- nothing to free */ + if (kv->entry.flags & TDB_KV_FLAG_POP_BUF) return; + + /* arena-allocated KV pairs use single allocation for struct + key + value + * however, value may be loaded separately (e.g., from vlog) after creation + * [struct][key_data][value_data_if_included] + * if value was included in arena, it points to exactly kv->key + key_size */ + if (kv->entry.flags & TDB_KV_FLAG_ARENA) + { + if (kv->value != NULL) + { + /* value is in arena only if it points to key + key_size + * otherwise it was allocated separately and must be freed */ + const uint8_t *expected_arena_value = kv->key + kv->entry.key_size; + if (kv->value != expected_arena_value) + { + free(kv->value); /* value was allocated separately */ + } + } + + free(kv); /* single free for arena (struct + key + maybe value) */ + return; + } + + free(kv->key); + free(kv->value); + 
free(kv); +} + +/** + * tidesdb_kv_arena_alloc + * bump-allocate size bytes (8-byte aligned) from the arena + * @param a the arena + * @param size number of bytes + * @return pointer to the allocation, NULL on out of memory + */ +static uint8_t *tidesdb_kv_arena_alloc(tidesdb_kv_arena_t *a, size_t size) +{ + const size_t need = (size + (TDB_KLOG_ARENA_ALIGN - 1)) & ~(size_t)(TDB_KLOG_ARENA_ALIGN - 1); + + /* current chunk has room */ + if (a->count > 0 && a->off + need <= a->sizes[a->cur]) + { + uint8_t *p = a->chunks[a->cur] + a->off; + a->off += need; + return p; + } + + /* reuse the next already-allocated chunk if it fits (common after a reset) */ + if (a->cur + 1 < a->count && need <= a->sizes[a->cur + 1]) + { + a->cur++; + a->off = need; + return a->chunks[a->cur]; + } + + /* grow -- append a new chunk. existing chunks are never moved so live pointers hold */ + if (a->count == a->cap) + { + const int nc = a->cap ? a->cap * 2 : TDB_KLOG_ARENA_INIT_CHUNKS; + uint8_t **nch = realloc(a->chunks, (size_t)nc * sizeof(uint8_t *)); + if (!nch) return NULL; + a->chunks = nch; + size_t *nsz = realloc(a->sizes, (size_t)nc * sizeof(size_t)); + if (!nsz) return NULL; + a->sizes = nsz; + a->cap = nc; + } + + const size_t csz = need > TDB_KLOG_ARENA_CHUNK ? 
need : TDB_KLOG_ARENA_CHUNK; + uint8_t *chunk = malloc(csz); + if (!chunk) return NULL; + a->chunks[a->count] = chunk; + a->sizes[a->count] = csz; + a->cur = a->count; + a->count++; + a->off = need; + return chunk; +} + +/** + * tidesdb_kv_arena_reset + * rewinds the arena for reuse on the next block, keeping chunks allocated + * @param a the arena + */ +static void tidesdb_kv_arena_reset(tidesdb_kv_arena_t *a) +{ + a->cur = 0; + a->off = 0; +} + +/** + * tidesdb_kv_arena_destroy + * frees all chunks and bookkeeping arrays + * @param a the arena + */ +static void tidesdb_kv_arena_destroy(tidesdb_kv_arena_t *a) +{ + for (int i = 0; i < a->count; i++) free(a->chunks[i]); + free(a->chunks); + free(a->sizes); + a->chunks = NULL; + a->sizes = NULL; + a->count = a->cap = a->cur = 0; + a->off = 0; +} + +/** + * tidesdb_klog_block_create + * create a new klog block + * @return new klog block + */ +static tidesdb_klog_block_t *tidesdb_klog_block_create(void) +{ + tidesdb_klog_block_t *block = calloc(1, sizeof(tidesdb_klog_block_t)); + if (!block) return NULL; + + /* we pre-allocate for expected entries per block + * with 64KB blocks and ~116 byte entries, expect ~560 entries + * we pre-allocate to avoid realloc in common case */ + const uint32_t initial_capacity = TDB_KLOG_BLOCK_INITIAL_CAPACITY; + + block->entries = malloc(initial_capacity * sizeof(tidesdb_klog_entry_t)); + block->keys = malloc(initial_capacity * sizeof(uint8_t *)); + block->inline_values = malloc(initial_capacity * sizeof(uint8_t *)); + block->capacity = initial_capacity; /* track allocated capacity */ + + if (!block->entries || !block->keys || !block->inline_values) + { + free(block->entries); + free(block->keys); + free(block->inline_values); + free(block); + return NULL; + } + + /* we init pointers to NULL for safety */ + memset(block->keys, 0, initial_capacity * sizeof(uint8_t *)); + memset(block->inline_values, 0, initial_capacity * sizeof(uint8_t *)); + + /* mark as not arena-allocated (separate 
mallocs) */ + block->is_arena_allocated = 0; + + return block; +} + +/** + * tidesdb_klog_block_free + * free a klog block + * @param block klog block to free + */ +static void tidesdb_klog_block_free(tidesdb_klog_block_t *block) +{ + if (!block) return; + + if (block->is_arena_allocated) + { + /* with arena allocation everything is in one contiguous block + * except max_key which is allocated separately during deserialization. + * for zero-copy blocks, also free the owned data buffer if present. */ + free(block->max_key); + if (block->is_zero_copy && block->data_ref) + { + free(block->data_ref); + } + free(block); + } + else + { + /* per-entry key/value copies live in the bump arena -- released in one shot */ + tidesdb_kv_arena_destroy(&block->kv_arena); + free(block->entries); + free(block->keys); + free(block->inline_values); + free(block->max_key); + free(block); + } +} + +/** + * tidesdb_klog_block_reset + * rewinds a klog block for reuse as the next block in a flush or merge -- clears the + * entry count and bump arena while keeping the arrays and chunks allocated, avoiding a + * free/create cycle per block + * @param block klog block to reset + */ +static void tidesdb_klog_block_reset(tidesdb_klog_block_t *block) +{ + if (!block) return; + tidesdb_kv_arena_reset(&block->kv_arena); + block->num_entries = 0; + block->block_size = 0; +} + +/** + * tidesdb_klog_block_add_entry + * add an entry to a klog block + * @param block klog block to add entry to + * @param kv KV pair to add + * @param config column family config + * @param comparator_fn pre-resolved comparator function (avoids repeated lookups) + * @param comparator_ctx pre-resolved comparator context + * @return 0 on success, -1 on error + */ +static int tidesdb_klog_block_add_entry(tidesdb_klog_block_t *block, const tidesdb_kv_pair_t *kv, + const tidesdb_column_family_config_t *config, + skip_list_comparator_fn comparator_fn, void *comparator_ctx) +{ + if (!block || !kv || !config || !comparator_fn) 
return -1; + + const int inline_value = (kv->entry.value_size < config->klog_value_threshold); + + /** we calculate actual entry size to match serialization + * we must use actual varint sizes, not max sizes, so block_size is accurate + */ + size_t entry_size = 1; /* flags */ + + /* we calculate actual varint sizes for key_size, value_size, seq */ + uint8_t temp_buf[TDB_VARINT_MAX_BYTES]; + entry_size += encode_varint(temp_buf, kv->entry.key_size); + entry_size += encode_varint(temp_buf, kv->entry.value_size); + entry_size += encode_varint(temp_buf, kv->entry.seq); + + if (kv->entry.ttl != 0) entry_size += 8; + if (kv->entry.vlog_offset != 0) + { + entry_size += encode_varint(temp_buf, kv->entry.vlog_offset); + } + + entry_size += kv->entry.key_size; + if (inline_value) + { + entry_size += kv->entry.value_size; + } + + const uint32_t new_count = block->num_entries + 1; + + if (new_count > block->capacity) + { + const uint32_t old_capacity = block->capacity; + const uint32_t new_capacity = old_capacity * 2; + + tidesdb_klog_entry_t *new_entries = + realloc(block->entries, new_capacity * sizeof(tidesdb_klog_entry_t)); + if (!new_entries) return TDB_ERR_MEMORY; + block->entries = new_entries; + + uint8_t **new_keys = realloc(block->keys, new_capacity * sizeof(uint8_t *)); + if (!new_keys) return TDB_ERR_MEMORY; + block->keys = new_keys; + + uint8_t **new_inline_values = + realloc(block->inline_values, new_capacity * sizeof(uint8_t *)); + if (!new_inline_values) return TDB_ERR_MEMORY; + block->inline_values = new_inline_values; + + const size_t new_elements = new_capacity - old_capacity; + memset(block->keys + old_capacity, 0, new_elements * sizeof(uint8_t *)); + memset(block->inline_values + old_capacity, 0, new_elements * sizeof(uint8_t *)); + + block->capacity = new_capacity; + } + + memcpy(&block->entries[block->num_entries], &kv->entry, sizeof(tidesdb_klog_entry_t)); + + block->keys[block->num_entries] = tidesdb_kv_arena_alloc(&block->kv_arena, 
kv->entry.key_size); + if (!block->keys[block->num_entries]) return TDB_ERR_MEMORY; + memcpy(block->keys[block->num_entries], kv->key, kv->entry.key_size); + + if (inline_value && kv->entry.value_size > 0) + { + block->inline_values[block->num_entries] = + tidesdb_kv_arena_alloc(&block->kv_arena, kv->entry.value_size); + if (!block->inline_values[block->num_entries]) return TDB_ERR_MEMORY; + memcpy(block->inline_values[block->num_entries], kv->value, kv->entry.value_size); + block->entries[block->num_entries].vlog_offset = 0; + } + else + { + block->inline_values[block->num_entries] = NULL; + } + + block->num_entries++; + block->block_size += (uint32_t)entry_size; + + /* we update max_key for seek using pre-resolved comparator */ + if (block->num_entries == 1 || comparator_fn(kv->key, kv->entry.key_size, block->max_key, + block->max_key_size, comparator_ctx) > 0) + { + if (kv->entry.key_size != block->max_key_size) + { + free(block->max_key); + block->max_key = malloc(kv->entry.key_size); + if (!block->max_key) + { + block->max_key_size = 0; + return TDB_ERR_MEMORY; + } + block->max_key_size = kv->entry.key_size; + } + memcpy(block->max_key, kv->key, kv->entry.key_size); + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_klog_block_is_full + * check if a klog block is full + * @param block klog block to check + * @param max_size maximum size of block + * @return 1 if block is full, 0 otherwise + * + * we use 2x max_size threshold because blocks are compressed before writing. + * ZSTD typically achieves 2-4x compression on structured data, so filling to 2x + * the target size ensures blocks are well-utilized after compression. + * + * 64KB target -> fill to 128KB uncompressed -> compresses to ~40-60KB + * this maximizes block density while staying under the target after compression. 
+ */ +static int tidesdb_klog_block_is_full(const tidesdb_klog_block_t *block, const size_t max_size) +{ + if (!block || !max_size) return -1; + + return block->block_size >= (max_size * 2); +} + +/** + * tidesdb_klog_block_serialize + * @param block klog block to serialize + * @param out output buffer + * @param out_size output buffer size + * @return 0 on success, -1 on error + */ +static int tidesdb_klog_block_serialize(tidesdb_klog_block_t *block, uint8_t **out, + size_t *out_size) +{ + if (!block || !out || !out_size) return TDB_ERR_INVALID_ARGS; + + size_t estimated_size = 8; /* header -- num_entries + block_size */ + for (uint32_t i = 0; i < block->num_entries; i++) + { + /* flags(1) + key_size + value_size + seq, each a worst-case varint */ + estimated_size += 1 + TDB_VARINT_MAX_BYTES * 3; + + if (block->entries[i].ttl != 0) + { + estimated_size += sizeof(int64_t); + } + + if (block->entries[i].vlog_offset != 0) + { + estimated_size += TDB_VARINT_MAX_BYTES; + } + + /* key data */ + estimated_size += block->entries[i].key_size; + + /* inline value data only if not in vlog */ + if (block->entries[i].vlog_offset == 0) + { + estimated_size += block->entries[i].value_size; + } + } + + *out = malloc(estimated_size); + if (!*out) return TDB_ERR_MEMORY; + + uint8_t *ptr = *out; + const uint8_t *start = ptr; + + encode_uint32_le_compat(ptr, block->num_entries); + ptr += sizeof(uint32_t); + encode_uint32_le_compat(ptr, block->block_size); + ptr += sizeof(uint32_t); + + uint64_t prev_seq = 0; + + for (uint32_t i = 0; i < block->num_entries; i++) + { + const tidesdb_klog_entry_t *entry = &block->entries[i]; + uint8_t flags = entry->flags; + + uint64_t seq_value = entry->seq; + if (i > 0 && entry->seq > prev_seq && (entry->seq - prev_seq) < TDB_KLOG_DELTA_SEQ_MAX_DIFF) + { + flags |= TDB_KV_FLAG_DELTA_SEQ; + seq_value = entry->seq - prev_seq; + } + + if (entry->ttl != 0) flags |= TDB_KV_FLAG_HAS_TTL; + if (entry->vlog_offset != 0) flags |= TDB_KV_FLAG_HAS_VLOG; + + /* 
strip the in-memory-only bits (ARENA/BORROWED/POP_BUF) so they never reach disk -- + * kv_pair_create sets ARENA on every compaction-written kv. the HAS_TTL/HAS_VLOG checks + * below still see their bits since those are persistent. */ + flags &= TDB_KV_FLAG_PERSISTENT_MASK; + *ptr++ = flags; + + ptr += encode_varint(ptr, entry->key_size); + ptr += encode_varint(ptr, entry->value_size); + + ptr += encode_varint(ptr, seq_value); + + if (flags & TDB_KV_FLAG_HAS_TTL) + { + encode_int64_le_compat(ptr, entry->ttl); + ptr += sizeof(int64_t); + } + + if (flags & TDB_KV_FLAG_HAS_VLOG) + { + ptr += encode_varint(ptr, entry->vlog_offset); + } + + memcpy(ptr, block->keys[i], entry->key_size); + ptr += entry->key_size; + + if (!(flags & TDB_KV_FLAG_HAS_VLOG) && block->inline_values[i]) + { + memcpy(ptr, block->inline_values[i], entry->value_size); + ptr += entry->value_size; + } + + prev_seq = entry->seq; + } + + *out_size = ptr - start; + + if (*out_size > estimated_size) + { + TDB_DEBUG_LOG(TDB_LOG_FATAL, + "klog serialization buffer overrun! wrote %zu bytes, allocated %zu bytes", + *out_size, estimated_size); + free(*out); + *out = NULL; + return TDB_ERR_CORRUPTION; + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_build_indexed_block_data + * builds a key offset index with pre-computed absolute sequence numbers + * and prepends it to decompressed block data. + * the indexed format allows search_raw to skip both the O(N) linear scan + * and the O(found) delta-seq reconstruction loop on cache hits. 
+ * + * [magic_v2:u32][header_size:u32][num_entries:u32] + * [entry × (entry_off:u32, key_off:u32, key_size:u32, abs_seq_lo:u32, abs_seq_hi:u32)] + * [original decompressed block data] + * + * + * @param data decompressed block data + * @param data_size size of data + * @param out_indexed output -- allocated indexed buffer (caller must free) + * @param out_indexed_size output -- size of indexed buffer + * @return 0 on success, -1 on failure + */ +static int tidesdb_build_indexed_block_data(const uint8_t *data, const size_t data_size, + uint8_t **out_indexed, size_t *out_indexed_size) +{ + if (!data || data_size < sizeof(uint32_t) * 2 || !out_indexed || !out_indexed_size) return -1; + + const uint32_t num_entries = decode_uint32_le_compat(data); + if (num_entries == 0 || num_entries > data_size / 4) return -1; + + const size_t entry_size = TDB_BLOCK_INDEX_ENTRY_STRIDE; + + /* temporary arrays -- stack for small, heap for large */ + typedef struct + { + uint32_t entry_off, key_off, key_sz; + uint64_t abs_seq; + } idx_entry_t; + idx_entry_t stack_entries[TDB_KLOG_BLOCK_STACK_ENTRIES]; + idx_entry_t *entries = (num_entries <= TDB_KLOG_BLOCK_STACK_ENTRIES) + ? 
stack_entries + : malloc(num_entries * sizeof(idx_entry_t)); + if (!entries) return -1; + + const uint8_t *ptr = data + sizeof(uint32_t) * 2; /* skip num_entries + block_size */ + size_t remaining = data_size - sizeof(uint32_t) * 2; + uint32_t valid = 0; + uint64_t abs_seq = 0; /* running absolute sequence for delta-seq reconstruction */ + + for (uint32_t i = 0; i < num_entries; i++) + { + if (remaining < 1) break; + entries[i].entry_off = (uint32_t)(ptr - data); + + uint8_t flags = *ptr++; + remaining--; + + uint64_t ks, vs, seq_val; + int br = decode_varint(ptr, &ks, (int)remaining); + if (br < 0) break; + ptr += br; + remaining -= br; + + br = decode_varint(ptr, &vs, (int)remaining); + if (br < 0) break; + ptr += br; + remaining -= br; + + br = decode_varint(ptr, &seq_val, (int)remaining); + if (br < 0) break; + ptr += br; + remaining -= br; + + /* we compute absolute seq (resolve delta-seq once during index build) */ + if (flags & TDB_KV_FLAG_DELTA_SEQ) + abs_seq += seq_val; + else + abs_seq = seq_val; + entries[i].abs_seq = abs_seq; + + if (flags & TDB_KV_FLAG_HAS_TTL) + { + if (remaining < sizeof(int64_t)) break; + ptr += sizeof(int64_t); + remaining -= sizeof(int64_t); + } + if (flags & TDB_KV_FLAG_HAS_VLOG) + { + br = decode_varint(ptr, &seq_val, (int)remaining); + if (br < 0) break; + ptr += br; + remaining -= br; + } + + if (remaining < ks) break; + entries[i].key_off = (uint32_t)(ptr - data); + entries[i].key_sz = (uint32_t)ks; + ptr += ks; + remaining -= (size_t)ks; + + if (!(flags & TDB_KV_FLAG_HAS_VLOG) && vs > 0) + { + if (remaining < vs) break; + ptr += vs; + remaining -= (size_t)vs; + } + valid = i + 1; + } + + if (valid == 0) + { + if (entries != stack_entries) free(entries); + return -1; + } + + /* we build indexed buffer = [header][original data] */ + const size_t actual_header = TDB_BLOCK_INDEX_HDR_BASE + valid * entry_size; + const size_t total_size = actual_header + data_size; + + uint8_t *buf = malloc(total_size); + if (!buf) + { + if 
(entries != stack_entries) free(entries); + return -1; + } + + /* we write header */ + uint8_t *wp = buf; + encode_uint32_le_compat(wp, TDB_BLOCK_INDEX_MAGIC); + wp += 4; + encode_uint32_le_compat(wp, (uint32_t)actual_header); + wp += 4; + encode_uint32_le_compat(wp, valid); + wp += 4; + + for (uint32_t i = 0; i < valid; i++) + { + encode_uint32_le_compat(wp, entries[i].entry_off); + wp += 4; + encode_uint32_le_compat(wp, entries[i].key_off); + wp += 4; + encode_uint32_le_compat(wp, entries[i].key_sz); + wp += 4; + encode_uint32_le_compat(wp, (uint32_t)entries[i].abs_seq); + wp += 4; + encode_uint32_le_compat(wp, (uint32_t)(entries[i].abs_seq >> TDB_U64_HI_LO_SHIFT)); + wp += 4; + } + + /* we copy original data after header */ + memcpy(wp, data, data_size); + + if (entries != stack_entries) free(entries); + + *out_indexed = buf; + *out_indexed_size = total_size; + return 0; +} + +/** + * tidesdb_klog_block_search_raw + * + * @param data serialized (decompressed) klog block bytes + * @param data_size size of data + * @param search_key the key to find + * @param search_key_size size of search key + * @param seq_ceiling highest sequence number to consider (UINT64_MAX = newest). 
+ * a key may appear several times in one block when a flush + * or compaction retains a version chain; the run is scanned + * and the highest seq at or below the ceiling is returned + * @param comparator_fn comparator function + * @param comparator_ctx comparator context + * @param out_entry output -- entry metadata (flags, key_size, value_size, seq, ttl, vlog_offset) + * @param out_key output -- pointer into data buffer for the found key (do not free) + * @param out_value output -- pointer into data buffer for inline value (do not free), NULL if vlog + * @return 0 if found, -1 if not found, -2 on corruption + */ +static int tidesdb_klog_block_search_raw(const uint8_t *data, const size_t data_size, + const uint8_t *search_key, const size_t search_key_size, + const uint64_t seq_ceiling, + skip_list_comparator_fn comparator_fn, + void *comparator_ctx, tidesdb_klog_entry_t *out_entry, + const uint8_t **out_key, const uint8_t **out_value) +{ + if (!data || data_size < sizeof(uint32_t) * 2 || !search_key || !out_entry) return -2; + + const uint32_t maybe_magic = decode_uint32_le_compat(data); + if (maybe_magic == TDB_BLOCK_INDEX_MAGIC) + { + const uint32_t hdr_size = decode_uint32_le_compat(data + 4); + const uint32_t idx_count = decode_uint32_le_compat(data + 8); + + if (hdr_size > data_size || idx_count == 0) return -2; + + const uint8_t *idx_base = data + TDB_BLOCK_INDEX_HDR_BASE; + const uint8_t *block_data = data + hdr_size; + const size_t block_data_size = data_size - hdr_size; + + /* binary search using pre-built index -- O(log N) with zero scanning */ + int32_t left = 0, right = (int32_t)idx_count - 1, found = -1; + while (left <= right) + { + const int32_t mid = left + (right - left) / 2; + const uint8_t *ie = idx_base + mid * TDB_BLOCK_INDEX_ENTRY_STRIDE; + const uint32_t k_off = decode_uint32_le_compat(ie + TDB_BLOCK_IDX_KEY_OFF); + const uint32_t k_sz = decode_uint32_le_compat(ie + TDB_BLOCK_IDX_KEY_SIZE); + const int cmp = comparator_fn(search_key, 
search_key_size, block_data + k_off, k_sz, + comparator_ctx); + if (cmp == 0) + { + found = mid; + break; + } + if (cmp < 0) + right = mid - 1; + else + left = mid + 1; + } + + if (found < 0) return -1; + + /* a key may have several versions in this block when a flush or + * compaction retained a version chain. they sit in a contiguous run -- + * scan it and keep the highest seq at or below seq_ceiling, the version + * visible at the caller's snapshot. the abs_seq is precomputed in each + * index entry, so the scan needs no entry decoding. */ + { + int32_t run_lo = found; + int32_t run_hi = found; + while (run_lo > 0) + { + const uint8_t *pe = idx_base + (run_lo - 1) * TDB_BLOCK_INDEX_ENTRY_STRIDE; + const uint32_t pk_off = decode_uint32_le_compat(pe + TDB_BLOCK_IDX_KEY_OFF); + const uint32_t pk_sz = decode_uint32_le_compat(pe + TDB_BLOCK_IDX_KEY_SIZE); + if (comparator_fn(search_key, search_key_size, block_data + pk_off, pk_sz, + comparator_ctx) != 0) + break; + run_lo--; + } + while (run_hi + 1 < (int32_t)idx_count) + { + const uint8_t *ne = idx_base + (run_hi + 1) * TDB_BLOCK_INDEX_ENTRY_STRIDE; + const uint32_t nk_off = decode_uint32_le_compat(ne + TDB_BLOCK_IDX_KEY_OFF); + const uint32_t nk_sz = decode_uint32_le_compat(ne + TDB_BLOCK_IDX_KEY_SIZE); + if (comparator_fn(search_key, search_key_size, block_data + nk_off, nk_sz, + comparator_ctx) != 0) + break; + run_hi++; + } + + int32_t best = -1; + uint64_t best_seq = 0; + for (int32_t i = run_lo; i <= run_hi; i++) + { + const uint8_t *re = idx_base + i * TDB_BLOCK_INDEX_ENTRY_STRIDE; + const uint32_t s_lo = decode_uint32_le_compat(re + TDB_BLOCK_IDX_SEQ_LO); + const uint32_t s_hi = decode_uint32_le_compat(re + TDB_BLOCK_IDX_SEQ_HI); + const uint64_t e_seq = ((uint64_t)s_hi << TDB_U64_HI_LO_SHIFT) | s_lo; + if (e_seq > seq_ceiling) continue; + if (best < 0 || e_seq > best_seq) + { + best = i; + best_seq = e_seq; + } + } + if (best < 0) return -1; + found = best; + } + + /* we extract matched entry metadata */ + 
const uint8_t *fie = idx_base + found * TDB_BLOCK_INDEX_ENTRY_STRIDE; + const uint32_t e_off = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_ENTRY_OFF); + const uint32_t k_off = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_KEY_OFF); + const uint32_t k_sz = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_KEY_SIZE); + + const uint8_t *eptr = block_data + e_off; + size_t erem = block_data_size - e_off; + if (erem < 1) return -2; + + uint8_t flags = *eptr++; + erem--; + out_entry->flags = flags & ~(TDB_KV_FLAG_DELTA_SEQ | TDB_KV_FLAG_TRANSIENT_MASK); + + uint64_t ks, vs; + int br = decode_varint(eptr, &ks, (int)erem); + eptr += br; + erem -= br; + out_entry->key_size = (uint32_t)ks; + br = decode_varint(eptr, &vs, (int)erem); + eptr += br; + erem -= br; + out_entry->value_size = (uint32_t)vs; + + /* we read pre-computed abs_seq directly from index -- O(1) */ + const uint32_t seq_lo = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_SEQ_LO); + const uint32_t seq_hi = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_SEQ_HI); + out_entry->seq = ((uint64_t)seq_hi << TDB_U64_HI_LO_SHIFT) | seq_lo; + /* we skip past seq varint in entry data (need to advance eptr for TTL/vlog) */ + uint64_t seq_dummy; + br = decode_varint(eptr, &seq_dummy, (int)erem); + eptr += br; + erem -= br; + + out_entry->ttl = 0; + if (flags & TDB_KV_FLAG_HAS_TTL) + { + out_entry->ttl = decode_int64_le_compat(eptr); + eptr += sizeof(int64_t); + erem -= sizeof(int64_t); + } + + out_entry->vlog_offset = 0; + if (flags & TDB_KV_FLAG_HAS_VLOG) + { + uint64_t vo; + br = decode_varint(eptr, &vo, (int)erem); + eptr += br; + erem -= br; + out_entry->vlog_offset = vo; + } + + *out_key = block_data + k_off; + if (out_value) + { + *out_value = + (!(flags & TDB_KV_FLAG_HAS_VLOG) && vs > 0) ? block_data + k_off + k_sz : NULL; + } + return 0; + } + + /** raw block data from disk (cache miss). + * build offset index via linear scan, then binary search. 
*/ + const uint8_t *ptr = data; + const uint32_t num_entries = decode_uint32_le_compat(ptr); + ptr += sizeof(uint32_t); + ptr += sizeof(uint32_t); /* skip block_size */ + + if (num_entries == 0) return -1; + if (num_entries > data_size / 4) return -2; + + typedef struct + { + uint32_t key_offset; + uint32_t key_size; + } key_index_entry_t; + + key_index_entry_t stack_index[TDB_KLOG_BLOCK_STACK_ENTRIES]; + key_index_entry_t *index = (num_entries <= TDB_KLOG_BLOCK_STACK_ENTRIES) + ? stack_index + : malloc(num_entries * sizeof(key_index_entry_t)); + if (!index) return -2; + + uint32_t *entry_offsets = NULL; + if (num_entries <= TDB_KLOG_BLOCK_STACK_ENTRIES) + { + static THREAD_LOCAL uint32_t tls_offsets[TDB_KLOG_BLOCK_STACK_ENTRIES]; + entry_offsets = tls_offsets; + } + else + { + entry_offsets = malloc(num_entries * sizeof(uint32_t)); + if (!entry_offsets) + { + if (index != stack_index) free(index); + return -2; + } + } + + size_t remaining = data_size - (size_t)(ptr - data); + uint32_t valid_entries = 0; + + for (uint32_t i = 0; i < num_entries; i++) + { + if (remaining < 1) break; + + entry_offsets[i] = (uint32_t)(ptr - data); + + uint8_t flags = *ptr++; + remaining--; + + uint64_t key_size_u64; + int bytes_read = decode_varint(ptr, &key_size_u64, (int)remaining); + if (bytes_read < 0) break; + ptr += bytes_read; + remaining -= bytes_read; + + uint64_t value_size_u64; + bytes_read = decode_varint(ptr, &value_size_u64, (int)remaining); + if (bytes_read < 0) break; + ptr += bytes_read; + remaining -= bytes_read; + + uint64_t seq_dummy; + bytes_read = decode_varint(ptr, &seq_dummy, (int)remaining); + if (bytes_read < 0) break; + ptr += bytes_read; + remaining -= bytes_read; + + if (flags & TDB_KV_FLAG_HAS_TTL) + { + if (remaining < sizeof(int64_t)) break; + ptr += sizeof(int64_t); + remaining -= sizeof(int64_t); + } + + if (flags & TDB_KV_FLAG_HAS_VLOG) + { + uint64_t vlog_dummy; + bytes_read = decode_varint(ptr, &vlog_dummy, (int)remaining); + if (bytes_read < 0) 
break; + ptr += bytes_read; + remaining -= bytes_read; + } + + if (remaining < key_size_u64) break; + index[i].key_offset = (uint32_t)(ptr - data); + index[i].key_size = (uint32_t)key_size_u64; + ptr += key_size_u64; + remaining -= (size_t)key_size_u64; + + if (!(flags & TDB_KV_FLAG_HAS_VLOG) && value_size_u64 > 0) + { + if (remaining < value_size_u64) break; + ptr += value_size_u64; + remaining -= (size_t)value_size_u64; + } + + valid_entries = i + 1; + } + + if (valid_entries == 0) + { + if (index != stack_index) free(index); + if (num_entries > TDB_KLOG_BLOCK_STACK_ENTRIES) free(entry_offsets); + return -1; + } + + /* binary search using in-place key comparisons */ + int32_t left = 0; + int32_t right = (int32_t)valid_entries - 1; + int32_t found = -1; + + while (left <= right) + { + const int32_t mid = left + (right - left) / 2; + const uint8_t *mid_key = data + index[mid].key_offset; + const int cmp = comparator_fn(search_key, search_key_size, mid_key, index[mid].key_size, + comparator_ctx); + + if (cmp == 0) + { + found = mid; + break; + } + if (cmp < 0) + right = mid - 1; + else + left = mid + 1; + } + + if (found < 0) + { + if (index != stack_index) free(index); + if (num_entries > TDB_KLOG_BLOCK_STACK_ENTRIES) free(entry_offsets); + return -1; + } + + /* a key may have several versions in this block when a flush or compaction + * retained a version chain. they sit in a contiguous run -- scan it and + * keep the highest seq at or below seq_ceiling. delta-seq entries chain + * from the nearest preceding absolute, so we sum abs_seq forward from + * entry 0 in one pass and consider the members inside the run. 
*/ + { + int32_t run_lo = found; + int32_t run_hi = found; + while (run_lo > 0 && + comparator_fn(search_key, search_key_size, data + index[run_lo - 1].key_offset, + index[run_lo - 1].key_size, comparator_ctx) == 0) + run_lo--; + while (run_hi + 1 < (int32_t)valid_entries && + comparator_fn(search_key, search_key_size, data + index[run_hi + 1].key_offset, + index[run_hi + 1].key_size, comparator_ctx) == 0) + run_hi++; + + int32_t best = -1; + uint64_t best_seq = 0; + uint64_t abs_seq = 0; + for (int32_t j = 0; j <= run_hi; j++) + { + const uint8_t *sptr = data + entry_offsets[j]; + const uint8_t sf = *sptr++; + uint64_t dummy, sv; + sptr += decode_varint(sptr, &dummy, TDB_VARINT_MAX_BYTES); /* key_size */ + sptr += decode_varint(sptr, &dummy, TDB_VARINT_MAX_BYTES); /* value_size */ + sptr += decode_varint(sptr, &sv, TDB_VARINT_MAX_BYTES); /* seq */ + if (sf & TDB_KV_FLAG_DELTA_SEQ) + abs_seq += sv; + else + abs_seq = sv; + + if (j >= run_lo && abs_seq <= seq_ceiling && (best < 0 || abs_seq > best_seq)) + { + best = j; + best_seq = abs_seq; + } + } + + if (best < 0) + { + if (index != stack_index) free(index); + if (num_entries > TDB_KLOG_BLOCK_STACK_ENTRIES) free(entry_offsets); + return -1; + } + found = best; + } + + /* we re-parse the single matched entry to extract full metadata */ + const uint8_t *eptr = data + entry_offsets[found]; + size_t erem = data_size - entry_offsets[found]; + + uint8_t flags = *eptr++; + erem--; + out_entry->flags = flags & ~(TDB_KV_FLAG_DELTA_SEQ | TDB_KV_FLAG_TRANSIENT_MASK); + + uint64_t ks; + int br = decode_varint(eptr, &ks, (int)erem); + eptr += br; + erem -= br; + out_entry->key_size = (uint32_t)ks; + + uint64_t vs; + br = decode_varint(eptr, &vs, (int)erem); + eptr += br; + erem -= br; + out_entry->value_size = (uint32_t)vs; + + uint64_t seq_val; + br = decode_varint(eptr, &seq_val, (int)erem); + eptr += br; + erem -= br; + + if (flags & TDB_KV_FLAG_DELTA_SEQ) + { + uint64_t abs_seq = 0; + for (int32_t j = 0; j <= found; j++) + 
{ + const uint8_t *sptr = data + entry_offsets[j]; + const uint8_t sf = *sptr++; + uint64_t dummy; + sptr += decode_varint(sptr, &dummy, TDB_VARINT_MAX_BYTES); /* key_size */ + sptr += decode_varint(sptr, &dummy, TDB_VARINT_MAX_BYTES); /* value_size */ + uint64_t sv; + sptr += decode_varint(sptr, &sv, TDB_VARINT_MAX_BYTES); /* seq */ + if (sf & TDB_KV_FLAG_DELTA_SEQ) + abs_seq += sv; + else + abs_seq = sv; + } + out_entry->seq = abs_seq; + } + else + { + out_entry->seq = seq_val; + } + + if (flags & TDB_KV_FLAG_HAS_TTL) + { + out_entry->ttl = decode_int64_le_compat(eptr); + eptr += sizeof(int64_t); + erem -= sizeof(int64_t); + } + else + { + out_entry->ttl = 0; + } + + if (flags & TDB_KV_FLAG_HAS_VLOG) + { + uint64_t vlog_off; + br = decode_varint(eptr, &vlog_off, (int)erem); + eptr += br; + erem -= br; + out_entry->vlog_offset = vlog_off; + } + else + { + out_entry->vlog_offset = 0; + } + + /* key pointer -- points directly into the data buffer */ + *out_key = data + index[found].key_offset; + + /* value pointer -- points into data buffer for inline values */ + if (out_value) + { + if (!(flags & TDB_KV_FLAG_HAS_VLOG) && vs > 0) + { + *out_value = data + index[found].key_offset + index[found].key_size; + } + else + { + *out_value = NULL; + } + } + + if (index != stack_index) free(index); + if (num_entries > TDB_KLOG_BLOCK_STACK_ENTRIES) free(entry_offsets); + return 0; +} + +/** + * tidesdb_klog_block_seek_raw + * find the first entry with key >= target in raw (non-indexed) block data. + * builds a lightweight key-offset index via a single varint scan, then + * binary searches for the first-ge match. only the matched entry is parsed. + * this avoids the O(N) full deserialization that tidesdb_klog_block_deserialize performs. 
+ * @param data raw block data + * @param data_size raw block data size + * @param target_key the target key to seek to + * @param target_key_size the size of the target key + * @param comparator_fn comparator function + * @param comparator_ctx comparator context + * @param out_entry receives parsed entry metadata for the matched entry + * @param out_key receives pointer into data for the matched key + * @param out_value receives pointer into data for the matched inline value (or NULL) + * @param out_idx receives the matched entry index (for lazy state) + * @param out_num_entries receives total number of valid entries in the block + * @return 0 on success, -1 if target is past all entries, -2 on data error + */ +static int tidesdb_klog_block_seek_raw(const uint8_t *data, const size_t data_size, + const uint8_t *target_key, const size_t target_key_size, + skip_list_comparator_fn comparator_fn, void *comparator_ctx, + tidesdb_klog_entry_t *out_entry, const uint8_t **out_key, + const uint8_t **out_value, int *out_idx, + uint32_t *out_num_entries) +{ + if (!data || data_size < sizeof(uint32_t) * 2 || !target_key || !out_entry) return -2; + + /* indexed format fast path -- when the block has a pre-built key offset + * index (TDB_BLOCK_INDEX_MAGIC header), we skip the O(N) varint scan + * entirely and go straight to O(log N) binary search on the index. + * this is the common case for cache hits after the first seek. */ + const uint32_t maybe_magic = decode_uint32_le_compat(data); + if (maybe_magic == TDB_BLOCK_INDEX_MAGIC && data_size >= TDB_BLOCK_INDEX_HDR_BASE) + { + const uint32_t hdr_size = decode_uint32_le_compat(data + 4); + const uint32_t idx_count = decode_uint32_le_compat(data + 8); + + /* the index header + entry offsets are on-disk values. a malformed (but checksum + * valid) block must not drive an out-of-bounds read on this hot indexed path, so + * validate the header geometry once and each entry's key offset before use. 
*/ + if (idx_count == 0 || hdr_size >= data_size || hdr_size < TDB_BLOCK_INDEX_HDR_BASE) + return -1; + + const uint8_t *idx_base = data + TDB_BLOCK_INDEX_HDR_BASE; + const uint8_t *bdata = data + hdr_size; + const size_t bdata_size = data_size - hdr_size; + + /* the idx_count entries must fit in the header region [HDR_BASE, hdr_size) */ + if ((uint64_t)idx_count * TDB_BLOCK_INDEX_ENTRY_STRIDE > + (uint64_t)(hdr_size - TDB_BLOCK_INDEX_HDR_BASE)) + return -1; + + if (out_num_entries) *out_num_entries = idx_count; + + /* binary search for first entry where key >= target */ + int32_t left = 0, right = (int32_t)idx_count - 1, found = -1; + while (left <= right) + { + const int32_t mid = left + (right - left) / 2; + const uint8_t *ie = idx_base + mid * TDB_BLOCK_INDEX_ENTRY_STRIDE; + const uint32_t k_off = decode_uint32_le_compat(ie + TDB_BLOCK_IDX_KEY_OFF); + const uint32_t k_sz = decode_uint32_le_compat(ie + TDB_BLOCK_IDX_KEY_SIZE); + if (k_off > bdata_size || k_sz > bdata_size - k_off) return -1; + const int cmp = + comparator_fn(bdata + k_off, k_sz, target_key, target_key_size, comparator_ctx); + if (cmp >= 0) + { + found = mid; + right = mid - 1; + } + else + { + left = mid + 1; + } + } + + if (found < 0) return -1; + if (out_idx) *out_idx = found; + + /* we extract matched entry metadata from the index */ + const uint8_t *fie = idx_base + found * TDB_BLOCK_INDEX_ENTRY_STRIDE; + const uint32_t e_off = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_ENTRY_OFF); + const uint32_t mk_off = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_KEY_OFF); + const uint32_t mk_sz = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_KEY_SIZE); + const uint32_t sq_lo = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_SEQ_LO); + const uint32_t sq_hi = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_SEQ_HI); + + /* we parse flags, key_size, value_size from the entry data */ + if (e_off >= bdata_size) return -2; /* else bdata_size - e_off wraps below */ + const uint8_t *eptr = bdata + e_off; + size_t erem = 
bdata_size - e_off; + if (erem < 1) return -2; + + uint8_t flags = *eptr++; + erem--; + out_entry->flags = flags & ~(TDB_KV_FLAG_DELTA_SEQ | TDB_KV_FLAG_TRANSIENT_MASK); + + uint64_t ks, vs; + int br = decode_varint(eptr, &ks, (int)erem); + eptr += br; + erem -= br; + out_entry->key_size = (uint32_t)ks; + + br = decode_varint(eptr, &vs, (int)erem); + eptr += br; + erem -= br; + out_entry->value_size = (uint32_t)vs; + + out_entry->seq = ((uint64_t)sq_hi << TDB_U64_HI_LO_SHIFT) | sq_lo; + + /* we skip past seq varint to reach ttl/vlog */ + uint64_t seq_skip; + br = decode_varint(eptr, &seq_skip, (int)erem); + eptr += br; + erem -= br; + + out_entry->ttl = 0; + if (flags & TDB_KV_FLAG_HAS_TTL) + { + if (erem >= sizeof(int64_t)) + { + out_entry->ttl = decode_int64_le_compat(eptr); + eptr += sizeof(int64_t); + erem -= sizeof(int64_t); + } + } + + out_entry->vlog_offset = 0; + if (flags & TDB_KV_FLAG_HAS_VLOG) + { + uint64_t vlog_off; + br = decode_varint(eptr, &vlog_off, (int)erem); + out_entry->vlog_offset = vlog_off; + } + + /* validate the matched key/value offsets before forming pointers into the block */ + if (mk_off > bdata_size || mk_sz > bdata_size - mk_off) return -2; + const int inline_val = !(flags & TDB_KV_FLAG_HAS_VLOG) && vs > 0; + if (inline_val && vs > bdata_size - mk_off - mk_sz) return -2; + *out_key = bdata + mk_off; + if (out_value) + { + *out_value = inline_val ? 
bdata + mk_off + mk_sz : NULL; + } + return 0; + } + + /* raw block data -- build lightweight index via varint scan */ + const uint8_t *ptr = data; + const uint32_t num_entries = decode_uint32_le_compat(ptr); + ptr += sizeof(uint32_t); + ptr += sizeof(uint32_t); /* skip block_size */ + + if (num_entries == 0) return -1; + if (num_entries > data_size / 4) return -2; + + /* lightweight index -- only key offset and size per entry, plus + * entry start offsets for re-parsing the matched entry */ + typedef struct + { + uint32_t key_offset; + uint32_t key_size; + } key_index_entry_t; + + key_index_entry_t stack_index[TDB_KLOG_BLOCK_STACK_ENTRIES]; + key_index_entry_t *index = (num_entries <= TDB_KLOG_BLOCK_STACK_ENTRIES) + ? stack_index + : malloc(num_entries * sizeof(key_index_entry_t)); + if (!index) return -2; + + uint32_t stack_offsets[TDB_KLOG_BLOCK_STACK_ENTRIES]; + uint32_t *entry_offsets = (num_entries <= TDB_KLOG_BLOCK_STACK_ENTRIES) + ? stack_offsets + : malloc(num_entries * sizeof(uint32_t)); + if (!entry_offsets) + { + if (index != stack_index) free(index); + return -2; + } + + /* single varint scan to build key offset index */ + size_t remaining = data_size - (size_t)(ptr - data); + uint32_t valid_entries = 0; + + for (uint32_t i = 0; i < num_entries; i++) + { + if (remaining < 1) break; + + entry_offsets[i] = (uint32_t)(ptr - data); + + uint8_t flags = *ptr++; + remaining--; + + uint64_t key_size_u64; + int bytes_read = decode_varint(ptr, &key_size_u64, (int)remaining); + if (bytes_read < 0) break; + ptr += bytes_read; + remaining -= bytes_read; + + uint64_t value_size_u64; + bytes_read = decode_varint(ptr, &value_size_u64, (int)remaining); + if (bytes_read < 0) break; + ptr += bytes_read; + remaining -= bytes_read; + + uint64_t seq_dummy; + bytes_read = decode_varint(ptr, &seq_dummy, (int)remaining); + if (bytes_read < 0) break; + ptr += bytes_read; + remaining -= bytes_read; + + if (flags & TDB_KV_FLAG_HAS_TTL) + { + if (remaining < sizeof(int64_t)) 
break; + ptr += sizeof(int64_t); + remaining -= sizeof(int64_t); + } + + if (flags & TDB_KV_FLAG_HAS_VLOG) + { + uint64_t vlog_dummy; + bytes_read = decode_varint(ptr, &vlog_dummy, (int)remaining); + if (bytes_read < 0) break; + ptr += bytes_read; + remaining -= bytes_read; + } + + if (remaining < key_size_u64) break; + index[i].key_offset = (uint32_t)(ptr - data); + index[i].key_size = (uint32_t)key_size_u64; + ptr += key_size_u64; + remaining -= (size_t)key_size_u64; + + if (!(flags & TDB_KV_FLAG_HAS_VLOG) && value_size_u64 > 0) + { + if (remaining < value_size_u64) break; + ptr += value_size_u64; + remaining -= (size_t)value_size_u64; + } + + valid_entries = i + 1; + } + + if (out_num_entries) *out_num_entries = valid_entries; + + if (valid_entries == 0) + { + if (index != stack_index) free(index); + if (entry_offsets != stack_offsets) free(entry_offsets); + return -1; + } + + /* binary search for first entry where entry_key >= target_key */ + int32_t left = 0; + int32_t right = (int32_t)valid_entries - 1; + int32_t found = -1; + + while (left <= right) + { + const int32_t mid = left + (right - left) / 2; + const uint8_t *mid_key = data + index[mid].key_offset; + const int cmp = comparator_fn(mid_key, index[mid].key_size, target_key, target_key_size, + comparator_ctx); + if (cmp >= 0) + { + found = mid; + right = mid - 1; + } + else + { + left = mid + 1; + } + } + + if (found < 0) + { + /* target is past all entries in this block */ + if (index != stack_index) free(index); + if (entry_offsets != stack_offsets) free(entry_offsets); + return -1; + } + + if (out_idx) *out_idx = found; + + /* re-parse the single matched entry to extract full metadata */ + const uint8_t *eptr = data + entry_offsets[found]; + size_t erem = data_size - entry_offsets[found]; + + uint8_t flags = *eptr++; + erem--; + out_entry->flags = flags & ~(TDB_KV_FLAG_DELTA_SEQ | TDB_KV_FLAG_TRANSIENT_MASK); + + uint64_t ks; + int br = decode_varint(eptr, &ks, (int)erem); + eptr += br; + erem -= br; 
+ out_entry->key_size = (uint32_t)ks; + + uint64_t vs; + br = decode_varint(eptr, &vs, (int)erem); + eptr += br; + erem -= br; + out_entry->value_size = (uint32_t)vs; + + /* for the matched entry we need the absolute sequence number. + * if the entry uses delta-seq encoding, we must reconstruct it + * by scanning from entry 0 to found. this is only done for the + * single matched entry -- the scan is cheap since entry_offsets + * gives direct access to each entry's flags+seq bytes. */ + uint64_t abs_seq = 0; + for (int32_t si = 0; si <= found; si++) + { + const uint8_t *sp = data + entry_offsets[si]; + size_t sr = data_size - entry_offsets[si]; + uint8_t sf = *sp++; + sr--; + + uint64_t sk; + int sbr = decode_varint(sp, &sk, (int)sr); + sp += sbr; + sr -= sbr; + uint64_t sv; + sbr = decode_varint(sp, &sv, (int)sr); + sp += sbr; + sr -= sbr; + uint64_t seq_val; + sbr = decode_varint(sp, &seq_val, (int)sr); + + if (sf & TDB_KV_FLAG_DELTA_SEQ) + abs_seq += seq_val; + else + abs_seq = seq_val; + } + out_entry->seq = abs_seq; + + /* we skip past seq varint in the matched entry to reach ttl/vlog fields */ + uint64_t seq_skip; + br = decode_varint(eptr, &seq_skip, (int)erem); + eptr += br; + erem -= br; + + out_entry->ttl = 0; + if (flags & TDB_KV_FLAG_HAS_TTL) + { + if (erem >= sizeof(int64_t)) + { + out_entry->ttl = decode_int64_le_compat(eptr); + eptr += sizeof(int64_t); + erem -= sizeof(int64_t); + } + } + + out_entry->vlog_offset = 0; + if (flags & TDB_KV_FLAG_HAS_VLOG) + { + uint64_t vlog_off; + br = decode_varint(eptr, &vlog_off, (int)erem); + eptr += br; + erem -= br; + out_entry->vlog_offset = vlog_off; + } + + *out_key = data + index[found].key_offset; + + if (out_value) + { + if (!(flags & TDB_KV_FLAG_HAS_VLOG) && vs > 0) + { + *out_value = data + index[found].key_offset + index[found].key_size; + } + else + { + *out_value = NULL; + } + } + + if (index != stack_index) free(index); + if (entry_offsets != stack_offsets) free(entry_offsets); + return 0; +} + +/** 
+ * tidesdb_klog_block_deserialize
+ * @param data input buffer
+ * @param data_size input buffer size
+ * @param block output klog block
+ * @return 0 on success, -1 on error
+ */
+static int tidesdb_klog_block_deserialize(const uint8_t *data, const size_t data_size,
+                                          tidesdb_klog_block_t **block, const int zero_copy)
+{
+    if (!data || !data_size || !block) return TDB_ERR_INVALID_ARGS;
+
+    if (data_size < sizeof(uint32_t) * 2) return TDB_ERR_CORRUPTION;
+
+    /* we use arena allocation -- single malloc for entire block structure
+     * layout -- block_struct | entries[] | keys[] | inline_values[] | key_data | value_data
+     * when zero_copy=1, keys/values point directly into the source data buffer
+     * instead of being copied, eliminating the memcpy overhead.
+     * the caller must keep the source data buffer alive for the block's lifetime.
+     * this reduces malloc calls from O(N) to O(1) per block
+     *
+     * zero_copy is the fourth parameter: non-zero selects the pointer-into-data mode
+     * described above, zero copies key/value bytes into the arena.
+     * return codes used by this function: TDB_SUCCESS on success, and
+     * TDB_ERR_INVALID_ARGS, TDB_ERR_CORRUPTION or TDB_ERR_MEMORY on failure.
+     * on every corruption return below *block is reset to NULL. */
+    const uint8_t *ptr = data;
+
+    const uint32_t num_entries = decode_uint32_le_compat(ptr);
+    ptr += sizeof(uint32_t);
+    const uint32_t block_size = decode_uint32_le_compat(ptr);
+    ptr += sizeof(uint32_t);
+
+    /* num_entries must be reasonable for the data size
+     * each entry needs at least 4 bytes (flags + 3 varints min) */
+    if (num_entries > data_size / 4) return TDB_ERR_CORRUPTION;
+
+    /* arena layout:
+     * block_struct | entries[] | keys_ptrs[] | values_ptrs[]
+     * when !zero_copy, also-- | key_data | value_data
+     * the copied key/value bytes can never exceed data_size, which is why
+     * data_size is the size reserved for them below */
+    const size_t hdr_size = sizeof(tidesdb_klog_block_t) +
+                            (num_entries * sizeof(tidesdb_klog_entry_t)) +
+                            (num_entries * sizeof(uint8_t *)) + /* keys array */
+                            (num_entries * sizeof(uint8_t *));  /* inline_values array */
+
+    const size_t arena_size = zero_copy ? hdr_size : (hdr_size + data_size);
+
+    uint8_t *arena = malloc(arena_size);
+    if (!arena) return TDB_ERR_MEMORY;
+
+    /* we partition arena into sections */
+    *block = (tidesdb_klog_block_t *)arena;
+    memset(*block, 0, sizeof(tidesdb_klog_block_t));
+
+    /* we mark as arena-allocated for proper cleanup */
+    (*block)->is_arena_allocated = 1;
+    (*block)->is_zero_copy = (uint8_t)zero_copy;
+
+    uint8_t *arena_ptr = arena + sizeof(tidesdb_klog_block_t);
+    (*block)->entries = (tidesdb_klog_entry_t *)arena_ptr;
+    arena_ptr += num_entries * sizeof(tidesdb_klog_entry_t);
+
+    (*block)->keys = (uint8_t **)arena_ptr;
+    arena_ptr += num_entries * sizeof(uint8_t *);
+
+    (*block)->inline_values = (uint8_t **)arena_ptr;
+    arena_ptr += num_entries * sizeof(uint8_t *);
+
+    /* data_arena only used for non-zero-copy mode */
+    uint8_t *data_arena = zero_copy ? NULL : arena_ptr;
+
+    (*block)->num_entries = 0; /* stays 0 until every entry has parsed */
+    (*block)->block_size = block_size;
+    (*block)->capacity = num_entries;
+
+    uint64_t prev_seq = 0; /* absolute seq of the previous entry, base for delta-seq */
+    size_t remaining = data_size - (ptr - data);
+    size_t data_offset = 0; /* write cursor inside data_arena (copy mode only) */
+
+    for (uint32_t i = 0; i < num_entries; i++)
+    {
+        if (remaining < 1)
+        {
+            TDB_DEBUG_LOG(TDB_LOG_FATAL, "Entry exceeds bounds at entry %u", i);
+            tidesdb_klog_block_free(*block);
+            *block = NULL;
+            return TDB_ERR_CORRUPTION;
+        }
+
+        uint8_t flags = *ptr++;
+        remaining--;
+        (*block)->entries[i].flags = flags & ~(TDB_KV_FLAG_DELTA_SEQ | TDB_KV_FLAG_TRANSIENT_MASK);
+
+        uint64_t key_size_u64;
+        int bytes_read = decode_varint(ptr, &key_size_u64, (int)remaining);
+        if (bytes_read < 0 || key_size_u64 > UINT32_MAX)
+        {
+            TDB_DEBUG_LOG(TDB_LOG_FATAL, "Invalid key_size varint at entry %u", i);
+            tidesdb_klog_block_free(*block);
+            *block = NULL;
+            return TDB_ERR_CORRUPTION;
+        }
+        ptr += bytes_read;
+        remaining -= bytes_read;
+        (*block)->entries[i].key_size = (uint32_t)key_size_u64;
+
+        uint64_t value_size_u64;
+        bytes_read = decode_varint(ptr, &value_size_u64, (int)remaining);
+        if (bytes_read < 0 || value_size_u64 > UINT32_MAX)
+        {
+            TDB_DEBUG_LOG(TDB_LOG_FATAL, "Invalid value_size varint at entry %u", i);
+            tidesdb_klog_block_free(*block);
+            *block = NULL;
+            return TDB_ERR_CORRUPTION;
+        }
+        ptr += bytes_read;
+        remaining -= bytes_read;
+        (*block)->entries[i].value_size = (uint32_t)value_size_u64;
+
+        uint64_t seq_value;
+        bytes_read = decode_varint(ptr, &seq_value, (int)remaining);
+        if (bytes_read < 0)
+        {
+            TDB_DEBUG_LOG(TDB_LOG_FATAL, "Invalid seq varint at entry %u", i);
+            tidesdb_klog_block_free(*block);
+            *block = NULL;
+            return TDB_ERR_CORRUPTION;
+        }
+        ptr += bytes_read;
+        remaining -= bytes_read;
+
+        if (flags & TDB_KV_FLAG_DELTA_SEQ)
+        {
+            (*block)->entries[i].seq = prev_seq + seq_value; /* stored as delta from previous */
+        }
+        else
+        {
+            (*block)->entries[i].seq = seq_value; /* stored as absolute value */
+        }
+        prev_seq = (*block)->entries[i].seq;
+
+        if (flags & TDB_KV_FLAG_HAS_TTL)
+        {
+            if (remaining < sizeof(int64_t))
+            {
+                TDB_DEBUG_LOG(TDB_LOG_FATAL, "TTL exceeds bounds at entry %u", i);
+                tidesdb_klog_block_free(*block);
+                *block = NULL;
+                return TDB_ERR_CORRUPTION;
+            }
+            (*block)->entries[i].ttl = decode_int64_le_compat(ptr);
+            ptr += sizeof(int64_t);
+            remaining -= sizeof(int64_t);
+        }
+        else
+        {
+            (*block)->entries[i].ttl = 0;
+        }
+
+        if (flags & TDB_KV_FLAG_HAS_VLOG)
+        {
+            uint64_t vlog_offset;
+            bytes_read = decode_varint(ptr, &vlog_offset, (int)remaining);
+            if (bytes_read < 0)
+            {
+                TDB_DEBUG_LOG(TDB_LOG_FATAL, "Invalid vlog_offset varint at entry %u", i);
+                tidesdb_klog_block_free(*block);
+                *block = NULL;
+                return TDB_ERR_CORRUPTION;
+            }
+            ptr += bytes_read;
+            remaining -= bytes_read;
+            (*block)->entries[i].vlog_offset = vlog_offset;
+        }
+        else
+        {
+            (*block)->entries[i].vlog_offset = 0;
+        }
+
+        if (remaining < (*block)->entries[i].key_size)
+        {
+            TDB_DEBUG_LOG(TDB_LOG_FATAL, "Key data exceeds bounds at entry %u", i);
+            free(arena); /* NOTE(review): earlier error paths use tidesdb_klog_block_free(*block) */
+            *block = NULL;
+            return TDB_ERR_CORRUPTION;
+        }
+
+        if (zero_copy)
+        {
+            (*block)->keys[i] = (uint8_t *)ptr;
+        }
+        else
+        {
+            /* we copy into arena */
+            (*block)->keys[i] = data_arena + data_offset;
+            memcpy((*block)->keys[i], ptr, (*block)->entries[i].key_size);
+            data_offset += (*block)->entries[i].key_size;
+        }
+        ptr += (*block)->entries[i].key_size;
+        remaining -= (*block)->entries[i].key_size;
+
+        if (!(flags & TDB_KV_FLAG_HAS_VLOG) && (*block)->entries[i].value_size > 0)
+        {
+            if (remaining < (*block)->entries[i].value_size)
+            {
+                TDB_DEBUG_LOG(TDB_LOG_FATAL, "Inline value exceeds bounds at entry %u", i);
+                free(arena); /* NOTE(review): same direct free as the key-bounds path above */
+                *block = NULL;
+                return TDB_ERR_CORRUPTION;
+            }
+
+            if (zero_copy)
+            {
+                /* we point directly into the source data buffer */
+                (*block)->inline_values[i] = (uint8_t *)ptr;
+            }
+            else
+            {
+                (*block)->inline_values[i] = data_arena + data_offset;
+                memcpy((*block)->inline_values[i], ptr, (*block)->entries[i].value_size);
+                data_offset += (*block)->entries[i].value_size;
+            }
+            ptr += (*block)->entries[i].value_size;
+            remaining -= (*block)->entries[i].value_size;
+        }
+        else
+        {
+            (*block)->inline_values[i] = NULL; /* value lives in the vlog or is empty */
+        }
+    }
+
+    (*block)->num_entries = num_entries;
+
+    if (num_entries > 0)
+    {
+        const uint32_t last_idx = num_entries - 1;
+        if (zero_copy)
+        {
+            /* in zero-copy mode, keys[last_idx] points into the source buffer
+             * which is kept alive by the caller. iterator seeks use
+             * keys[num_entries-1] directly, so max_key is not needed.
+             * skip the malloc+memcpy to eliminate the last per-block allocation. */
+            (*block)->max_key = NULL;
+            (*block)->max_key_size = (*block)->entries[last_idx].key_size;
+        }
+        else
+        {
+            (*block)->max_key = malloc((*block)->entries[last_idx].key_size);
+            if ((*block)->max_key) /* NOTE(review): allocation failure is not reported; max_key stays NULL */
+            {
+                memcpy((*block)->max_key, (*block)->keys[last_idx],
+                       (*block)->entries[last_idx].key_size);
+                (*block)->max_key_size = (*block)->entries[last_idx].key_size;
+            }
+        }
+    }
+
+    return TDB_SUCCESS;
+}
+
+/**
+ * tidesdb_vlog_read_value
+ * read a value from vlog
+ * reads the whole vlog block at vlog_offset, decompresses it when the sstable config
+ * names a compression algorithm other than TDB_COMPRESS_NONE, and checks the resulting
+ * length against value_size.
+ * @param db database instance
+ * @param sst sstable containing vlog (its vlog is opened on demand by this call)
+ * @param vlog_offset offset of value in vlog
+ * @param value_size size of value; 0 skips the length check
+ * @param value output value; set to a heap buffer on success (this function releases it
+ *              with free() on the size-mismatch paths -- presumably the caller frees it
+ *              on success, confirm against callers)
+ * @return TDB_SUCCESS on success; TDB_ERR_INVALID_ARGS, TDB_ERR_IO, TDB_ERR_CORRUPTION
+ *         or TDB_ERR_MEMORY on failure
+ */
+static int tidesdb_vlog_read_value(const tidesdb_t *db, tidesdb_sstable_t *sst,
+                                   const uint64_t vlog_offset, const size_t value_size,
+                                   uint8_t **value)
+{
+    if (!db || !sst || !value) return TDB_ERR_INVALID_ARGS;
+
+    /* the vlog is opened lazily on first non-inline value read. the const cast is safe:
+     * opening the vlog mutates sst (not db's logical state) and does not touch
+     * num_open_sstables, which is keyed on the klog. */
+    if (tidesdb_sstable_ensure_vlog_open((tidesdb_t *)db, sst) != 0)
+    {
+        return TDB_ERR_IO;
+    }
+
+    tidesdb_block_managers_t bms;
+    if (tidesdb_sstable_get_block_managers(db, sst, &bms) != TDB_SUCCESS)
+    {
+        return TDB_ERR_IO;
+    }
+
+    /* vlog_offset is a direct file offset pointing to the vlog block containing the raw value */
+    uint32_t block_size;
+    if (block_manager_get_block_size_at_offset(bms.vlog_bm, vlog_offset, &block_size) != 0)
+    {
+        return TDB_ERR_IO;
+    }
+
+    if (block_size == 0 || block_size > UINT32_MAX / 2) return TDB_ERR_CORRUPTION;
+
+    uint8_t *block_data = malloc(block_size);
+    if (!block_data)
+    {
+        return TDB_ERR_MEMORY;
+    }
+
+    const uint64_t data_offset = vlog_offset + BLOCK_MANAGER_BLOCK_HEADER_SIZE;
+    if (block_manager_read_at_offset(bms.vlog_bm, data_offset, block_size, block_data) != 0)
+    {
+        free(block_data);
+        return TDB_ERR_IO;
+    }
+
+    if (sst->config && sst->config->compression_algorithm != TDB_COMPRESS_NONE)
+    {
+        size_t decompressed_size;
+        uint8_t *decompressed = decompress_data(block_data, block_size, &decompressed_size,
+                                                sst->config->compression_algorithm);
+        if (decompressed)
+        {
+            free(block_data); /* the compressed copy is no longer needed */
+            *value = decompressed;
+
+            /* we validate size if provided (value_size == 0 means "do not check") */
+            if (value_size > 0 && decompressed_size != value_size)
+            {
+                TDB_DEBUG_LOG(TDB_LOG_FATAL, "Value size mismatch (expected %zu, got %zu)",
+                              value_size, decompressed_size);
+                free(*value);
+                *value = NULL;
+                return TDB_ERR_CORRUPTION;
+            }
+            return TDB_SUCCESS;
+        }
+        /* decompression failed */
+        free(block_data);
+        return TDB_ERR_CORRUPTION;
+    }
+
+    *value = block_data; /* uncompressed: the block bytes are the value */
+
+    if (value_size > 0 && block_size != value_size)
+    {
+        TDB_DEBUG_LOG(TDB_LOG_FATAL, "Value size mismatch (expected %zu, got %u)", value_size,
+                      block_size);
+        free(*value);
+        *value = NULL;
+        return TDB_ERR_CORRUPTION;
+    }
+
+    return TDB_SUCCESS;
+}
+
+/**
+ * tidesdb_sstable_get_block_managers
+ * gets block managers for an sstable through the cache
+ * @param db database instance
+ * @param sst
sstable + * @param bms output block managers structure + * @return TDB_SUCCESS on success, TDB_ERR_IO on failure + */ +static int tidesdb_sstable_get_block_managers(const tidesdb_t *db, tidesdb_sstable_t *sst, + tidesdb_block_managers_t *bms) +{ + if (!db || !sst || !bms) return TDB_ERR_IO; + + bms->klog_bm = sst->klog_bm; + bms->vlog_bm = sst->vlog_bm; + + /* the vlog is opened lazily, so it may legitimately be NULL here; only the klog is + * guaranteed open. callers that read values must first call + * tidesdb_sstable_ensure_vlog_open (tidesdb_vlog_read_value does so at its top). */ + if (!bms->klog_bm) + { + return TDB_ERR_IO; + } + + return TDB_SUCCESS; +} + +/** + * tdb_path_to_object_key + * convert a local file path to an object store key by stripping the db_path prefix. + * e.g. "/var/lib/tidesdb/mycf/L1_42.klog" -> "mycf/L1_42.klog" + * @param db database instance (used to determine db_path prefix) + * @param local_path absolute local file path to convert + * @param key_out output buffer for the resulting object key + * @param key_buf_size size of the key_out buffer + */ +static void tdb_path_to_object_key(const tidesdb_t *db, const char *local_path, char *key_out, + const size_t key_buf_size) +{ + const char *base = db->db_path; + const size_t base_len = strlen(base); + const size_t path_len = strlen(local_path); + + /* we guard against local_path that does not start with db_path */ + if (path_len <= base_len || strncmp(local_path, base, base_len) != 0) + { + snprintf(key_out, key_buf_size, "%s", local_path); + return; + } + + const char *rel = local_path + base_len; + if (*rel == '/' || *rel == '\\') rel++; + snprintf(key_out, key_buf_size, "%s", rel); +} + +/** + * tdb_upload_job_t + * background upload job for the async upload pipeline + * @param local_path local file path of the file to upload + * @param object_key object store key derived from local_path + * @param wal_generation WAL generation to fence after upload (0 = no fence) + */ +typedef 
struct
+{
+    char local_path[TDB_MAX_PATH_LEN];
+    char object_key[TDB_MAX_PATH_LEN];
+    uint64_t wal_generation; /* WAL gen to fence after upload (0 = no fence) */
+} tdb_upload_job_t;
+
+/**
+ * tdb_upload_worker_thread
+ * background thread that dequeues upload jobs and calls connector->put
+ * each job is attempted up to TDB_UPLOAD_MAX_RETRIES times, sleeping backoff_us between
+ * attempts and multiplying it by TDB_UPLOAD_BACKOFF_MULTIPLIER after each sleep.
+ * a successful put on any key that does not contain TDB_COLUMN_FAMILY_MANIFEST_NAME is
+ * verified through connector->exists (object present and same size as the local file).
+ * success bumps total_uploads, advances last_uploaded_gen for jobs with a WAL
+ * generation and unlinks the local WAL; failure bumps total_upload_failures.
+ * the job is freed in every case. jobs are dropped without counting when the connector
+ * or its put callback is missing.
+ * NOTE(review): put is NULL-checked but exists is called unchecked -- confirm every
+ * connector implements exists.
+ * NOTE(review): when stat() on the local file fails the size verification is skipped
+ * and the upload is still treated as successful.
+ * @param arg pointer to the tidesdb_t instance
+ * @return NULL on thread exit
+ */
+static void *tdb_upload_worker_thread(void *arg)
+{
+    tidesdb_t *db = (tidesdb_t *)arg;
+
+    while (1)
+    {
+        tdb_upload_job_t *job = (tdb_upload_job_t *)queue_dequeue_wait(db->upload_queue);
+        if (!job) break; /* NULL = shutdown signal */
+
+        if (db->object_store && db->object_store->put)
+        {
+            int rc = -1; /* stays -1 when TDB_UPLOAD_MAX_RETRIES allows no attempt */
+            unsigned int backoff_us = TDB_UPLOAD_INITIAL_BACKOFF_US;
+            for (int attempt = 0; attempt < TDB_UPLOAD_MAX_RETRIES; attempt++)
+            {
+                rc = db->object_store->put(db->object_store->ctx, job->object_key, job->local_path);
+                if (rc != 0)
+                {
+                    TDB_DEBUG_LOG(TDB_LOG_WARN, "Upload attempt %d/%d failed: %s", attempt + 1,
+                                  TDB_UPLOAD_MAX_RETRIES, job->object_key);
+                }
+                else if (!strstr(job->object_key, TDB_COLUMN_FAMILY_MANIFEST_NAME))
+                {
+                    /* verify the upload landed with the correct size. MANIFEST is skipped --
+                     * it is mutable and may grow between upload and the exists check due to
+                     * concurrent flushes. a verify mismatch now retries the put (rc=-1 below)
+                     * instead of being a permanent failure with no re-upload. */
+                    struct stat local_st;
+                    if (stat(job->local_path, &local_st) == 0)
+                    {
+                        size_t remote_size = 0;
+                        const int verify = db->object_store->exists(db->object_store->ctx,
+                                                                    job->object_key, &remote_size);
+                        if (verify != 1 || remote_size != (size_t)local_st.st_size)
+                        {
+                            TDB_DEBUG_LOG(
+                                TDB_LOG_ERROR,
+                                "Upload verification failed for %s (local=%zu, remote=%zu, "
+                                "exists=%d)",
+                                job->object_key, (size_t)local_st.st_size, remote_size, verify);
+                            rc = -1; /* force another put attempt */
+                        }
+                    }
+                }
+
+                if (rc == 0) break;
+
+                if (attempt + 1 < TDB_UPLOAD_MAX_RETRIES) /* no sleep after the final attempt */
+                {
+                    usleep(backoff_us);
+                    backoff_us *= TDB_UPLOAD_BACKOFF_MULTIPLIER;
+                }
+            }
+
+            if (rc == 0)
+            {
+                atomic_fetch_add_explicit(&db->total_uploads, 1, memory_order_relaxed);
+
+                /* we update WAL fence if this upload advances it.
+                 * the CAS loop only ever raises last_uploaded_gen: it stops as soon as
+                 * the stored value is already >= this job's generation */
+                if (job->wal_generation > 0)
+                {
+                    uint64_t cur =
+                        atomic_load_explicit(&db->last_uploaded_gen, memory_order_relaxed);
+                    while (job->wal_generation > cur)
+                    {
+                        if (atomic_compare_exchange_weak_explicit(
+                                &db->last_uploaded_gen, &cur, job->wal_generation,
+                                memory_order_release, memory_order_relaxed))
+                            break;
+                    }
+
+                    /* the rotated WAL is now confirmed present on the object
+                     * store (the exists + size verify above proved it), so the
+                     * upload worker deletes the local copy here. this replaces
+                     * the reaper's old synchronous per-generation exists() sweep.
+                     * recovery can replay the WAL from the object store if the
+                     * node restarts before the immutable has flushed. */
+                    tdb_unlink(job->local_path);
+                }
+            }
+            else
+            {
+                atomic_fetch_add_explicit(&db->total_upload_failures, 1, memory_order_relaxed);
+                TDB_DEBUG_LOG(TDB_LOG_ERROR, "Upload permanently failed after %d attempts: %s",
+                              TDB_UPLOAD_MAX_RETRIES, job->object_key);
+            }
+        }
+
+        free(job);
+    }
+
+    return NULL;
+}
+
+/**
+ * tdb_objstore_enqueue_upload
+ * enqueue a file for background upload. non-blocking.
+ * @param db database instance + * @param local_path local file path to upload + * @param wal_generation WAL generation to fence after upload (0 = no fence) + */ +static void tdb_objstore_enqueue_upload(const tidesdb_t *db, const char *local_path, + const uint64_t wal_generation) +{ + if (!db->object_store || !db->upload_queue || !local_path) return; + + tdb_upload_job_t *job = malloc(sizeof(tdb_upload_job_t)); + if (!job) return; + + snprintf(job->local_path, sizeof(job->local_path), "%s", local_path); + tdb_path_to_object_key(db, local_path, job->object_key, sizeof(job->object_key)); + job->wal_generation = wal_generation; + + if (queue_enqueue(db->upload_queue, job) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to enqueue upload job: %s", job->object_key); + free(job); + } +} + +/** + * tdb_objstore_upload_file_sync + * upload a local file synchronously (blocks until complete). + * used for small metadata files (config.ini, MANIFEST) that must be + * visible immediately after the call returns. 
+ * @param db database instance + * @param local_path local file path to upload + */ +static void tdb_objstore_upload_file_sync(const tidesdb_t *db, const char *local_path) +{ + if (!db->object_store || !local_path) return; + char key[TDB_MAX_PATH_LEN]; + tdb_path_to_object_key(db, local_path, key, sizeof(key)); + + /* we retry with exponential backoff matching the async upload worker */ + unsigned int delay_us = TDB_UPLOAD_INITIAL_BACKOFF_US; + for (int attempt = 0; attempt < TDB_UPLOAD_MAX_RETRIES; attempt++) + { + if (db->object_store->put(db->object_store->ctx, key, local_path) == 0) return; + + TDB_DEBUG_LOG(TDB_LOG_WARN, "Object store sync upload attempt %d/%d failed: %s", + attempt + 1, TDB_UPLOAD_MAX_RETRIES, key); + if (attempt + 1 < TDB_UPLOAD_MAX_RETRIES) usleep(delay_us); + delay_us *= TDB_UPLOAD_BACKOFF_MULTIPLIER; + } + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Object store sync upload failed after %d attempts: %s", + TDB_UPLOAD_MAX_RETRIES, key); +} + +/** + * tdb_objstore_upload_file + * upload a local file to the object store. + * uses async pipeline for sstable data files, falls back to synchronous. + * @param db database instance + * @param local_path local file path to upload + */ +static void tdb_objstore_upload_file(tidesdb_t *db, const char *local_path) +{ + if (!db->object_store || !local_path) return; + + /* we use async pipeline if upload queue exists */ + if (db->upload_queue) + { + tdb_objstore_enqueue_upload(db, local_path, 0); + return; + } + + /* we fallback to synchronous upload */ + char key[TDB_MAX_PATH_LEN]; + tdb_path_to_object_key(db, local_path, key, sizeof(key)); + if (db->object_store->put(db->object_store->ctx, key, local_path) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Object store upload failed: %s", key); + } +} + +/** + * tdb_objstore_delete_file + * delete an object from the object store corresponding to a local path. + * retries with exponential backoff on transient failures. 
+ * @param db database instance + * @param local_path local file path whose corresponding object should be deleted + */ +static void tdb_objstore_delete_file(const tidesdb_t *db, const char *local_path) +{ + if (!db->object_store || !local_path) return; + char key[TDB_MAX_PATH_LEN]; + tdb_path_to_object_key(db, local_path, key, sizeof(key)); + + unsigned int delay_us = TDB_UPLOAD_INITIAL_BACKOFF_US; + for (int attempt = 0; attempt < TDB_UPLOAD_MAX_RETRIES; attempt++) + { + if (db->object_store->delete_object(db->object_store->ctx, key) == 0) return; + + TDB_DEBUG_LOG(TDB_LOG_WARN, "Object store delete attempt %d/%d failed: %s", attempt + 1, + TDB_UPLOAD_MAX_RETRIES, key); + if (attempt + 1 < TDB_UPLOAD_MAX_RETRIES) usleep(delay_us); + delay_us *= TDB_UPLOAD_BACKOFF_MULTIPLIER; + } + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Object store delete failed after %d attempts: %s", + TDB_UPLOAD_MAX_RETRIES, key); +} + +/** + * tdb_objstore_download_if_missing + * download a file from object store if it does not exist locally. + * creates intermediate directories as needed. 
+ * @param db database instance + * @param local_path local file path to check and potentially download + * @return 0 if file is available locally (existed or downloaded), -1 on error + */ +static int tdb_objstore_download_if_missing(const tidesdb_t *db, const char *local_path) +{ + if (!db->object_store) return 0; + + struct stat st; + if (stat(local_path, &st) == 0) + { + /* file exists locally */ + if (db->local_cache) tdb_local_cache_touch(db->local_cache, local_path); + return 0; + } + + /* we create parent directory if needed */ + char dir_buf[TDB_MAX_PATH_LEN]; + snprintf(dir_buf, sizeof(dir_buf), "%s", local_path); + char *last_sep = strrchr(dir_buf, '/'); +#ifdef _WIN32 + char *last_bsep = strrchr(dir_buf, '\\'); + if (last_bsep && (!last_sep || last_bsep > last_sep)) last_sep = last_bsep; +#endif + if (last_sep) + { + *last_sep = '\0'; + mkdir(dir_buf, TDB_DIR_PERMISSIONS); + } + + char key[TDB_MAX_PATH_LEN]; + tdb_path_to_object_key(db, local_path, key, sizeof(key)); + + /*** we check if object exists in store before attempting download. + ** during flush, new sstables are being created locally for the first time + ** and dont exist in the object store yet, that is not an error. + * exists returns 0 not found, 1 found, -1 error. + *** on -1 we treat it as not in remote and let block_manager_open create + ** the file locally. a transient head failure on a read path will resolve + * on the next access. attempting download with no real remote object + ** burns retries and aborts a fresh sstable write, which is worse. 
*/ + const int exists_rc = db->object_store->exists(db->object_store->ctx, key, NULL); + if (exists_rc != 1) + { + return 0; + } + + /* exists_rc == 1, object is confirmed remote, we attempt download with retry */ + { + int get_rc = -1; + unsigned int backoff_us = TDB_DOWNLOAD_INITIAL_BACKOFF_US; + for (int attempt = 0; attempt < TDB_DOWNLOAD_MAX_RETRIES; attempt++) + { + get_rc = db->object_store->get(db->object_store->ctx, key, local_path); + if (get_rc == 0) break; + + TDB_DEBUG_LOG(TDB_LOG_WARN, "Download attempt %d/%d failed: %s", attempt + 1, + TDB_DOWNLOAD_MAX_RETRIES, key); + + if (attempt + 1 < TDB_DOWNLOAD_MAX_RETRIES) + { + usleep(backoff_us); + backoff_us *= TDB_DOWNLOAD_BACKOFF_MULTIPLIER; + } + } + + if (get_rc != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Object store download permanently failed after %d attempts: %s", + TDB_DOWNLOAD_MAX_RETRIES, key); + return -1; + } + } + + if (db->local_cache) tdb_local_cache_track(db->local_cache, local_path); + return 0; +} + +/** + * tidesdb_sstable_range_get_block + * fetch a single klog block from the object store via range_get without downloading + * the full file. reads the 8-byte header (size + checksum) and block data in a single + * range_get call, verifies the checksum, and decompresses if needed. 
+ * @param db database instance + * @param sst sstable (for object key derivation and compression config) + * @param block_offset byte offset of the block in the klog file + * @param block_out receives the decompressed block (caller must free via + * block_manager_block_release) + * @return 0 on success, -1 on error + */ +static int tidesdb_sstable_range_get_block(const tidesdb_t *db, const tidesdb_sstable_t *sst, + const uint64_t block_offset, + block_manager_block_t **block_out) +{ + if (!db->object_store || !sst->klog_path) return -1; + + char key[TDB_MAX_PATH_LEN]; + tdb_path_to_object_key(db, sst->klog_path, key, sizeof(key)); + + /* we read header (8 bytes) + max uncompressed block size in one range_get call. + * compressed blocks are smaller than TDB_KLOG_BLOCK_SIZE so this always covers + * the full block. we parse the actual size from the header and use only that. */ + const size_t max_read = BLOCK_MANAGER_BLOCK_HEADER_SIZE + TDB_KLOG_BLOCK_SIZE; + uint8_t *buf = malloc(max_read); + if (!buf) return -1; + + const ssize_t nread = + db->object_store->range_get(db->object_store->ctx, key, block_offset, buf, max_read); + if (nread < (ssize_t)BLOCK_MANAGER_BLOCK_HEADER_SIZE) + { + free(buf); + return -1; + } + + const uint32_t block_size = decode_uint32_le_compat(buf); + const uint32_t stored_checksum = decode_uint32_le_compat(buf + BLOCK_MANAGER_SIZE_FIELD_SIZE); + + if (block_size == 0 || nread < (ssize_t)(BLOCK_MANAGER_BLOCK_HEADER_SIZE + block_size)) + { + free(buf); + return -1; + } + + const uint8_t *block_data = buf + BLOCK_MANAGER_BLOCK_HEADER_SIZE; + + /* we verify checksum (XXH32 with seed 0, matching block_manager) */ + if (XXH32(block_data, block_size, 0) != stored_checksum) + { + free(buf); + return -1; + } + + /* we create block, copying data out of the read buffer */ + block_manager_block_t *block = malloc(sizeof(block_manager_block_t)); + if (!block) + { + free(buf); + return -1; + } + + block->data = malloc(block_size); + if (!block->data) + { 
+ free(block); + free(buf); + return -1; + } + + memcpy(block->data, block_data, block_size); + block->size = block_size; + atomic_init(&block->ref_count, 1); + block->inline_data = 0; + free(buf); + + /* we decompress if needed */ + if (sst->config && sst->config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t decompressed_size; + uint8_t *decompressed = decompress_data(block->data, block->size, &decompressed_size, + sst->config->compression_algorithm); + if (decompressed) + { + if (!block->inline_data) free(block->data); + block->data = decompressed; + block->size = decompressed_size; + block->inline_data = 0; + } + else + { + block_manager_block_release(block); + return -1; + } + } + + *block_out = block; + return 0; +} + +/** + * tidesdb_vlog_range_get_value + * fetch a value from the vlog via range_get without downloading the full vlog file. + * reads the block header to get the size, then the block data, verifies checksum, + * and decompresses if needed. + * @param db database instance + * @param sst sstable (for vlog object key and compression config) + * @param vlog_offset byte offset of the vlog block + * @param value_size expected value size (0 = unknown) + * @param value receives the value data (caller must free) + * @return 0 on success, non-zero on error + */ +static int tidesdb_vlog_range_get_value(const tidesdb_t *db, const tidesdb_sstable_t *sst, + const uint64_t vlog_offset, const size_t value_size, + uint8_t **value) +{ + if (!db->object_store || !sst->vlog_path) return TDB_ERR_IO; + + char key[TDB_MAX_PATH_LEN]; + tdb_path_to_object_key(db, sst->vlog_path, key, sizeof(key)); + + /* we read header first to get block size */ + uint8_t header[BLOCK_MANAGER_BLOCK_HEADER_SIZE]; + const ssize_t hread = db->object_store->range_get(db->object_store->ctx, key, vlog_offset, + header, BLOCK_MANAGER_BLOCK_HEADER_SIZE); + if (hread < (ssize_t)BLOCK_MANAGER_BLOCK_HEADER_SIZE) return TDB_ERR_IO; + + const uint32_t block_size = 
decode_uint32_le_compat(header); + if (block_size == 0 || block_size > UINT32_MAX / 2) return TDB_ERR_CORRUPTION; + + const uint32_t stored_checksum = + decode_uint32_le_compat(header + BLOCK_MANAGER_SIZE_FIELD_SIZE); + + uint8_t *block_data = malloc(block_size); + if (!block_data) return TDB_ERR_MEMORY; + + ssize_t dread = db->object_store->range_get(db->object_store->ctx, key, + vlog_offset + BLOCK_MANAGER_BLOCK_HEADER_SIZE, + block_data, block_size); + if (dread < (ssize_t)block_size) + { + free(block_data); + return TDB_ERR_IO; + } + + /* we verify checksum (XXH32 with seed 0, matching block_manager) */ + if (XXH32(block_data, block_size, 0) != stored_checksum) + { + free(block_data); + return TDB_ERR_CORRUPTION; + } + + if (sst->config && sst->config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t decompressed_size; + uint8_t *decompressed = decompress_data(block_data, block_size, &decompressed_size, + sst->config->compression_algorithm); + if (decompressed) + { + free(block_data); + *value = decompressed; + if (value_size > 0 && decompressed_size != value_size) + { + free(*value); + *value = NULL; + return TDB_ERR_CORRUPTION; + } + return TDB_SUCCESS; + } + free(block_data); + return TDB_ERR_IO; + } + + *value = block_data; + return TDB_SUCCESS; +} + +/** + * tdb_objstore_upload_manifest + * upload the MANIFEST file to object store after a commit. + * @param db database instance + * @param cf column family whose MANIFEST should be uploaded + */ +static void tdb_objstore_upload_manifest(tidesdb_t *db, tidesdb_column_family_t *cf) +{ + if (!db->object_store || !cf || !cf->manifest || cf->manifest->path[0] == '\0') return; + /* a replica must never push to the bucket -- its manifest is a local mirror that a + * close-time flush/compaction can diverge from the primary's authoritative one. uploading + * it would obsolete the primary's real-data sstables. 
mirrors the sstable-upload gate in + * tidesdb_level_add_sstable; promotion clears replica_mode before any primary write. */ + if (atomic_load_explicit(&db->replica_mode, memory_order_acquire)) return; + /* MANIFEST is uploaded via the async pipeline to avoid blocking flush workers. + * the local MANIFEST is always up to date for same-node readers. remote readers + * doing cold start will see it after the upload completes. the async queue + * preserves ordering so the MANIFEST always reflects the latest sstable inventory. */ + tdb_objstore_upload_file(db, cf->manifest->path); +} + +/** + * tdb_prefetch_arg_t + * thread argument for parallel sstable prefetch during iterator creation + * @param db database instance + * @param local_path path to download + */ +typedef struct +{ + tidesdb_t *db; + const char *local_path; +} tdb_prefetch_arg_t; + +/** + * tdb_prefetch_worker + * download a single file from object store (runs on worker thread) + * @param arg pointer to tdb_prefetch_arg_t + * @return NULL + */ +static void *tdb_prefetch_worker(void *arg) +{ + tdb_prefetch_arg_t *ctx = (tdb_prefetch_arg_t *)arg; + tdb_objstore_download_if_missing(ctx->db, ctx->local_path); + return NULL; +} + +/** + * tdb_objstore_prefetch_sstables + * prefetch non-local sstable files in parallel for iterator creation. + * checks which klog and vlog files are missing locally and downloads them + * concurrently using one thread per file pair, bounded by max_concurrent_downloads. + * blocks until all downloads complete. + * @param db database instance + * @param ssts array of sstable pointers + * @param count number of sstables + */ +static void tdb_objstore_prefetch_sstables(tidesdb_t *db, tidesdb_sstable_t **ssts, const int count) +{ + if (!db->object_store || count == 0) return; + + int max_threads = db->config.object_store_config + ? 
db->config.object_store_config->max_concurrent_downloads + : 8; + if (max_threads <= 0) max_threads = 8; + + /* we collect non-local files (klog + vlog pairs) */ + tdb_prefetch_arg_t *args = malloc(count * 2 * sizeof(tdb_prefetch_arg_t)); + if (!args) return; + + int num_missing = 0; + for (int i = 0; i < count; i++) + { + if (!ssts[i] || !ssts[i]->klog_path || !ssts[i]->vlog_path) continue; + + struct stat st; + if (stat(ssts[i]->klog_path, &st) != 0) + { + args[num_missing].db = db; + args[num_missing].local_path = ssts[i]->klog_path; + num_missing++; + } + if (stat(ssts[i]->vlog_path, &st) != 0) + { + args[num_missing].db = db; + args[num_missing].local_path = ssts[i]->vlog_path; + num_missing++; + } + } + + if (num_missing == 0) + { + free(args); + return; + } + + /* we download in batches of max_threads */ + pthread_t *threads = malloc(max_threads * sizeof(pthread_t)); + if (!threads) + { + free(args); + return; + } + + int idx = 0; + while (idx < num_missing) + { + const int batch = (num_missing - idx < max_threads) ? (num_missing - idx) : max_threads; + int launched = 0; + + for (int i = 0; i < batch; i++) + { + if (pthread_create(&threads[launched], NULL, tdb_prefetch_worker, &args[idx + i]) == 0) + { + launched++; + } + else + { + tdb_prefetch_worker(&args[idx + i]); /* fallback to sync */ + } + } + + for (int i = 0; i < launched; i++) + { + pthread_join(threads[i], NULL); + } + + idx += batch; + } + + free(threads); + free(args); +} + +/** + * tdb_replica_sync_manifests + * for each CF, download the remote MANIFEST and diff against local. + * new sstables (in remote but not local) are loaded and added to levels. + * removed sstables (in local but not remote) are removed from levels. 
+ * @param db database instance in replica mode + */ +static void tdb_replica_sync_manifests(tidesdb_t *db) +{ + /* we discover and create any new CFs the primary added since last sync */ + tdb_replica_discover_new_cfs(db); + + pthread_rwlock_rdlock(&db->cf_list_lock); + for (int i = 0; i < db->num_column_families; i++) + { + tidesdb_column_family_t *cf = db->column_families[i]; + if (!cf || !cf->manifest) continue; + + /* we download remote MANIFEST to a temp path */ + char remote_key[TDB_MAX_PATH_LEN]; + snprintf(remote_key, sizeof(remote_key), "%s/" TDB_COLUMN_FAMILY_MANIFEST_NAME, cf->name); + char tmp_path[TDB_MAX_PATH_LEN]; + snprintf(tmp_path, sizeof(tmp_path), "%s" PATH_SEPARATOR TDB_REPLICA_MANIFEST_TMP, + cf->directory); + + if (db->object_store->get(db->object_store->ctx, remote_key, tmp_path) != 0) continue; + + tidesdb_manifest_t *remote_manifest = tidesdb_manifest_open(tmp_path); + if (!remote_manifest) + { + tdb_unlink(tmp_path); + continue; + } + + /* we collect new sstables (in remote, not in local) */ + for (int r = 0; r < remote_manifest->num_entries; r++) + { + tidesdb_manifest_entry_t *rme = &remote_manifest->entries[r]; + if (tidesdb_manifest_has_sstable(cf->manifest, rme->level, rme->id)) continue; + + char sst_base[MAX_FILE_PATH_LENGTH]; + snprintf(sst_base, sizeof(sst_base), "%s" PATH_SEPARATOR TDB_LEVEL_PREFIX "%d", + cf->directory, rme->level); + + /* we ensure level directory exists */ + mkdir(sst_base, TDB_DIR_PERMISSIONS); + + tidesdb_sstable_t *sst = tidesdb_sstable_create(db, sst_base, rme->id, &cf->config); + if (!sst) continue; + + sst->num_entries = rme->num_entries; + sst->klog_size = rme->size_bytes; + sst->db = db; + + if (tidesdb_sstable_ensure_open(db, sst) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Replica sync SSTable %d (L%d) not available for CF '%s'", + (int)rme->id, rme->level, cf->name); + tidesdb_sstable_unref(db, sst); + continue; + } + + /* we close BMs from ensure_open before load opens its own */ + if 
(sst->klog_bm) + { + block_manager_close(sst->klog_bm); + sst->klog_bm = NULL; + } + if (sst->vlog_bm) + { + block_manager_close(sst->vlog_bm); + sst->vlog_bm = NULL; + } + atomic_fetch_sub(&db->num_open_sstables, 1); + + tidesdb_sstable_load(db, sst); + + int level_idx = rme->level - 1; + if (level_idx >= 0 && level_idx < atomic_load(&cf->num_active_levels) && + cf->levels[level_idx]) + { + tidesdb_level_add_sstable(cf->levels[level_idx], sst); + tidesdb_manifest_add_sstable(cf->manifest, rme->level, rme->id, rme->num_entries, + rme->size_bytes); + + uint64_t cur_next = + atomic_load_explicit(&cf->next_sstable_id, memory_order_relaxed); + if (rme->id >= cur_next) + atomic_store_explicit(&cf->next_sstable_id, rme->id + 1, memory_order_relaxed); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Replica sync: added SSTable %d (L%d) for CF '%s'", + (int)rme->id, rme->level, cf->name); + } + tidesdb_sstable_unref(db, sst); + } + + /* we collect removed sstables (in local, not in remote) */ + for (int l = cf->manifest->num_entries - 1; l >= 0; l--) + { + tidesdb_manifest_entry_t *lme = &cf->manifest->entries[l]; + if (tidesdb_manifest_has_sstable(remote_manifest, lme->level, lme->id)) continue; + + int level_idx = lme->level - 1; + if (level_idx >= 0 && level_idx < atomic_load(&cf->num_active_levels) && + cf->levels[level_idx]) + { + /* hold array_readers while scanning the (lock-free, retire-able) sstables + * array and try_ref the match before touching it -- without this the array + * could be retired and freed, or the sstable unref'd to 0, under our raw + * pointer. the extra ref pins the sstable across level_remove_sstable (which + * drops the array's base ref) and is released right after. mirrors every + * other array reader; this replica-sync path was the lone exception. 
*/ + tidesdb_level_t *lvl = cf->levels[level_idx]; + atomic_fetch_add_explicit(&lvl->array_readers, 1, memory_order_acq_rel); + tidesdb_sstable_t **ssts = + atomic_load_explicit(&lvl->sstables, memory_order_acquire); + tidesdb_sstable_t *target = NULL; + for (int s = 0; ssts[s] != NULL; s++) + { + if (ssts[s]->id == lme->id && tidesdb_sstable_try_ref(ssts[s])) + { + target = ssts[s]; + break; + } + } + atomic_fetch_sub_explicit(&lvl->array_readers, 1, memory_order_release); + if (target) + { + atomic_store(&target->marked_for_deletion, 1); + tidesdb_level_remove_sstable(db, lvl, target); + TDB_DEBUG_LOG(TDB_LOG_INFO, "Replica sync removed SSTable %d (L%d) for CF '%s'", + (int)lme->id, lme->level, cf->name); + tidesdb_sstable_unref(db, target); + } + } + tidesdb_manifest_remove_sstable(cf->manifest, lme->level, lme->id); + } + + tidesdb_manifest_close(remote_manifest); + tdb_unlink(tmp_path); + } + pthread_rwlock_unlock(&db->cf_list_lock); +} + +/** + * tdb_wal_discovery_ctx_t + * context for WAL generation discovery from object store list() callback + */ +#define TDB_WAL_DISCOVERY_MAX 256 + +typedef struct +{ + uint64_t generations[TDB_WAL_DISCOVERY_MAX]; + int count; +} tdb_wal_discovery_ctx_t; + +/** + * tdb_wal_discovery_cb + * list() callback that extracts WAL generation numbers from object keys + * matching the pattern uwal_.log + */ +static void tdb_wal_discovery_cb(const char *key, const size_t size, void *cb_ctx) +{ + (void)size; + tdb_wal_discovery_ctx_t *ctx = (tdb_wal_discovery_ctx_t *)cb_ctx; + if (ctx->count >= TDB_WAL_DISCOVERY_MAX) return; + + const size_t prefix_len = sizeof(TDB_UNIFIED_WAL_PREFIX) - 1; + if (strncmp(key, TDB_UNIFIED_WAL_PREFIX, prefix_len) != 0) return; + + const char *num_start = key + prefix_len; + char *end = NULL; + const unsigned long long gen = strtoull(num_start, &end, 10); + if (end && strcmp(end, TDB_WAL_EXT) == 0) + { + ctx->generations[ctx->count++] = (uint64_t)gen; + } +} + +/** + * tdb_replica_replay_single_wal + * 
replay entries from a single downloaded WAL file into the unified memtable. + * uses sequence numbers for idempotent replay so entries already present are skipped. + * does not write a local WAL since the replica memtable is ephemeral. + * @param db database instance + * @param wal_local local path to the downloaded WAL file + * @param umt unified memtable to replay into + * @param max_seq_inout pointer to max sequence number (updated in place) + * @return number of entries replayed + */ +static int tdb_replica_replay_single_wal(tidesdb_t *db, const char *wal_local, + const tidesdb_memtable_t *umt, uint64_t *max_seq_inout) +{ + block_manager_t *wal = NULL; + if (block_manager_open(&wal, wal_local, TDB_SYNC_NONE) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Replica WAL replay failed to open %s", wal_local); + tdb_unlink(wal_local); + return 0; + } + + block_manager_cursor_t *cursor = NULL; + if (block_manager_cursor_init(&cursor, wal) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Replica WAL replay failed to init cursor for %s", wal_local); + block_manager_close(wal); + tdb_unlink(wal_local); + return 0; + } + + uint64_t max_seq = *max_seq_inout; + uint32_t max_cf_index = 0; + int replayed = 0; + + if (block_manager_cursor_goto_first(cursor) == 0) + { + while (1) + { + block_manager_block_t *block = block_manager_cursor_read(cursor); + if (!block) + { + if (block_manager_cursor_skip_corrupt(cursor) == 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Replica WAL replay skipped partial write"); + continue; + } + break; + } + + const uint8_t *ptr = block->data; + size_t remaining = block->size; + + /* we skip unified magic */ + if (remaining >= TDB_UNIFIED_WAL_MAGIC_SIZE) + { + uint16_t magic = ((uint16_t)ptr[0] << 8) | ptr[1]; + if (magic == TDB_UNIFIED_WAL_MAGIC) + { + ptr += TDB_UNIFIED_WAL_MAGIC_SIZE; + remaining -= TDB_UNIFIED_WAL_MAGIC_SIZE; + } + } + + while (remaining > TDB_UNIFIED_CF_PREFIX_SIZE) + { + uint32_t cf_index = tdb_decode_be32(ptr); + if (cf_index > max_cf_index) 
max_cf_index = cf_index; + ptr += TDB_UNIFIED_CF_PREFIX_SIZE; + remaining -= TDB_UNIFIED_CF_PREFIX_SIZE; + + if (remaining < 1) break; + uint8_t flags = *ptr++; + remaining--; + + uint64_t key_size_u64; + int br = decode_varint(ptr, &key_size_u64, (int)remaining); + if (br < 0 || key_size_u64 > UINT32_MAX) break; + ptr += br; + remaining -= br; + + uint64_t value_size_u64; + br = decode_varint(ptr, &value_size_u64, (int)remaining); + if (br < 0 || value_size_u64 > UINT32_MAX) break; + ptr += br; + remaining -= br; + + uint64_t seq_value; + br = decode_varint(ptr, &seq_value, (int)remaining); + if (br < 0) break; + ptr += br; + remaining -= br; + + int64_t ttl = 0; + if (flags & TDB_KV_FLAG_HAS_TTL) + { + if (remaining < sizeof(int64_t)) break; + ttl = decode_int64_le_compat(ptr); + ptr += sizeof(int64_t); + remaining -= sizeof(int64_t); + } + + if (remaining < key_size_u64) break; + const uint8_t *key = ptr; + ptr += key_size_u64; + remaining -= key_size_u64; + + const uint8_t *value = NULL; + if (value_size_u64 > 0) + { + if (remaining < value_size_u64) break; + value = ptr; + ptr += value_size_u64; + remaining -= value_size_u64; + } + + /* skip entries strictly below max_seq; equal-seq entries are + * sibling puts from the same txn (one commit_seq, many keys) + * and must all be applied. skip_list_put_with_seq rejects + * duplicate (key, seq) pairs, so re-replay is harmless. */ + if (seq_value < max_seq) continue; + + const size_t pk_total = TDB_UNIFIED_CF_PREFIX_SIZE + key_size_u64; + TDB_PREFIXED_KEY_ALLOC(prefixed, pk_total, _pk_stack_r); + if (!prefixed) break; + tdb_encode_be32(cf_index, prefixed); + memcpy(prefixed + TDB_UNIFIED_CF_PREFIX_SIZE, key, key_size_u64); + + const int is_delete = (flags & TDB_KV_FLAG_TOMBSTONE) ? 1 : 0; + /* preserve the single-delete subtype across replay (mirrors per-CF WAL + * replay) so compaction can still pair-cancel put+single-delete */ + int sl_flags = is_delete ? 
SKIP_LIST_FLAG_DELETED : 0; + if (is_delete && (flags & TDB_KV_FLAG_SINGLE_DELETE)) + sl_flags |= SKIP_LIST_FLAG_SINGLE_DELETE; + skip_list_put_with_seq( + umt->skip_list, prefixed, pk_total, is_delete ? NULL : (uint8_t *)value, + is_delete ? 0 : (size_t)value_size_u64, ttl, seq_value, sl_flags); + TDB_PREFIXED_KEY_FREE(prefixed, _pk_stack_r); + + if (seq_value > max_seq) max_seq = seq_value; + replayed++; + } + + block_manager_block_release(block); + + if (block_manager_cursor_next(cursor) != 0) break; + } + } + + block_manager_cursor_free(cursor); + block_manager_close(wal); + tdb_unlink(wal_local); + + /* we must ensure next_cf_index is past any cf_index seen in the WAL so that + * future CF creation via MANIFEST sync does not collide */ + if (db->unified_mt.enabled && max_cf_index > 0) + { + uint32_t needed = max_cf_index + 1; + uint32_t current = + atomic_load_explicit(&db->unified_mt.next_cf_index, memory_order_relaxed); + while (needed > current) + { + if (atomic_compare_exchange_weak_explicit(&db->unified_mt.next_cf_index, ¤t, + needed, memory_order_relaxed, + memory_order_relaxed)) + break; + } + } + + *max_seq_inout = max_seq; + return replayed; +} + +/** + * tdb_objstore_replay_remote_wals + * discover all unified WAL files in the object store via list(), download and + * replay each one in generation order into the unified memtable. used by replica + * sync for near-real-time reads and by cold-start recovery so a primary rebuilt + * from the object store does not lose committed-but-unflushed writes. derives the + * current generation from the highest discovered WAL. sequence numbers ensure + * idempotent replay -- entries already covered by recovered sstables are skipped. 
+ * @param db database instance with an object store and a unified memtable + * @param cold_start 1 when called from cold-start recovery, 0 from the sync thread (log prefix) + */ +static void tdb_objstore_replay_remote_wals(tidesdb_t *db, int cold_start) +{ + if (!db->unified_mt.enabled || !db->object_store) + { + TDB_DEBUG_LOG(TDB_LOG_DEBUG, "Replica WAL replay skipped (unified=%d, object_store=%p)", + db->unified_mt.enabled, (void *)db->object_store); + return; + } + + tidesdb_memtable_t *umt = atomic_load_explicit(&db->unified_mt.active, memory_order_acquire); + if (!umt || !umt->skip_list) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Replica WAL replay skipped: unified memtable not ready"); + return; + } + + /* we list all available WAL objects in the object store, with retry -- a transient list + * failure must not be mistaken for "no WALs" and silently skip WAL recovery (mirrors the + * retry in tdb_replica_discover_new_cfs) */ + tdb_wal_discovery_ctx_t discovery = {.count = 0}; + int list_rc = -1; + unsigned int backoff_us = TDB_LIST_INITIAL_BACKOFF_US; + for (int attempt = 0; attempt < TDB_LIST_MAX_RETRIES; attempt++) + { + discovery.count = 0; + list_rc = db->object_store->list(db->object_store->ctx, TDB_UNIFIED_WAL_PREFIX, + tdb_wal_discovery_cb, &discovery); + if (list_rc >= 0) break; + TDB_DEBUG_LOG(TDB_LOG_WARN, "Replica WAL replay object store list attempt %d/%d failed", + attempt + 1, TDB_LIST_MAX_RETRIES); + if (attempt + 1 < TDB_LIST_MAX_RETRIES) usleep(backoff_us); + backoff_us *= 2; + } + + if (list_rc < 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Replica WAL replay object store list failed after %d attempts, skipping", + TDB_LIST_MAX_RETRIES); + return; + } + + if (discovery.count == 0) + { + TDB_DEBUG_LOG(TDB_LOG_DEBUG, "Replica WAL replay no WAL files found in object store"); + return; + } + + /* we sort generations ascending for ordered replay */ + for (int i = 0; i < discovery.count - 1; i++) + { + for (int j = i + 1; j < discovery.count; j++) + { + if 
(discovery.generations[j] < discovery.generations[i]) + { + const uint64_t tmp = discovery.generations[i]; + discovery.generations[i] = discovery.generations[j]; + discovery.generations[j] = tmp; + } + } + } + + /* we derive remote generation from the highest discovered WAL */ + uint64_t remote_gen = discovery.generations[discovery.count - 1]; + uint64_t local_gen = atomic_load_explicit(&db->unified_mt.wal_generation, memory_order_relaxed); + if (remote_gen > local_gen) + { + atomic_store_explicit(&db->unified_mt.wal_generation, remote_gen, memory_order_relaxed); + } + + char wal_local[TDB_MAX_PATH_LEN]; + snprintf(wal_local, sizeof(wal_local), "%s" PATH_SEPARATOR TDB_REPLICA_WAL_TMP, db->db_path); + + /* global_seq is the next seq to assign; max_seq here means the highest + * seq already applied, so derive it by subtracting one (clamped at 0) */ + uint64_t cur_global = atomic_load_explicit(&db->global_seq, memory_order_acquire); + uint64_t max_seq = cur_global > 0 ? cur_global - 1 : 0; + const uint64_t start_max_seq = max_seq; + int total_replayed = 0; + + for (int wi = 0; wi < discovery.count; wi++) + { + char wal_key[TDB_MAX_PATH_LEN]; + snprintf(wal_key, sizeof(wal_key), TDB_UNIFIED_WAL_PREFIX TDB_U64_FMT TDB_WAL_EXT, + TDB_U64_CAST(discovery.generations[wi])); + + if (db->object_store->get(db->object_store->ctx, wal_key, wal_local) != 0) continue; + + int n = tdb_replica_replay_single_wal(db, wal_local, umt, &max_seq); + total_replayed += n; + } + + /* max_seq is the highest seq applied; global_seq is the next seq to assign, so it + * must reach max_seq + 1. comparing max_seq directly leaves a replica that is one + * step behind stuck, global_seq never advances, the read snapshot (global_seq - 1) + * stays below the newest entry, and just-replayed rows stay invisible while the + * replay re-applies the same tail every tick. 
*/ + const uint64_t next_seq = max_seq + 1; + if (next_seq > atomic_load_explicit(&db->global_seq, memory_order_acquire)) + { + atomic_store_explicit(&db->global_seq, next_seq, memory_order_release); + } + + /* the boundary entry at seq == start_max_seq is re-applied every tick (idempotent), so + * total_replayed alone is not progress -- log only when max_seq actually advanced. */ + if (max_seq > start_max_seq) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "%s Replica WAL replay advanced to max_seq=%" PRIu64 " (%d entries, %d WALs)", + cold_start ? "[cold-start]" : "[sync]", max_seq, total_replayed, + discovery.count); + } +} + +/** + * tdb_cf_discovery_ctx_t + * context for cold-start CF discovery from object store + * @param cf_names array of discovered column family names + * @param count number of CFs discovered so far + */ +typedef struct +{ + char cf_names[TDB_MAX_CF_DISCOVERY][TDB_MAX_CF_NAME_LEN]; /* discovered CF names */ + int count; +} tdb_cf_discovery_ctx_t; + +/** + * tdb_cf_discovery_cb + * list callback that extracts CF names from MANIFEST object keys + * @param key object key from the list operation + * @param size object size in bytes (unused) + * @param cb_ctx pointer to tdb_cf_discovery_ctx_t + */ +static void tdb_cf_discovery_cb(const char *key, const size_t size, void *cb_ctx) +{ + (void)size; + tdb_cf_discovery_ctx_t *ctx = (tdb_cf_discovery_ctx_t *)cb_ctx; + + /* we look for MANIFEST files-- "cf_name/MANIFEST" */ + const char *manifest_suffix = "/" TDB_COLUMN_FAMILY_MANIFEST_NAME; + const size_t key_len = strlen(key); + const size_t suffix_len = strlen(manifest_suffix); + + if (key_len > suffix_len && strcmp(key + key_len - suffix_len, manifest_suffix) == 0) + { + if (ctx->count >= TDB_MAX_CF_DISCOVERY) return; + + /* we extract CF name (everything before "/MANIFEST") */ + const size_t cf_len = key_len - suffix_len; + if (cf_len >= TDB_MAX_CF_NAME_LEN) return; + + memcpy(ctx->cf_names[ctx->count], key, cf_len); + ctx->cf_names[ctx->count][cf_len] = 
'\0'; + ctx->count++; + } +} + +/** + * tdb_cold_start_download_arg_t + * thread argument for parallel cold start CF metadata downloads + * @param db database instance + * @param cf_name column family name to download + */ +typedef struct +{ + tidesdb_t *db; + const char *cf_name; +} tdb_cold_start_download_arg_t; + +/** + * tdb_cold_start_download_worker + * download config.ini + MANIFEST for a single CF (runs on a worker thread) + * @param arg pointer to tdb_cold_start_download_arg_t + * @return NULL + */ +static void *tdb_cold_start_download_worker(void *arg) +{ + tdb_cold_start_download_arg_t *ctx = (tdb_cold_start_download_arg_t *)arg; + const tidesdb_t *db = ctx->db; + const char *cf_name = ctx->cf_name; + + /* we create local CF directory (leave room for /config.ini and /MANIFEST suffixes) */ + char cf_dir[TDB_MAX_PATH_LEN - TDB_PATH_SUFFIX_RESERVE]; + snprintf(cf_dir, sizeof(cf_dir), "%s" PATH_SEPARATOR "%s", db->db_path, cf_name); + mkdir(cf_dir, TDB_DIR_PERMISSIONS); + tdb_sync_directory(db->db_path); + + /* we download config.ini */ + char config_key[TDB_MAX_PATH_LEN]; + snprintf(config_key, sizeof(config_key), + "%s/" TDB_COLUMN_FAMILY_CONFIG_NAME TDB_COLUMN_FAMILY_CONFIG_EXT, cf_name); + char config_local[TDB_MAX_PATH_LEN]; + snprintf(config_local, sizeof(config_local), + "%s" PATH_SEPARATOR TDB_COLUMN_FAMILY_CONFIG_NAME TDB_COLUMN_FAMILY_CONFIG_EXT, + cf_dir); + if (db->object_store->get(db->object_store->ctx, config_key, config_local) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Object store cold start failed to download config for CF '%s'", + cf_name); + } + + /* we download MANIFEST */ + char manifest_key[TDB_MAX_PATH_LEN]; + snprintf(manifest_key, sizeof(manifest_key), "%s/" TDB_COLUMN_FAMILY_MANIFEST_NAME, cf_name); + char manifest_local[TDB_MAX_PATH_LEN]; + snprintf(manifest_local, sizeof(manifest_local), + "%s" PATH_SEPARATOR TDB_COLUMN_FAMILY_MANIFEST_NAME, cf_dir); + if (db->object_store->get(db->object_store->ctx, manifest_key, manifest_local) 
!= 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Object store cold start failed to download MANIFEST for CF '%s'", cf_name); + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Object store cold start downloaded config + MANIFEST for CF '%s'", + cf_name); + + return NULL; +} + +/** + * tdb_objstore_cold_start_discover + * on cold start (no local CF directories), discover CFs from the object store + * by listing MANIFEST objects, then download config.ini + MANIFEST for each + * in parallel. the actual sstable data is not downloaded -- it will be fetched + * on demand via tidesdb_sstable_ensure_open when queries arrive. + * @param db database instance with object_store configured + */ +static void tdb_objstore_cold_start_discover(tidesdb_t *db) +{ + if (!db->object_store) return; + + /* we list all objects to find CF names via their MANIFEST files */ + tdb_cf_discovery_ctx_t discovery = {.count = 0}; + int list_rc = + db->object_store->list(db->object_store->ctx, "", tdb_cf_discovery_cb, &discovery); + if (list_rc < 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Object store cold start list failed (rc=%d)", list_rc); + return; + } + + if (discovery.count == 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Object store cold start no CFs found in remote store"); + return; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Object store cold start discovered %d CFs in remote store", + discovery.count); + + /* we download config + MANIFEST for all CFs in parallel */ + tdb_cold_start_download_arg_t args[TDB_MAX_CF_DISCOVERY]; + pthread_t threads[TDB_MAX_CF_DISCOVERY]; + int launched = 0; + + for (int i = 0; i < discovery.count && i < TDB_MAX_CF_DISCOVERY; i++) + { + args[i].db = db; + args[i].cf_name = discovery.cf_names[i]; + if (pthread_create(&threads[launched], NULL, tdb_cold_start_download_worker, &args[i]) == 0) + { + launched++; + } + else + { + /* we fallback to synchronous download if thread creation fails */ + tdb_cold_start_download_worker(&args[i]); + } + } + + for (int i = 0; i < launched; i++) + { + 
pthread_join(threads[i], NULL); + } +} + +/** + * tdb_replica_discover_new_cfs + * discover column families in the object store that do not exist locally + * and create them. uses the same list() + MANIFEST key pattern as cold start + * discovery but runs during periodic replica sync so new CFs created by the + * primary after the replica started are picked up. + * @param db database instance in replica mode + */ +static void tdb_replica_discover_new_cfs(tidesdb_t *db) +{ + if (!db->object_store) return; + + tdb_cf_discovery_ctx_t discovery = {.count = 0}; + int list_rc = -1; + unsigned int backoff_us = TDB_LIST_INITIAL_BACKOFF_US; + + for (int attempt = 0; attempt < TDB_LIST_MAX_RETRIES; attempt++) + { + discovery.count = 0; + list_rc = + db->object_store->list(db->object_store->ctx, "", tdb_cf_discovery_cb, &discovery); + if (list_rc >= 0) break; + + TDB_DEBUG_LOG(TDB_LOG_WARN, "Replica sync object store list attempt %d/%d failed", + attempt + 1, TDB_LIST_MAX_RETRIES); + if (attempt + 1 < TDB_LIST_MAX_RETRIES) usleep(backoff_us); + backoff_us *= 2; + } + + if (list_rc < 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Replica sync object store list failed after %d attempts, skipping discovery", + TDB_LIST_MAX_RETRIES); + return; + } + + /* the primary uploads UNIMAP whenever it adds a cf, so we re-sync it before + * creating any newly discovered cf -- otherwise the replica would assign + * its own index and diverge from the primary's unified wal */ + if (db->unified_mt.enabled) + { + tidesdb_unimap_objstore_pull(db, 1); + tidesdb_unimap_load(db); + } + + for (int i = 0; i < discovery.count; i++) + { + const char *cf_name = discovery.cf_names[i]; + + /* we skip CFs that already exist locally */ + pthread_rwlock_rdlock(&db->cf_list_lock); + const tidesdb_column_family_t *existing = tidesdb_get_column_family_internal(db, cf_name); + pthread_rwlock_unlock(&db->cf_list_lock); + if (existing) continue; + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Replica sync discovered new CF '%s' in 
object store", cf_name); + + /* we download config.ini */ + char cf_dir[TDB_MAX_PATH_LEN]; + snprintf(cf_dir, sizeof(cf_dir), "%s" PATH_SEPARATOR "%s", db->db_path, cf_name); + mkdir(cf_dir, TDB_DIR_PERMISSIONS); + tdb_sync_directory(db->db_path); + + char config_key[TDB_MAX_PATH_LEN]; + snprintf(config_key, sizeof(config_key), + "%s/" TDB_COLUMN_FAMILY_CONFIG_NAME TDB_COLUMN_FAMILY_CONFIG_EXT, cf_name); + char config_local[TDB_MAX_PATH_LEN]; +#ifndef _MSC_VER +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-truncation" +#endif + snprintf(config_local, sizeof(config_local), + "%s" PATH_SEPARATOR TDB_COLUMN_FAMILY_CONFIG_NAME TDB_COLUMN_FAMILY_CONFIG_EXT, + cf_dir); +#ifndef _MSC_VER +#pragma GCC diagnostic pop +#endif + + tidesdb_column_family_config_t cf_config = tidesdb_default_column_family_config(); + if (db->object_store->get(db->object_store->ctx, config_key, config_local) == 0) + { + tidesdb_cf_config_load_from_ini(config_local, cf_name, &cf_config); + } + + /* we download MANIFEST so the sync loop can process it */ + char manifest_key[TDB_MAX_PATH_LEN]; + snprintf(manifest_key, sizeof(manifest_key), "%s/" TDB_COLUMN_FAMILY_MANIFEST_NAME, + cf_name); + char manifest_local[TDB_MAX_PATH_LEN]; +#ifndef _MSC_VER +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-truncation" +#endif + snprintf(manifest_local, sizeof(manifest_local), + "%s" PATH_SEPARATOR TDB_COLUMN_FAMILY_MANIFEST_NAME, cf_dir); +#ifndef _MSC_VER +#pragma GCC diagnostic pop +#endif + db->object_store->get(db->object_store->ctx, manifest_key, manifest_local); + + /* we temporarily clear replica_mode so tidesdb_create_column_family + * does not reject the call with TDB_ERR_READONLY. this is safe because + * we are the reaper thread creating a CF that the primary already wrote + * to the object store, not a user-initiated write. 
*/ + int was_replica = atomic_exchange(&db->replica_mode, 0); + int rc = tidesdb_create_column_family(db, cf_name, &cf_config); + if (was_replica) atomic_store(&db->replica_mode, 1); + + if (rc == TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Replica sync created new CF '%s'", cf_name); + } + else if (rc != TDB_ERR_EXISTS) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Replica sync failed to create CF '%s' (err=%d)", cf_name, + rc); + } + } +} + +/** + * tdb_objstore_delete_listed_cb + * list callback that deletes each enumerated object during CF drop + * @param key object key to delete + * @param size object size in bytes (unused) + * @param cb_ctx pointer to tidesdb_objstore_t connector + */ +static void tdb_objstore_delete_listed_cb(const char *key, const size_t size, void *cb_ctx) +{ + (void)size; + const tidesdb_objstore_t *store = (tidesdb_objstore_t *)cb_ctx; + + /* retry with backoff and log on exhaustion, mirroring tdb_objstore_delete_file -- a + * single ignored delete during CF drop silently leaves orphaned remote objects */ + unsigned int delay_us = TDB_UPLOAD_INITIAL_BACKOFF_US; + for (int attempt = 0; attempt < TDB_UPLOAD_MAX_RETRIES; attempt++) + { + if (store->delete_object(store->ctx, key) == 0) return; + + TDB_DEBUG_LOG(TDB_LOG_WARN, "Object store delete attempt %d/%d failed during drop: %s", + attempt + 1, TDB_UPLOAD_MAX_RETRIES, key); + if (attempt + 1 < TDB_UPLOAD_MAX_RETRIES) usleep(delay_us); + delay_us *= TDB_UPLOAD_BACKOFF_MULTIPLIER; + } + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Object store delete failed after %d attempts during drop: %s", + TDB_UPLOAD_MAX_RETRIES, key); +} + +/** + * tidesdb_sstable_ensure_klog_open + * ensures an sstable's klog block manager is open. num_open_sstables is keyed on the + * klog, it is incremented here when the klog transitions closed->open and decremented + * when the reaper (or a cleanup path) closes the klog. 
the vlog is opened lazily and is + * not separately counted, so a scan that touches only inline values holds one fd per + * pinned sstable instead of two. + * @param db database instance + * @param sst sstable whose klog to ensure open + * @return 0 on success, -1 on error + */ +static int tidesdb_sstable_ensure_klog_open(tidesdb_t *db, tidesdb_sstable_t *sst) +{ + if (!db || !sst) return -1; + if (!sst->config || !sst->klog_path) return -1; + + if (sst->klog_bm) + { + atomic_store(&sst->last_access_time, atomic_load(&db->cached_current_time)); + return 0; /* already open */ + } + + if (db->object_store) + { + if (tdb_objstore_download_if_missing(db, sst->klog_path) != 0) return -1; + } + + block_manager_t *new_klog_bm = NULL; + if (tidesdb_bm_open(db, &new_klog_bm, sst->klog_path, + convert_sync_mode(sst->config->sync_mode == TDB_SYNC_INTERVAL + ? TDB_SYNC_FULL + : sst->config->sync_mode)) != 0) + { + if (tdb_log_throttle(db, &db->last_open_fail_log_sec, + TDB_BACKPRESSURE_STALL_LOG_INTERVAL_SEC)) + TDB_DEBUG_LOG(TDB_LOG_ERROR, "failed to open sstable klog '%s': %s%s", sst->klog_path, + strerror(errno), + (errno == EMFILE || errno == ENFILE) + ? " -- open-file limit reached; raise ulimit -n" + : ""); + return -1; + } + + /* CAS to set klog_bm -- if another thread already set it, close ours and let that + * thread's CAS win own the num_open_sstables increment (exactly one inc per open) */ + block_manager_t *expected = NULL; + if (!atomic_compare_exchange_strong(&sst->klog_bm, &expected, new_klog_bm)) + { + block_manager_close(new_klog_bm); + return 0; + } + + atomic_store(&sst->last_access_time, atomic_load(&db->cached_current_time)); + atomic_fetch_add(&db->num_open_sstables, 1); + return 0; +} + +/** + * tidesdb_sstable_ensure_vlog_open + * ensures an sstable's vlog block manager is open. 
the vlog is opened lazily on the first + * value read that misses the inline klog payload; it is not counted in num_open_sstables + * (see tidesdb_sstable_ensure_klog_open) and is closed alongside the klog by the reaper. + * @param db database instance + * @param sst sstable whose vlog to ensure open + * @return 0 on success, -1 on error + */ +static int tidesdb_sstable_ensure_vlog_open(tidesdb_t *db, tidesdb_sstable_t *sst) +{ + if (!db || !sst) return -1; + if (!sst->config || !sst->vlog_path) return -1; + + if (sst->vlog_bm) return 0; /* already open */ + + if (db->object_store) + { + if (tdb_objstore_download_if_missing(db, sst->vlog_path) != 0) return -1; + } + + block_manager_t *new_vlog_bm = NULL; + if (tidesdb_bm_open(db, &new_vlog_bm, sst->vlog_path, + convert_sync_mode(sst->config->sync_mode)) != 0) + { + if (tdb_log_throttle(db, &db->last_open_fail_log_sec, + TDB_BACKPRESSURE_STALL_LOG_INTERVAL_SEC)) + TDB_DEBUG_LOG(TDB_LOG_ERROR, "failed to open sstable vlog '%s': %s%s", sst->vlog_path, + strerror(errno), + (errno == EMFILE || errno == ENFILE) + ? " -- open-file limit reached; raise ulimit -n" + : ""); + return -1; + } + + /* we hint that vlog access is random (point lookups by offset) + * this disables read-ahead which would waste I/O for random access */ + set_file_random_hint(new_vlog_bm->fd); + + /* CAS to set vlog_bm -- if another thread already set it, we close ours */ + block_manager_t *expected = NULL; + if (!atomic_compare_exchange_strong(&sst->vlog_bm, &expected, new_vlog_bm)) + { + block_manager_close(new_vlog_bm); + } + + return 0; +} + +/** + * tidesdb_sstable_ensure_open + * ensures both block managers are open. used by write/flush/compaction/btree paths that + * need the vlog eagerly; scan sources open the klog only (tidesdb_sstable_ensure_klog_open) + * and let tidesdb_vlog_read_value open the vlog on demand. 
+ * @param db database instance + * @param sst sstable to ensure is open + * @return 0 on success, -1 on error + */ +static int tidesdb_sstable_ensure_open(tidesdb_t *db, tidesdb_sstable_t *sst) +{ + if (tidesdb_sstable_ensure_klog_open(db, sst) != 0) return -1; + if (tidesdb_sstable_ensure_vlog_open(db, sst) != 0) return -1; + return 0; +} + +/** + * tidesdb_sstable_create + * create a new sstable + * @param db database instance + * @param base_path base path for sstable files + * @param id sstable id + * @param config column family configuration + * @return sstable on success, NULL on failure + */ +static tidesdb_sstable_t *tidesdb_sstable_create(tidesdb_t *db, const char *base_path, + const uint64_t id, + const tidesdb_column_family_config_t *config) +{ + if (!db || !base_path || !config) return NULL; + + tidesdb_sstable_t *sst = calloc(1, sizeof(tidesdb_sstable_t)); + if (!sst) return NULL; + + sst->db = db; + sst->config = malloc(sizeof(tidesdb_column_family_config_t)); + if (!sst->config) + { + free(sst); + return NULL; + } + memcpy(sst->config, config, sizeof(tidesdb_column_family_config_t)); + + sst->id = id; + atomic_init(&sst->refcount, 1); + sst->num_klog_blocks = 0; + sst->num_vlog_blocks = 0; + sst->klog_data_end_offset = 0; + atomic_init(&sst->marked_for_deletion, 0); + atomic_init(&sst->last_access_time, 0); + sst->klog_bm = NULL; + sst->vlog_bm = NULL; + sst->use_btree = config->use_btree; + + /* we cache resolved comparator on the sstable to avoid per-lookup resolution */ + sst->cached_comparator_fn = NULL; + sst->cached_comparator_ctx = NULL; + sst->is_reverse = 0; + tidesdb_resolve_comparator(db, config, &sst->cached_comparator_fn, &sst->cached_comparator_ctx); + + const size_t path_len = strlen(base_path) + TDB_PATH_SUFFIX_RESERVE; + sst->klog_path = malloc(path_len); + sst->vlog_path = malloc(path_len); + + if (!sst->klog_path || !sst->vlog_path) + { + free(sst->klog_path); + free(sst->vlog_path); + free(sst->config); + free(sst); + return 
NULL; + } + + snprintf(sst->klog_path, path_len, "%s_" TDB_U64_FMT TDB_SSTABLE_KLOG_EXT, base_path, + TDB_U64_CAST(id)); + snprintf(sst->vlog_path, path_len, "%s_" TDB_U64_FMT TDB_SSTABLE_VLOG_EXT, base_path, + TDB_U64_CAST(id)); + + /* we use XXH64 of the klog path as the btree node cache key prefix. + * this is globally unique across CFs (includes CF directory + sstable id), + * unlike sst->id which is per-CF and can collide in the shared node cache. */ + sst->cache_key_prefix = XXH64(sst->klog_path, strlen(sst->klog_path), 0); + + /* we cache CF name from path to avoid repeated parsing during reads */ + if (tidesdb_get_cf_name_from_path(sst->klog_path, sst->cf_name) != 0) + { + sst->cf_name[0] = '\0'; /* fall back to empty string if extraction fails */ + } + + /* we cache filename pointer into klog_path to avoid strrchr on every read */ + { + const char *last_fwd = strrchr(sst->klog_path, '/'); + const char *last_back = strrchr(sst->klog_path, '\\'); + const char *last_sep = (last_fwd > last_back) ? last_fwd : last_back; + sst->klog_filename = last_sep ? last_sep + 1 : sst->klog_path; + } + + return sst; +} + +/** + * tidesdb_invalidate_btree_cache_for_sstable + * invalidate all btree node cache entries for a specific sstable. the prefix is built + * via btree_format_cache_key_prefix off sst->cache_key_prefix so it matches exactly + * what btree_node_read_cached produces; using sst->id here would never match because + * the producer encodes a different value (XXH64 of klog_path) in a different base (hex). 
+ * @param db the database + * @param sst the sstable being freed + */ +static void tidesdb_invalidate_btree_cache_for_sstable(tidesdb_t *db, const tidesdb_sstable_t *sst) +{ + if (!db || !db->btree_node_cache || !sst) return; + + char prefix[BTREE_CACHE_KEY_SIZE]; + const int prefix_len = btree_format_cache_key_prefix(sst->cache_key_prefix, prefix); + clock_cache_delete_by_prefix(db->btree_node_cache, prefix, (size_t)prefix_len); +} + +/** + * tidesdb_invalidate_block_cache_for_cf + * invalidate all block cache entries for a column family + * @param db the database + * @param cf_name column family name + */ +static void tidesdb_invalidate_block_cache_for_cf(tidesdb_t *db, const char *cf_name) +{ + if (!db || !db->clock_cache || !cf_name) return; + + char prefix[TDB_MAX_CF_NAME_LEN + 2]; + const int prefix_len = snprintf(prefix, sizeof(prefix), "%s:", cf_name); + if (prefix_len <= 0 || (size_t)prefix_len >= sizeof(prefix)) return; + + clock_cache_delete_by_prefix(db->clock_cache, prefix, (size_t)prefix_len); +} + +/** + * tidesdb_sstable_free + * free an sstable + * @param sst sstable to free + */ +static void tidesdb_sstable_free(tidesdb_sstable_t *sst) +{ + if (!sst) return; + + /* we invalidate btree node cache entries for this sstable before freeing */ + if (sst->use_btree && sst->db && sst->db->btree_node_cache) + { + tidesdb_invalidate_btree_cache_for_sstable(sst->db, sst); + } + + /* we skip eager block cache invalidation here. the cache entries for this + * sstable are already dead -- klog filenames include the monotonic SST ID + * so no future lookup will construct their cache key. the clock sweep + * reclaims them naturally when it needs space (dead entries have no readers + * so their ref_bit stays clear, making them the first eviction victims). + * removing the O(total_slots) prefix scan eliminates atomic contention + * between compaction and concurrent iterators on the cache ref_bit. 
*/ + + /* if marked for deletion, evict file data from page cache before closing + * this prevents cache pollution from compacted-away sstables */ + if (atomic_load_explicit(&sst->marked_for_deletion, memory_order_acquire)) + { + if (sst->klog_bm) + { + evict_file_region(sst->klog_bm->fd, 0, 0); + } + if (sst->vlog_bm) + { + evict_file_region(sst->vlog_bm->fd, 0, 0); + } + } + + { + /* num_open_sstables is keyed on the klog (the vlog is opened lazily and not + * separately counted), so the decrement fires iff the klog was open */ + const int had_open_bms = (sst->klog_bm != NULL); + if (sst->klog_bm) + { + block_manager_close(sst->klog_bm); + sst->klog_bm = NULL; + } + if (sst->vlog_bm) + { + block_manager_close(sst->vlog_bm); + sst->vlog_bm = NULL; + } + if (had_open_bms && sst->db) + { + atomic_fetch_sub(&sst->db->num_open_sstables, 1); + } + } + + /* we delete files only when refcount reaches 0 + * this ensures active transactions can still read from old sstables + * during compaction */ + if (atomic_load_explicit(&sst->marked_for_deletion, memory_order_acquire)) + { + /* we delete from object store before local unlink */ + if (sst->db && sst->db->object_store) + { + tdb_objstore_delete_file(sst->db, sst->klog_path); + tdb_objstore_delete_file(sst->db, sst->vlog_path); + } + if (sst->db && sst->db->local_cache) + { + tdb_local_cache_remove(sst->db->local_cache, sst->klog_path); + tdb_local_cache_remove(sst->db->local_cache, sst->vlog_path); + } + tdb_unlink(sst->klog_path); + tdb_unlink(sst->vlog_path); + + /* we sync the parent directory to persist the unlink operations */ + if (sst->klog_path) + { + char dir_buf[TDB_MAX_PATH_LEN]; + strncpy(dir_buf, sst->klog_path, sizeof(dir_buf) - 1); + dir_buf[sizeof(dir_buf) - 1] = '\0'; + char *sep = strrchr(dir_buf, '/'); +#ifdef _WIN32 + if (!sep) sep = strrchr(dir_buf, '\\'); +#endif + if (sep) + { + *sep = '\0'; + tdb_sync_directory(dir_buf); + } + } + } + + free(sst->klog_path); + free(sst->vlog_path); + 
free(sst->min_key); + free(sst->max_key); + free(sst->config); + + if (sst->bloom_filter) bloom_filter_free(sst->bloom_filter); + if (sst->block_indexes) compact_block_index_free(sst->block_indexes); + + free(sst); +} + +/** + * tidesdb_sstable_ref + * increment reference count of an sstable + * @param sst sstable to reference + */ +static void tidesdb_sstable_ref(tidesdb_sstable_t *sst) +{ + if (sst) + { + atomic_fetch_add(&sst->refcount, 1); + } +} + +/** + * tidesdb_sstable_try_ref + * try to increment reference count of an sstable using CAS + * this is safe to call on an sstable that might be concurrently freed + * @param sst sstable to reference + * @return 1 if reference was acquired, 0 if sstable is being freed (refcount was 0) + */ +static int tidesdb_sstable_try_ref(tidesdb_sstable_t *sst) +{ + if (!sst) return 0; + + /* we use CAS loop to only increment if refcount > 0 + * if refcount is 0, the sstable is being freed and we must not touch it + * if refcount < 0 (TDB_REFCOUNT_EVICTING), the reaper is closing block + * managers -- we spin briefly until it finishes and restores refcount */ + int old_refcount = atomic_load_explicit(&sst->refcount, memory_order_acquire); + int evict_spins = 0; + for (;;) + { + if (old_refcount > 0) + { + if (atomic_compare_exchange_weak_explicit(&sst->refcount, &old_refcount, + old_refcount + 1, memory_order_acq_rel, + memory_order_acquire)) + { + return 1; /* successfully acquired reference */ + } + /* CAS failed, old_refcount was updated, continue loop */ + } + else if (old_refcount == 0) + { + return 0; /* refcount was 0, sstable is being freed */ + } + else + { + /* reaper is closing a still-live sstable's block managers + * and restores the refcount when done. wait it out with escalating backoff -- + * returning 0 here is indistinguishable from a freed sstable and would make a + * reader skip a present sstable (false NOT_FOUND). bounded close is microseconds. 
*/ + if (++evict_spins < TDB_REFCOUNT_DRAIN_SPIN_THRESHOLD) + cpu_pause(); + else if (evict_spins < TDB_REFCOUNT_DRAIN_YIELD_THRESHOLD) + cpu_yield(); + else if (evict_spins < TDB_EVICT_WAIT_MAX) + usleep(TDB_REFCOUNT_DRAIN_SLEEP_US); + else + return 0; /* reaper stuck far past any close -- caller backs off and retries */ + old_refcount = atomic_load_explicit(&sst->refcount, memory_order_acquire); + } + } +} + +/** + * tidesdb_sstable_unref + * decrement reference count of an sstable + * @param db database instance + * @param sst sstable to unreference + */ +static void tidesdb_sstable_unref(const tidesdb_t *db, tidesdb_sstable_t *sst) +{ + (void)db; + if (!sst) return; + const int old_refcount = atomic_fetch_sub(&sst->refcount, 1); + if (old_refcount == 1) + { + tidesdb_sstable_free(sst); + } +} + +/** + * tidesdb_flush_memtable_internal + * rotates the active memtable and enqueues the old one for flush to disk + * @param cf column family + * @param already_holds_lock 1 if caller already holds is_flushing lock + * @param force 1 to flush regardless of size threshold + * @return TDB_SUCCESS or error code + */ +static int tidesdb_flush_memtable_internal(tidesdb_column_family_t *cf, int already_holds_lock, + int force); + +/** + * tidesdb_write_set_hash_t + * hash table for O(1) write set lookups in large transactions + * uses open addressing with linear probing for cache locality + * @param slots maps hash -> ops index, -1 if empty + * @param capacity always TDB_WRITE_SET_HASH_CAPACITY + */ +typedef struct +{ + int *slots; + int capacity; +} tidesdb_write_set_hash_t; + +/** + * tidesdb_write_set_hash_create + * create hash table for write set + * @return hash table on success, NULL on failure + */ +static tidesdb_write_set_hash_t *tidesdb_write_set_hash_create(void) +{ + tidesdb_write_set_hash_t *hash = malloc(sizeof(tidesdb_write_set_hash_t)); + if (!hash) return NULL; + + hash->capacity = TDB_WRITE_SET_HASH_CAPACITY; + hash->slots = malloc(hash->capacity * 
sizeof(int)); + if (!hash->slots) + { + free(hash); + return NULL; + } + + for (int i = 0; i < hash->capacity; i++) + { + hash->slots[i] = TDB_WRITE_SET_HASH_EMPTY; + } + + return hash; +} + +/** + * tidesdb_write_set_hash_free + * free hash table + */ +static void tidesdb_write_set_hash_free(tidesdb_write_set_hash_t *hash) +{ + if (!hash) return; + free(hash->slots); + free(hash); +} + +/** + * tidesdb_write_set_hash_key + * compute hash for key+cf combination using xxhash + * @param cf column family + * @param key key + * @param key_size key size + * @return hash value + */ +static uint32_t tidesdb_write_set_hash_key(tidesdb_column_family_t *cf, const uint8_t *key, + const size_t key_size) +{ + /* we mix CF pointer into seed for better distribution across CFs */ + const uint64_t seed = TDB_TXN_HASH_SEED ^ (uint64_t)(uintptr_t)cf; + return (uint32_t)XXH64(key, key_size, seed); +} + +/** + * tidesdb_write_set_hash_insert + * insert operation index into hash table + * overwrites existing entry for same key (keeps newest) + * @param hash hash table + * @param txn transaction + * @param op_index operation index + */ +static void tidesdb_write_set_hash_insert(tidesdb_write_set_hash_t *hash, const tidesdb_txn_t *txn, + const int op_index) +{ + if (!hash || op_index < 0 || op_index >= txn->num_ops) return; + + const tidesdb_txn_op_t *op = &txn->ops[op_index]; + const uint32_t h = tidesdb_write_set_hash_key(op->cf, op->key, op->key_size); + int slot = (int)(h % (uint32_t)hash->capacity); + + /* we utilize linear probing to find empty slot or matching key */ + for (int probe = 0; probe < TDB_TXN_MAX_PROBE_LENGTH; probe++) + { + const int existing_idx = hash->slots[slot]; + + if (existing_idx == TDB_WRITE_SET_HASH_EMPTY) + { + /* empty slot, insert here */ + hash->slots[slot] = op_index; + return; + } + + /* we check if this slot has the same key (update case) */ + const tidesdb_txn_op_t *existing = &txn->ops[existing_idx]; + if (existing->cf == op->cf && existing->key_size 
== op->key_size && + memcmp(existing->key, op->key, op->key_size) == 0) + { + /* same key, we update to newer operation */ + hash->slots[slot] = op_index; + return; + } + + /* collision, try next slot */ + slot = (slot + 1) % hash->capacity; + } + /* probe limit exceeded--hash table may be too full, but continue without hash */ +} + +/** + * tidesdb_write_set_hash_lookup + * find operation index for given key+cf + * @param hash hash table + * @param txn transaction + * @param cf column family + * @param key key + * @param key_size key size + * @return operation index if found, -1 if not found + */ +static int tidesdb_write_set_hash_lookup(tidesdb_write_set_hash_t *hash, const tidesdb_txn_t *txn, + tidesdb_column_family_t *cf, const uint8_t *key, + const size_t key_size) +{ + if (!hash) return -1; + + const uint32_t h = tidesdb_write_set_hash_key(cf, key, key_size); + int slot = (int)(h % (uint32_t)hash->capacity); + + /* we utilize linear probing to find key */ + for (int probe = 0; probe < TDB_TXN_MAX_PROBE_LENGTH; probe++) + { + const int op_index = hash->slots[slot]; + + if (op_index == TDB_WRITE_SET_HASH_EMPTY) + { + /* empty slot means key not in hash */ + return -1; + } + + const tidesdb_txn_op_t *op = &txn->ops[op_index]; + if (op->cf == cf && op->key_size == key_size && memcmp(op->key, key, key_size) == 0) + { + /* found it */ + return op_index; + } + + /* collision, we try next slot */ + slot = (slot + 1) % hash->capacity; + } + + /* probe limit exceeded--assume not found */ + return -1; +} + +/** + * tidesdb_read_set_hash_t + * hash table for O(1) read set lookups in SSI conflict detection + * uses xxhash for better distribution and larger capacity for fewer collisions + * @param slots maps hash -> read_set index, -1 if empty + * @param capacity always TDB_READ_SET_HASH_CAPACITY + */ +typedef struct +{ + int *slots; + int capacity; +} tidesdb_read_set_hash_t; + +/** + * tidesdb_read_set_hash_create + * create hash table for read set + */ +static 
tidesdb_read_set_hash_t *tidesdb_read_set_hash_create(void) +{ + tidesdb_read_set_hash_t *hash = malloc(sizeof(tidesdb_read_set_hash_t)); + if (!hash) return NULL; + + hash->capacity = TDB_READ_SET_HASH_CAPACITY; + hash->slots = malloc(hash->capacity * sizeof(int)); + if (!hash->slots) + { + free(hash); + return NULL; + } + + for (int i = 0; i < hash->capacity; i++) + { + hash->slots[i] = TDB_READ_SET_HASH_EMPTY; + } + + return hash; +} + +/** + * tidesdb_read_set_hash_free + * free hash table + * @param hash hash table to free + */ +static void tidesdb_read_set_hash_free(tidesdb_read_set_hash_t *hash) +{ + if (!hash) return; + free(hash->slots); + free(hash); +} + +/** + * tidesdb_read_set_hash_key + * compute hash for key+cf combination using xxhash + * @param cf column family + * @param key key + * @param key_size key size + * @return hash value + */ +static uint32_t tidesdb_read_set_hash_key(tidesdb_column_family_t *cf, const uint8_t *key, + const size_t key_size) +{ + /* mix CF pointer into seed for better distribution across CFs */ + const uint64_t seed = TDB_TXN_HASH_SEED ^ (uint64_t)(uintptr_t)cf; + return (uint32_t)XXH64(key, key_size, seed); +} + +/** + * tidesdb_read_set_hash_insert + * insert read set index into hash table + * @param hash hash table + * @param txn transaction + * @param read_index read set index + */ +static void tidesdb_read_set_hash_insert(tidesdb_read_set_hash_t *hash, const tidesdb_txn_t *txn, + const int read_index) +{ + if (!hash || read_index < 0 || read_index >= txn->read_set_count) return; + + const uint32_t h = tidesdb_read_set_hash_key( + txn->read_cfs[read_index], txn->read_keys[read_index], txn->read_key_sizes[read_index]); + int slot = (int)(h % (uint32_t)hash->capacity); + + /* linear probing to find empty slot or matching key */ + for (int probe = 0; probe < TDB_TXN_MAX_PROBE_LENGTH; probe++) + { + const int existing_idx = hash->slots[slot]; + + if (existing_idx == TDB_READ_SET_HASH_EMPTY) + { + /* empty slot, insert 
here */ + hash->slots[slot] = read_index; + return; + } + + /* we check if this slot has the same key (update case) */ + if (txn->read_cfs[existing_idx] == txn->read_cfs[read_index] && + txn->read_key_sizes[existing_idx] == txn->read_key_sizes[read_index] && + memcmp(txn->read_keys[existing_idx], txn->read_keys[read_index], + txn->read_key_sizes[read_index]) == 0) + { + /* same key,we update to newer read */ + hash->slots[slot] = read_index; + return; + } + + /* collision, we try next slot */ + slot = (slot + 1) % hash->capacity; + } + /* probe limit exceeded -- hash table may be too full, but continue without hash */ +} + +/** + * tidesdb_read_set_hash_check_conflict + * check if a write key conflicts with any read in the hash table + * @param hash hash table + * @param txn transaction + * @param cf column family + * @param key key + * @param key_size key size + * @return 1 if conflict found, 0 otherwise + */ +static int tidesdb_read_set_hash_check_conflict(tidesdb_read_set_hash_t *hash, + const tidesdb_txn_t *txn, + tidesdb_column_family_t *cf, const uint8_t *key, + const size_t key_size) +{ + if (!hash) return 0; + + if (txn == NULL || cf == NULL || key == NULL || key_size == 0) return 0; + + const uint32_t h = tidesdb_read_set_hash_key(cf, key, key_size); + int slot = (int)(h % (uint32_t)hash->capacity); + + /* we use linear probing to find key */ + for (int probe = 0; probe < TDB_TXN_MAX_PROBE_LENGTH; probe++) + { + const int read_index = hash->slots[slot]; + + if (read_index == TDB_READ_SET_HASH_EMPTY) + { + /* empty slot means key not in hash */ + return 0; + } + + if (txn->read_cfs[read_index] == cf && txn->read_key_sizes[read_index] == key_size && + memcmp(txn->read_keys[read_index], key, key_size) == 0) + { + /* found conflict */ + return 1; + } + + /* collision, we try next slot */ + slot = (slot + 1) % hash->capacity; + } + + /* probe limit exceeded -- assume no conflict (conservative) */ + return 0; +} + +/** + * tidesdb_immutable_memtable_ref + * 
increment reference count of an immutable memtable + * @param imm immutable memtable to reference + */ +static void tidesdb_immutable_memtable_ref(tidesdb_immutable_memtable_t *imm) +{ + if (imm) atomic_fetch_add(&imm->refcount, 1); +} + +/** + * tidesdb_immutable_memtable_try_ref + * try to increment reference count using CAS -- fails if refcount is 0 + * this prevents resurrecting an immutable whose cleanup has already been claimed + * @param imm immutable memtable to reference + * @return 1 if reference was acquired, 0 if refcount was 0 (claimed for cleanup) + */ +static int tidesdb_immutable_memtable_try_ref(tidesdb_immutable_memtable_t *imm) +{ + if (!imm) return 0; + + int old = atomic_load_explicit(&imm->refcount, memory_order_acquire); + for (;;) + { + if (old <= 0) return 0; + if (atomic_compare_exchange_weak_explicit(&imm->refcount, &old, old + 1, + memory_order_acq_rel, memory_order_acquire)) + { + return 1; + } + } +} + +/** + * tidesdb_immutable_memtable_unref + * decrement reference count of an immutable memtable + * @param imm immutable memtable to unreference + */ +static void tidesdb_immutable_memtable_unref(tidesdb_immutable_memtable_t *imm) +{ + if (!imm) return; + if (atomic_fetch_sub(&imm->refcount, 1) == 1) + { + skip_list_t *memtable_to_free = imm->skip_list; + if (imm->wal) block_manager_close(imm->wal); + free(imm); + + if (memtable_to_free) + { + skip_list_free(memtable_to_free); + } + } +} + +/** + * tidesdb_memtable_try_ref + * try to increment reference count of a memtable using CAS + * this is safe to call on a memtable that might be concurrently freed + * (e.g. 
the active memtable which can rotate to immutable and get cleaned up) + * @param mt memtable to reference + * @return 1 if reference was acquired, 0 if memtable is being freed (refcount was 0) + */ +static int tidesdb_memtable_try_ref(tidesdb_memtable_t *mt) +{ + if (!mt) return 0; + + int old = atomic_load_explicit(&mt->refcount, memory_order_acquire); + for (;;) + { + if (old <= 0) return 0; /* being freed or already freed */ + if (atomic_compare_exchange_weak_explicit(&mt->refcount, &old, old + 1, + memory_order_acq_rel, memory_order_acquire)) + { + return 1; + } + /* CAS failed, old was updated by the CAS, retry */ + } +} + +/** + * tidesdb_active_memtable_try_ref + * pinned acquire of the active memtable slot. the reader bumps the per-slot + * reader epoch, loads the slot, then try_ref's the loaded pointer. the epoch + * is dropped immediately after the try_ref outcome is known -- if try_ref + * succeeded the caller now holds a refcount ref so the struct cannot be freed + * out from under it, and if it failed the caller never touches the struct + * again. the immutable-cleanup loop drains this epoch to 0 before free()ing + * memtable structs, which closes the load/try_ref window that otherwise leaks + * a UAF on mt->refcount when cf->active_memtable's old target has been + * rotated to immutable, flushed, and unref'd to 0 in between the load and + * the try_ref. mirrors the imm_snap_t.readers epoch but for the direct read + * path through the active slot. 
+ * @param epoch the per-slot reader epoch counter (cf->active_mt_readers or
+ * db->unified_mt.active_mt_readers)
+ * @param slot the atomic memtable pointer (&cf->active_memtable or
+ * &db->unified_mt.active)
+ * @param out_mt receives the pinned memtable on success, NULL on failure
+ * @return 1 if a memtable was pinned, 0 if the slot was empty or the loaded
+ * memtable had already been claimed for cleanup
+ */
+static int tidesdb_active_memtable_try_ref(_Atomic(int) *epoch, _Atomic(tidesdb_memtable_t *) *slot,
+                                           tidesdb_memtable_t **out_mt)
+{
+    atomic_fetch_add_explicit(epoch, 1, memory_order_acq_rel);
+    /* StoreLoad fence pairs with the cleanup drain's matching seq_cst fence.
+     * RMWs are full barriers on x86 but acq_rel RMW on aarch64/ppc is not, so
+     * the explicit fence is required for portability.
+     * the epoch is only held across the slot load and the try_ref below; it is
+     * dropped again before returning, so a successful caller relies on the
+     * reference taken by tidesdb_memtable_try_ref, not on the epoch */
+    atomic_thread_fence(memory_order_seq_cst);
+    tidesdb_memtable_t *mt = atomic_load_explicit(slot, memory_order_acquire);
+    int ok = mt ? tidesdb_memtable_try_ref(mt) : 0;
+    atomic_fetch_sub_explicit(epoch, 1, memory_order_release);
+    *out_mt = ok ? mt : NULL;
+    return ok;
+}
+
+/**
+ * tidesdb_imm_snap_publish_locked
+ * rebuild the lock-free immutable snapshot from the current queue contents
+ * uses double-buffered RCU: wait for leftover readers of the inactive slot to
+ * drain, rebuild that slot from the queue (leaving out immutables already marked
+ * flushed), then swap the active index. the swap is non-blocking -- readers of
+ * the previously active slot are not waited for here and that slot is not
+ * cleared; a caller about to free items that were in the old snapshot must call
+ * tidesdb_imm_snap_drain_previous first
+ *
+ * must be called after every enqueue/dequeue on cf->immutable_memtables.
+ * the caller should already be in a context where the queue is stable
+ * (e.g. after queue_enqueue returns, or after queue_dequeue returns).
+ *
+ * the caller must hold cf->imm_snap_publish_lock -- the RCU scheme has a single
+ * inactive slot, so two concurrent publishers would rebuild the same slot's
+ * items[] array at once and produce a torn snapshot.
+ *
+ * if growing the slot's items[] array fails, the published snapshot is limited
+ * to the slot's existing capacity (a warning is logged)
+ *
+ * @param cf column family whose snapshot to publish
+ */
+static void tidesdb_imm_snap_publish_locked(tidesdb_column_family_t *cf)
+{
+    const int active = atomic_load_explicit(&cf->imm_snap_active, memory_order_acquire);
+    const int next_idx = 1 - active;
+    tidesdb_imm_snap_t *next = &cf->imm_snaps[next_idx];
+
+    /* we wait for the inactive slot's readers to drain before overwriting it
+     * these are leftover readers from the previous publish's swap --
+     * almost always 0 since readers are brief (single GET/iter search).
+     * pause for the first TDB_IMM_SNAP_ACQUIRE_SPIN_LIMIT iterations, then yield */
+    int spins = 0;
+    while (atomic_load_explicit(&next->readers, memory_order_acquire) > 0)
+    {
+        if (spins < TDB_IMM_SNAP_ACQUIRE_SPIN_LIMIT)
+            cpu_pause();
+        else
+            cpu_yield();
+        spins++;
+    }
+
+    /* grow the inactive slot's array to fit the whole queue before snapshotting.
+     * we drained this slot's readers just above, so no reader can be indexing
+     * next->items -- the realloc here is safe. grows only, never shrinks. this is
+     * what lets the immutable queue honor any configured stall threshold without a
+     * fixed-size ceiling silently truncating the reader view.
+     * capacity doubles (starting from 1) until it covers the queue size */
+    size_t need = queue_size(cf->immutable_memtables);
+    if (need > next->cap)
+    {
+        size_t new_cap = next->cap ? next->cap : 1;
+        while (new_cap < need) new_cap *= 2;
+        tidesdb_memtable_t **grown = realloc(next->items, new_cap * sizeof(tidesdb_memtable_t *));
+        if (grown)
+        {
+            next->items = grown;
+            next->cap = new_cap;
+        }
+        else
+        {
+            /* OOM -- keep the smaller array; the snapshot truncates to next->cap.
+             * only reachable under memory pressure, and self-heals on the next publish.
+             * realloc failure leaves next->items and next->cap untouched */
+            TDB_DEBUG_LOG(TDB_LOG_WARN,
+                          "CF '%s' could not grow immutable snapshot to %zu, truncating to %zu",
+                          cf->name, need, next->cap);
+        }
+    }
+
+    /* we rebuild snapshot in the inactive slot from the queue
+     * no refs needed -- the RCU mechanism guarantees items are valid while any
+     * reader holds the slot. the queue itself holds the base ref on each item.
+     * queue_snapshot is handed next->cap as the output capacity */
+    size_t raw = queue_snapshot(cf->immutable_memtables, (void **)next->items, next->cap);
+
+    /* drop already-flushed immutables from the READER snapshot. once an immutable is flushed its
+     * data is durable in an L1 sstable that was added to the level (with release) before `flushed`
+     * was set (also release), so any reader that observes this republished slot -- via the
+     * release/acquire pair on imm_snap_active below -- is guaranteed to also observe that sstable.
+     * excluding flushed immutables stops new iterators from taking a long-lived merge-source ref on
+     * them (tidesdb_merge_source_from_memtable), which is the only way their refcount can fall back
+     * to 1 so cleanup can reclaim them. without this, a steady stream of readers keeps re-pinning a
+     * flushed immutable, its refcount never reaches 1, the immutable queue cannot drain, and the
+     * flush worker wedges. the immutable stays in the queue until reclaimed -- only the snapshot
+     * drops it early.
+     * the array is compacted in place; a NULL entry is not filtered and is carried through */
+    size_t count = 0;
+    for (size_t i = 0; i < raw; i++)
+    {
+        tidesdb_memtable_t *m = next->items[i];
+        if (m && atomic_load_explicit(&m->flushed, memory_order_acquire)) continue;
+        next->items[count++] = m;
+    }
+    atomic_store_explicit(&next->count, count, memory_order_release);
+
+    /* we ensure the new slot contents are visible before swapping active index */
+    atomic_thread_fence(memory_order_release);
+
+    /* we swap active index -- readers will now acquire the new slot.
+     * NON-BLOCKING: old slot readers drain on their own, no spin-wait here;
+     * this avoids the flush worker stalling on slow readers (sstable I/O) */
+    atomic_store_explicit(&cf->imm_snap_active, next_idx, memory_order_release);
+}
+
+/**
+ * tidesdb_imm_snap_publish
+ * acquire the per-CF publisher lock and rebuild + swap the immutable snapshot
+ * thin wrapper: holds cf->imm_snap_publish_lock around
+ * tidesdb_imm_snap_publish_locked
+ * @param cf column family whose snapshot to publish
+ */
+static void tidesdb_imm_snap_publish(tidesdb_column_family_t *cf)
+{
+    pthread_mutex_lock(&cf->imm_snap_publish_lock);
+    tidesdb_imm_snap_publish_locked(cf);
+    pthread_mutex_unlock(&cf->imm_snap_publish_lock);
+}
+
+/**
+ * tidesdb_imm_snap_drain_previous
+ * wait for the PREVIOUS active slot's readers to drain after a publish
+ * must be called before freeing items that were in the old snapshot
+ * only needed in the cleanup path (not rotation or recovery)
+ * reads the active index once and spins (pause first, yield after
+ * TDB_IMM_SNAP_ACQUIRE_SPIN_LIMIT iterations) until the other slot's reader
+ * count is zero; it does not take the publisher lock itself
+ * @param cf column family
+ */
+static void tidesdb_imm_snap_drain_previous(tidesdb_column_family_t *cf)
+{
+    /* after a publish, the old active slot is now the inactive slot (1 - current) */
+    const int current = atomic_load_explicit(&cf->imm_snap_active, memory_order_acquire);
+    const int old_idx = 1 - current;
+    tidesdb_imm_snap_t *old = &cf->imm_snaps[old_idx];
+
+    int spins = 0;
+    while (atomic_load_explicit(&old->readers, memory_order_acquire) > 0)
+    {
+        if (spins < TDB_IMM_SNAP_ACQUIRE_SPIN_LIMIT)
+            cpu_pause();
+        else
+            cpu_yield();
+        spins++;
+    }
+}
+
+/**
+ * tidesdb_imm_snap_acquire
+ * acquire a read-side reference to the current immutable snapshot
+ * lock-free -- uses atomic load + atomic increment + double-check
+ * retries (pause first, then yield) whenever the active index changes between
+ * the increment and the re-check
+ * @param cf column family
+ * @return pointer to the active snapshot slot, or NULL if empty (no reader
+ * reference is taken in that case).
+ * caller must call tidesdb_imm_snap_release when done.
+ */
+static tidesdb_imm_snap_t *tidesdb_imm_snap_acquire(tidesdb_column_family_t *cf)
+{
+    int spins = 0;
+    while (1)
+    {
+        const int active = atomic_load_explicit(&cf->imm_snap_active, memory_order_acquire);
+        tidesdb_imm_snap_t *snap = &cf->imm_snaps[active];
+
+        /* we check if snapshot is empty before acquiring */
+        if (atomic_load_explicit(&snap->count, memory_order_acquire) == 0) return NULL;
+
+        /* we acquire reader reference.
+         * NOTE(review): tidesdb_active_memtable_try_ref places a seq_cst fence between
+         * its counter increment and the following load because an acq_rel RMW is not a
+         * full barrier on aarch64/ppc; there is no such fence between this increment
+         * and the re-check below -- confirm that is sufficient against the drain loops */
+        atomic_fetch_add_explicit(&snap->readers, 1, memory_order_acq_rel);
+
+        /* double-sanity-check, if active index changed, we acquired the wrong (retiring) slot */
+        if (atomic_load_explicit(&cf->imm_snap_active, memory_order_acquire) == active)
+        {
+            return snap; /* snapshot is current, proceed */
+        }
+
+        /* active changed -- we release stale slot and retry */
+        atomic_fetch_sub_explicit(&snap->readers, 1, memory_order_release);
+
+        if (spins < TDB_IMM_SNAP_ACQUIRE_SPIN_LIMIT)
+            cpu_pause();
+        else
+            cpu_yield();
+        spins++;
+    }
+}
+
+/**
+ * tidesdb_imm_snap_release
+ * release a read-side reference to a snapshot slot
+ * a NULL snap is ignored, so the result of a failed acquire can be passed as-is
+ * @param snap snapshot slot previously returned by tidesdb_imm_snap_acquire
+ */
+static void tidesdb_imm_snap_release(tidesdb_imm_snap_t *snap)
+{
+    if (snap) atomic_fetch_sub_explicit(&snap->readers, 1, memory_order_release);
+}
+
+/**
+ * tidesdb_snapshot_immutable_memtables
+ * take a snapshot of immutable memtables with per-item refs for callers that
+ * need to hold items beyond the snapshot lifetime (e.g. iterator creation).
+ * uses the lock-free snapshot internally, then refs each item individually.
+ * @param cf the column family
+ * @param out_count output count of items
+ * @return heap-allocated array of ref'd immutable pointers, or NULL if empty.
+ * caller must unref each item and free the array. + */ +static tidesdb_immutable_memtable_t **tidesdb_snapshot_immutable_memtables( + tidesdb_column_family_t *cf, size_t *out_count) +{ + if (out_count) *out_count = 0; + if (!cf) return NULL; + + tidesdb_imm_snap_t *snap = tidesdb_imm_snap_acquire(cf); + if (!snap) return NULL; + + const size_t count = atomic_load_explicit(&snap->count, memory_order_acquire); + if (count == 0) + { + tidesdb_imm_snap_release(snap); + return NULL; + } + + tidesdb_immutable_memtable_t **result = malloc(count * sizeof(tidesdb_immutable_memtable_t *)); + if (!result) + { + tidesdb_imm_snap_release(snap); + return NULL; + } + + size_t valid = 0; + for (size_t i = 0; i < count; i++) + { + tidesdb_immutable_memtable_t *imm = (tidesdb_immutable_memtable_t *)snap->items[i]; + if (tidesdb_immutable_memtable_try_ref(imm)) + { + result[valid++] = imm; + } + } + + tidesdb_imm_snap_release(snap); + + if (valid == 0) + { + free(result); + return NULL; + } + + if (out_count) *out_count = valid; + return result; +} + +/** + * tidesdb_write_vlog_entry + * write a large value to vlog and update kv with offset + * @param sst sstable + * @param vlog_bm vlog block manager + * @param kv key-value pair (vlog_offset updated on success) + * @param vlog_block_num counter to increment + * @return TDB_SUCCESS on success, error code on failure + */ +static int tidesdb_write_vlog_entry(const tidesdb_sstable_t *sst, block_manager_t *vlog_bm, + tidesdb_kv_pair_t *kv, uint64_t *vlog_block_num) +{ + const uint8_t *final_data = kv->value; + size_t final_size = kv->entry.value_size; + uint8_t *compressed = NULL; + + if (sst->config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + compressed = compress_data(kv->value, kv->entry.value_size, &compressed_size, + sst->config->compression_algorithm); + if (!compressed) + { + return TDB_ERR_CORRUPTION; + } + final_data = compressed; + final_size = compressed_size; + } + + block_manager_block_t 
*vlog_block = block_manager_block_create(final_size, final_data); + if (vlog_block) + { + const int64_t block_offset = block_manager_block_write(vlog_bm, vlog_block); + if (block_offset >= 0) + { + kv->entry.vlog_offset = (uint64_t)block_offset; + (*vlog_block_num)++; + } + block_manager_block_release(vlog_block); + } + + free(compressed); + return TDB_SUCCESS; +} + +/** + * tidesdb_flush_klog_block + * serialize and write a klog block to disk + * @param sst sstable + * @param klog_bm klog block manager + * @param block klog block to flush + * @param block_indexes optional block index to update + * @param block_first_key first key in block + * @param block_first_key_size size of first key + * @param block_last_key last key in block + * @param block_last_key_size size of last key + * @param klog_block_num block counter (incremented on success) + * @return TDB_SUCCESS on success, error code on failure + */ +static int tidesdb_flush_klog_block(const tidesdb_sstable_t *sst, block_manager_t *klog_bm, + tidesdb_klog_block_t *block, + tidesdb_block_index_t *block_indexes, + const uint8_t *block_first_key, + const size_t block_first_key_size, + const uint8_t *block_last_key, const size_t block_last_key_size, + uint64_t *klog_block_num) +{ + if (block->num_entries == 0) return TDB_SUCCESS; + + uint8_t *klog_data; + size_t klog_size; + if (tidesdb_klog_block_serialize(block, &klog_data, &klog_size) != 0) + { + return TDB_ERR_MEMORY; + } + + uint8_t *final_klog_data = klog_data; + size_t final_klog_size = klog_size; + + if (sst->config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + uint8_t *compressed = compress_data(klog_data, klog_size, &compressed_size, + sst->config->compression_algorithm); + if (compressed) + { + free(klog_data); + final_klog_data = compressed; + final_klog_size = compressed_size; + } + else + { + free(klog_data); + return TDB_ERR_CORRUPTION; + } + } + + block_manager_block_t *klog_block = + 
block_manager_block_create(final_klog_size, final_klog_data); + if (!klog_block) + { + free(final_klog_data); + return TDB_ERR_MEMORY; + } + + /* we capture file position before writing */ + const uint64_t block_file_position = atomic_load(&klog_bm->current_file_size); + + block_manager_block_write(klog_bm, klog_block); + block_manager_block_release(klog_block); + + /* we add to index if enabled and sampling matches */ + if (block_indexes && block_first_key && block_last_key) + { + if (*klog_block_num % sst->config->index_sample_ratio == 0) + { + compact_block_index_add(block_indexes, block_first_key, block_first_key_size, + block_last_key, block_last_key_size, block_file_position); + } + } + + (*klog_block_num)++; + free(final_klog_data); + return TDB_SUCCESS; +} + +/** + * tidesdb_sstable_write_aux_blob + * writes a footer aux blob (serialized bloom filter or block index) as one or + * more consecutive blocks, each at most TDB_AUX_BLOCK_CHUNK_MAX bytes, so no + * single block exceeds the block manager's framing/read limits regardless of + * total blob size. a blob at or below the chunk size is written as exactly one + * block (identical on-disk layout to the pre-chunking single-block writes). + * @param bm klog block manager + * @param data blob bytes (size > 0) + * @param size blob size in bytes + * @param out_offset receives the offset of the first chunk + * @return TDB_SUCCESS, or TDB_ERR_IO on a write failure + */ +static int tidesdb_sstable_write_aux_blob(block_manager_t *bm, const uint8_t *data, uint64_t size, + uint64_t *out_offset) +{ + if (!bm || !data || size == 0 || !out_offset) return TDB_ERR_INVALID_ARGS; + + int64_t start = -1; + uint64_t written = 0; + while (written < size) + { + const uint64_t remaining = size - written; + const uint64_t chunk = + (remaining > TDB_AUX_BLOCK_CHUNK_MAX) ? 
TDB_AUX_BLOCK_CHUNK_MAX : remaining; + block_manager_block_t *blk = block_manager_block_create(chunk, data + written); + if (!blk) return TDB_ERR_IO; + const int64_t off = block_manager_block_write(bm, blk); + block_manager_block_release(blk); + if (off < 0) return TDB_ERR_IO; + if (start < 0) start = off; + written += chunk; + } + + *out_offset = (uint64_t)start; + return TDB_SUCCESS; +} + +/** + * tidesdb_sstable_read_aux_blob + * reassembles a chunked footer aux blob into a single buffer by reading + * consecutive blocks starting at offset until total bytes are gathered. refuses + * (NULL + warning, not a crash) if total exceeds the database memory-safety + * budget so a corrupt or pathological size cannot drive the process into OOM. + * @param db database (for the memory budget) + * @param bm klog block manager + * @param offset offset of the first chunk + * @param total total logical blob size in bytes + * @return malloc'd buffer of `total` bytes (caller frees), or NULL + */ +static uint8_t *tidesdb_sstable_read_aux_blob(tidesdb_t *db, block_manager_t *bm, uint64_t offset, + uint64_t total) +{ + if (!bm || total == 0) return NULL; + + const size_t budget = + db ? 
atomic_load_explicit(&db->resolved_memory_limit, memory_order_relaxed) : 0; + if (budget > 0 && total > (uint64_t)budget / TDB_MEMORY_MAX_BLOCK_FRACTION_DENOM) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "aux blob of %" PRIu64 + " bytes exceeds memory-safety budget (%zu) -- skipping", + total, budget); + return NULL; + } + if (total > SIZE_MAX) return NULL; /* 32-bit host guard */ + + uint8_t *buf = malloc((size_t)total); + if (!buf) return NULL; + + block_manager_cursor_t *cur = NULL; + if (block_manager_cursor_init(&cur, bm) != 0 || block_manager_cursor_goto(cur, offset) != 0) + { + if (cur) block_manager_cursor_free(cur); + free(buf); + return NULL; + } + + uint64_t got = 0; + while (got < total) + { + block_manager_block_t *blk = block_manager_cursor_read(cur); + if (!blk || got + blk->size > total) + { + if (blk) block_manager_block_release(blk); + block_manager_cursor_free(cur); + free(buf); + return NULL; + } + memcpy(buf + got, blk->data, blk->size); + got += blk->size; + block_manager_block_release(blk); + if (got < total && block_manager_cursor_next(cur) != 0) + { + block_manager_cursor_free(cur); + free(buf); + return NULL; + } + } + + block_manager_cursor_free(cur); + return buf; +} + +/** + * tidesdb_sstable_write_footer_aux + * writes the block index (optional) and bloom filter footer blobs for sst, + * chunk-aware in that a blob larger than TDB_AUX_BLOCK_CHUNK_MAX is split across + * consecutive blocks and the chunked-aux descriptor (offset+size) is recorded on + * sst, so a bloom/index of any size (incl. >4GB) round-trips; a small blob is + * written as a single block (byte-identical to the legacy footer). ownership of + * block_indexes and bloom transfers to sst. callers write the metadata block + * afterward -- metadata serialize reads sst->aux_chunked and the offsets. shared + * by every flush and merge writer so they all get chunking uniformly. 
+ * @param sst sstable being written + * @param klog_bm klog block manager + * @param block_indexes block index (NULL -> empty placeholder); used iff write_index + * @param bloom bloom filter (NULL -> empty placeholder) + * @param write_index 1 to emit an index block (block format), 0 for btree (bloom only) + */ +static void tidesdb_sstable_write_footer_aux(tidesdb_sstable_t *sst, block_manager_t *klog_bm, + tidesdb_block_index_t *block_indexes, + bloom_filter_t *bloom, int write_index) +{ + uint64_t index_off = 0; + uint64_t bloom_off = 0; + size_t index_size = 0; + size_t bloom_size = 0; + int index_chunked = 0; + + /* index first, then bloom -- matches the legacy trailing-block order + * (index, bloom, metadata) used by the non-chunked read path */ + if (write_index) + { + uint8_t index_placeholder[TDB_EMPTY_BLOCK_INDEX_SIZE]; + uint8_t *index_data = NULL; + uint8_t *index_owned = NULL; + if (block_indexes) + { + sst->block_indexes = block_indexes; + index_data = compact_block_index_serialize(block_indexes, &index_size); + index_owned = index_data; + } + if (!index_data) + { + encode_uint32_le_compat(index_placeholder, 0); + index_placeholder[sizeof(uint32_t)] = TDB_DEFAULT_BLOCK_INDEX_PREFIX_LEN; + index_data = index_placeholder; + index_size = TDB_EMPTY_BLOCK_INDEX_SIZE; + } + tidesdb_sstable_write_aux_blob(klog_bm, index_data, index_size, &index_off); + index_chunked = (index_size > TDB_AUX_BLOCK_CHUNK_MAX); + free(index_owned); + } + + uint8_t bloom_placeholder[1] = {0}; + uint8_t *bloom_data = NULL; + uint8_t *bloom_owned = NULL; + if (bloom) + { + bloom_data = bloom_filter_serialize(bloom, &bloom_size); + bloom_owned = bloom_data; + sst->bloom_filter = bloom; + } + if (!bloom_data) + { + bloom_data = bloom_placeholder; + bloom_size = 1; + } + tidesdb_sstable_write_aux_blob(klog_bm, bloom_data, bloom_size, &bloom_off); + free(bloom_owned); + + if (index_chunked || bloom_size > TDB_AUX_BLOCK_CHUNK_MAX) + { + sst->aux_chunked = 1; + sst->index_blob_offset = 
write_index ? index_off : 0; + sst->index_blob_size = write_index ? index_size : 0; + sst->bloom_blob_offset = bloom_off; + sst->bloom_blob_size = bloom_size; + TDB_DEBUG_LOG(TDB_LOG_INFO, + "SSTable %" PRIu64 " footer aux chunked (index %zu B, bloom %zu B)", sst->id, + write_index ? index_size : (size_t)0, bloom_size); + } +} + +/** + * tidesdb_sstable_write_footer + * write index, bloom filter, and metadata blocks to klog + * @param sst sstable (block_indexes and bloom_filter assigned here) + * @param klog_bm klog block manager + * @param vlog_bm vlog block manager + * @param block_indexes block indexes (ownership transferred to sst) + * @param bloom bloom filter (ownership transferred to sst) + * @return TDB_SUCCESS on success + */ +static int tidesdb_sstable_write_footer(tidesdb_sstable_t *sst, block_manager_t *klog_bm, + block_manager_t *vlog_bm, + tidesdb_block_index_t *block_indexes, bloom_filter_t *bloom) +{ + /* we capture klog file offset where data blocks end */ + block_manager_get_size(klog_bm, &sst->klog_data_end_offset); + + /* we write index block */ + if (block_indexes) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "SSTable " TDB_U64_FMT " block indexes built - %" PRIu32 + " samples, " TDB_U64_FMT " total blocks", + TDB_U64_CAST(sst->id), block_indexes->count, + TDB_U64_CAST(sst->num_klog_blocks)); + } + + /* write the index + bloom footer blobs (chunk-aware, shared with the merge writers) */ + tidesdb_sstable_write_footer_aux(sst, klog_bm, block_indexes, bloom, 1); + + /* we write metadata block */ + uint64_t klog_size_before_metadata; + uint64_t vlog_size_before_metadata; + block_manager_get_size(klog_bm, &klog_size_before_metadata); + block_manager_get_size(vlog_bm, &vlog_size_before_metadata); + + sst->klog_size = klog_size_before_metadata; + sst->vlog_size = vlog_size_before_metadata; + + uint8_t *metadata_data = NULL; + size_t metadata_size = 0; + if (sstable_metadata_serialize(sst, &metadata_data, &metadata_size) == 0) + { + block_manager_block_t 
*metadata_block = + block_manager_block_create(metadata_size, metadata_data); + if (metadata_block) + { + block_manager_block_write(klog_bm, metadata_block); + block_manager_block_release(metadata_block); + } + free(metadata_data); + } + + /* we get final file sizes */ + block_manager_get_size(klog_bm, &sst->klog_size); + block_manager_get_size(vlog_bm, &sst->vlog_size); + + if (klog_bm) block_manager_escalate_fsync(klog_bm); + if (vlog_bm) block_manager_escalate_fsync(vlog_bm); + + return TDB_SUCCESS; +} + +/** + * tidesdb_sstable_write_from_memtable_btree_ex + * write a memtable (or one cf's prefix segment of the shared unified memtable) to a B+tree sstable. + * the seg_prefix machinery mirrors tidesdb_sstable_write_from_memtable_ex -- non-NULL seeks to the + * cf_index prefix, strips it from each key, and stops at the first key outside the run. + * @param db database instance + * @param sst sstable to write to + * @param memtable memtable to write from + * @param seg_prefix cf_index prefix to restrict to, or NULL for the whole memtable + * @param seg_prefix_len length of seg_prefix in bytes (0 when seg_prefix is NULL) + * @param seg_entry_count entry-count hint for sizing, used only when seg_prefix is non-NULL + * @return 0 on success, -1 on error + */ +static int tidesdb_sstable_write_from_memtable_btree_ex(tidesdb_t *db, tidesdb_column_family_t *cf, + tidesdb_sstable_t *sst, + skip_list_t *memtable, + const uint8_t *seg_prefix, + size_t seg_prefix_len, int seg_entry_count) +{ + if (!db || !cf || !sst || !memtable) return TDB_ERR_INVALID_ARGS; + + const int num_entries = seg_prefix ? 
seg_entry_count : skip_list_count_entries(memtable); + TDB_DEBUG_LOG(TDB_LOG_INFO, + "SSTable %" PRIu64 " writing from memtable using B+tree (%d entries)", sst->id, + num_entries); + + if (tidesdb_sstable_ensure_open(db, sst) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable %" PRIu64 " failed to ensure open", sst->id); + return TDB_ERR_IO; + } + + tidesdb_block_managers_t bms; + if (tidesdb_sstable_get_block_managers(db, sst, &bms) != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable %" PRIu64 " failed to get block managers", sst->id); + return TDB_ERR_IO; + } + + block_manager_t *klog_bm = bms.klog_bm; + block_manager_t *vlog_bm = bms.vlog_bm; + + /* resolve comparator from column family config */ + skip_list_comparator_fn comparator_fn = NULL; + void *comparator_ctx = NULL; + tidesdb_resolve_comparator(db, sst->config, &comparator_fn, &comparator_ctx); + + /* we create btree builder with column family's comparator + * btree uses BTREE_CMP_CUSTOM when a custom comparator is provided */ + const btree_config_t btree_config = { + .target_node_size = BTREE_DEFAULT_NODE_SIZE, + .value_threshold = sst->config->klog_value_threshold, + .comparator = (btree_comparator_fn)comparator_fn, + .comparator_ctx = comparator_ctx, + .cmp_type = comparator_fn ? 
BTREE_CMP_CUSTOM : BTREE_CMP_MEMCMP, + .compression_algo = sst->config->compression_algorithm}; + + btree_builder_t *builder = NULL; + if (btree_builder_new(&builder, klog_bm, &btree_config) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable %" PRIu64 " failed to create btree builder", sst->id); + return TDB_ERR_MEMORY; + } + + /* we create bloom filter if enabled */ + bloom_filter_t *bloom = NULL; + if (sst->config->enable_bloom_filter) + { + if (bloom_filter_new(&bloom, sst->config->bloom_fpr, num_entries) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable %" PRIu64 " failed to create bloom filter", + sst->id); + btree_builder_free(builder); + return TDB_ERR_MEMORY; + } + } + + /* iterate memtable and add entries to btree */ + skip_list_cursor_t *cursor = NULL; + if (skip_list_cursor_init(&cursor, memtable) != 0) + { + if (bloom) bloom_filter_free(bloom); + btree_builder_free(builder); + return TDB_ERR_MEMORY; + } + + /* init parks on the first key; a unified segment seeks into its cf_index prefix run instead */ + if (seg_prefix) (void)skip_list_cursor_seek_ge(cursor, seg_prefix, seg_prefix_len); + + uint64_t entry_count = 0; + uint64_t tombstone_count = 0; + uint64_t max_seq = 0; + int aborted = 0; + int segment_done = 0; /* set when a unified segment's prefix run ends */ + + /* snapshot floor -- retain older versions on a key while any active reader at + * a snapshot below the latest still needs them. 
stop after the version <= floor + * since that one is dominated for every active snapshot */ + const uint64_t min_snapshot_seq = tidesdb_min_active_snapshot_seq(db); + + while (skip_list_cursor_valid(cursor)) + { + /* flush progress heartbeat -- lets backpressure tell a slow flush from a wedged one */ + atomic_fetch_add_explicit(&db->flush_heartbeat, 1, memory_order_relaxed); + + /* flushes only abort on a real CF drop, never on cancel_background_work -- + * a flush is the durability path and must complete */ + if (atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire)) + { + aborted = 1; + break; + } + + while (1) + { + uint8_t *key = NULL; + size_t key_size = 0; + uint8_t *value = NULL; + size_t value_size = 0; + uint64_t seq = 0; + int64_t ttl = 0; + uint8_t deleted = 0; + + if (skip_list_cursor_get_with_seq(cursor, &key, &key_size, &value, &value_size, &ttl, + &deleted, &seq) != 0) + { + break; + } + + /* unified segment -- a key outside the cf_index prefix ends this cf's run, else strip + * the prefix so the cf sstable stores the real user key (see the block writer) */ + if (seg_prefix) + { + if (key_size < seg_prefix_len || memcmp(key, seg_prefix, seg_prefix_len) != 0) + { + segment_done = 1; + break; + } + key += seg_prefix_len; + key_size -= seg_prefix_len; + } + + /* we write value to vlog if it exceeds the threshold, matching the + * compaction merge path. small values are stored inline in the btree. 
*/ + uint64_t vlog_offset = 0; + if (value && value_size > 0 && !deleted && + value_size >= sst->config->klog_value_threshold) + { + const uint8_t *final_data = value; + size_t final_size = value_size; + uint8_t *compressed = NULL; + + if (sst->config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + compressed = compress_data(value, value_size, &compressed_size, + sst->config->compression_algorithm); + if (compressed) + { + final_data = compressed; + final_size = compressed_size; + } + } + + block_manager_block_t *vlog_block = + block_manager_block_create(final_size, final_data); + if (vlog_block) + { + const int64_t offset = block_manager_block_write(vlog_bm, vlog_block); + if (offset >= 0) + { + vlog_offset = (uint64_t)offset; + } + block_manager_block_release(vlog_block); + } + free(compressed); + } + + /* we add to btree inline value for small entries, vlog reference for large. + * deleted carries the full skip-list flag byte so single-delete survives + * the flush into the btree sstable's on-disk flag byte. the low bit of + * deleted equals BTREE_ENTRY_FLAG_TOMBSTONE by design so callers that + * previously passed a 0/1 bool still behave unchanged. */ + const uint8_t *value_to_store = (vlog_offset > 0) ? NULL : value; + const size_t value_size_to_store = (vlog_offset > 0) ? 
0 : value_size; + uint8_t entry_flags = 0; + if (deleted & SKIP_LIST_FLAG_DELETED) entry_flags |= BTREE_ENTRY_FLAG_TOMBSTONE; + if (deleted & SKIP_LIST_FLAG_SINGLE_DELETE) + entry_flags |= BTREE_ENTRY_FLAG_SINGLE_DELETE; + + if (btree_builder_add(builder, key, key_size, value_to_store, value_size_to_store, + vlog_offset, seq, ttl, entry_flags) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable %" PRIu64 " failed to add entry to btree", + sst->id); + } + + /* we add to bloom filter */ + if (bloom) + { + bloom_filter_add(bloom, key, key_size); + } + + if (seq > max_seq) max_seq = seq; + entry_count++; + if (entry_flags & BTREE_ENTRY_FLAG_TOMBSTONE) tombstone_count++; + + if (seq <= min_snapshot_seq) break; + if (skip_list_cursor_advance_in_node(cursor) != 0) break; + } + if (segment_done) break; + + skip_list_cursor_next(cursor); + } + + skip_list_cursor_free(cursor); + + if (aborted) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' aborting btree flush write for SSTable %" PRIu64, + cf->name, sst->id); + if (bloom) bloom_filter_free(bloom); + btree_builder_free(builder); + return TDB_SUCCESS; + } + + /* we finish btree build */ + btree_t *tree = NULL; + if (btree_builder_finish(builder, &tree) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable %" PRIu64 " failed to finish btree", sst->id); + if (bloom) bloom_filter_free(bloom); + btree_builder_free(builder); + return TDB_ERR_IO; + } + + /* we copy btree metadata to sstable */ + sst->use_btree = 1; + sst->btree_root_offset = tree->root_offset; + sst->btree_first_leaf = tree->first_leaf_offset; + sst->btree_last_leaf = tree->last_leaf_offset; + sst->btree_node_count = tree->node_count; + sst->btree_height = tree->height; + sst->num_entries = entry_count; + sst->tombstone_count = tombstone_count; + sst->max_seq = max_seq; + + /* we copy min/max keys */ + if (tree->min_key && tree->min_key_size > 0) + { + sst->min_key = malloc(tree->min_key_size); + if (sst->min_key) + { + memcpy(sst->min_key, tree->min_key, 
tree->min_key_size); + sst->min_key_size = tree->min_key_size; + } + } + if (tree->max_key && tree->max_key_size > 0) + { + sst->max_key = malloc(tree->max_key_size); + if (sst->max_key) + { + memcpy(sst->max_key, tree->max_key, tree->max_key_size); + sst->max_key_size = tree->max_key_size; + } + } + + btree_free(tree); + btree_builder_free(builder); + + /* write the bloom footer blob (chunk-aware, no index block in btree format) */ + tidesdb_sstable_write_footer_aux(sst, klog_bm, NULL, bloom, 0); + + uint64_t klog_size_before_metadata; + uint64_t vlog_size_before_metadata; + block_manager_get_size(klog_bm, &klog_size_before_metadata); + block_manager_get_size(vlog_bm, &vlog_size_before_metadata); + + sst->klog_size = klog_size_before_metadata; + sst->vlog_size = vlog_size_before_metadata; + + uint8_t *metadata_data = NULL; + size_t metadata_size = 0; + if (sstable_metadata_serialize(sst, &metadata_data, &metadata_size) == 0) + { + block_manager_block_t *metadata_block = + block_manager_block_create(metadata_size, metadata_data); + if (metadata_block) + { + block_manager_block_write(klog_bm, metadata_block); + block_manager_block_release(metadata_block); + } + free(metadata_data); + } + + block_manager_get_size(klog_bm, &sst->klog_size); + block_manager_get_size(vlog_bm, &sst->vlog_size); + + if (klog_bm) block_manager_escalate_fsync(klog_bm); + if (vlog_bm) block_manager_escalate_fsync(vlog_bm); + + TDB_DEBUG_LOG(TDB_LOG_INFO, + "SSTable %" PRIu64 " btree flush complete: %" PRIu64 " entries, root=%ld", + sst->id, entry_count, sst->btree_root_offset); + + return TDB_SUCCESS; +} + +/** + * tidesdb_sstable_write_from_memtable_btree + * write a whole memtable to a B+tree sstable (the common per-cf flush path) + * @param db database instance + * @param sst sstable to write to + * @param memtable memtable to write from + * @return 0 on success, -1 on error + */ +static int tidesdb_sstable_write_from_memtable_btree(tidesdb_t *db, tidesdb_column_family_t *cf, + 
tidesdb_sstable_t *sst, skip_list_t *memtable) +{ + return tidesdb_sstable_write_from_memtable_btree_ex(db, cf, sst, memtable, NULL, 0, 0); +} + +/** + * tidesdb_sstable_write_from_heap_btree + * write merged entries from a heap to an sstable using B+tree format + * @param cf column family + * @param sst sstable to write to + * @param heap merge heap containing entries + * @param klog_bm klog block manager (already open) + * @param vlog_bm vlog block manager (already open) + * @param bloom bloom filter (optional, may be NULL) + * @param sstables_to_delete queue for corrupted sstables + * @param is_largest_level whether this is the largest level + * @return 0 on success, error code on failure + */ +static int tidesdb_sstable_write_from_heap_btree(tidesdb_column_family_t *cf, + tidesdb_sstable_t *sst, tidesdb_merge_heap_t *heap, + block_manager_t *klog_bm, block_manager_t *vlog_bm, + bloom_filter_t *bloom, queue_t *sstables_to_delete, + const int is_largest_level) +{ + if (!cf || !sst || !heap || !klog_bm || !vlog_bm) return TDB_ERR_INVALID_ARGS; + + skip_list_comparator_fn comparator_fn = NULL; + void *comparator_ctx = NULL; + tidesdb_resolve_comparator(cf->db, &cf->config, &comparator_fn, &comparator_ctx); + + const btree_config_t btree_config = { + .target_node_size = BTREE_DEFAULT_NODE_SIZE, + .value_threshold = cf->config.klog_value_threshold, + .cmp_type = comparator_fn ? 
BTREE_CMP_CUSTOM : BTREE_CMP_MEMCMP, + .comparator = (btree_comparator_fn)comparator_fn, + .comparator_ctx = comparator_ctx, + .compression_algo = cf->config.compression_algorithm, + }; + + btree_builder_t *builder = NULL; + if (btree_builder_new(&builder, klog_bm, &btree_config) != 0) + { + return TDB_ERR_MEMORY; + } + + uint64_t entry_count = 0; + uint64_t tombstone_count = 0; + uint64_t max_seq = 0; + uint64_t vlog_block_num = 0; + + /* snapshot floor -- older same-key versions are kept while the newest version + * is past the oldest active snapshot, so an in-progress reader at a lower seq + * still has a visible record. UINT64_MAX means no snapshot-fixed txn is open */ + const uint64_t min_snapshot_seq = tidesdb_min_active_snapshot_seq(cf->db); + + /* we keep one kv buffered ("pending") so we can do a single-step lookahead. + * the merge heap emits same-key versions in (key asc, seq desc) order, so + * after we pop the newest version for a key we peek the next pop to see + * whether an older same-key version follows. that lookahead lets us detect + * a put+single-delete pair in one merge input and drop both together at + * any level instead of carrying the single-delete forward. it also keeps + * the original same-key dedup, largest-level tombstone drop, and ttl drop + * behaviours -- they now fire when pending gets resolved rather than the + * moment pending was popped. 
*/ + tidesdb_kv_pair_t *pending = NULL; + int pending_is_single_delete = 0; + int pending_sd_paired_with_put = 0; + + int abort_io = 0; + + while (!tidesdb_merge_heap_empty(heap) || pending != NULL) + { + tidesdb_kv_pair_t *kv = NULL; + + if (!tidesdb_merge_heap_empty(heap)) + { + tidesdb_sstable_t *corrupted_sst = NULL; + kv = tidesdb_merge_heap_pop(heap, &corrupted_sst); + + if (corrupted_sst && sstables_to_delete) + { + queue_enqueue(sstables_to_delete, corrupted_sst); + } + + if (!kv) + { + /* heap is drained -- fall through to flush pending */ + } + } + + if (kv && pending && pending->entry.key_size == kv->entry.key_size && + memcmp(pending->key, kv->key, pending->entry.key_size) == 0 && + pending->entry.seq <= min_snapshot_seq) + { + /* older same-key version -- drop silently. if pending is a + * single-delete and this older version is a live put (not itself + * a tombstone), we've found the put+single-delete pair and can + * cancel the single-delete once we finish consuming the group. 
*/ + if (pending_is_single_delete && !(kv->entry.flags & TDB_KV_FLAG_TOMBSTONE)) + { + pending_sd_paired_with_put = 1; + } + tidesdb_kv_pair_free(kv); + continue; + } + + /* new key arrived (or heap exhausted) -- decide the fate of pending */ + if (pending) + { + const int sd_pair_drop = pending_is_single_delete && pending_sd_paired_with_put; + const int tombstone_drop = (pending->entry.flags & TDB_KV_FLAG_TOMBSTONE) && + is_largest_level && pending->entry.seq <= min_snapshot_seq; + const int ttl_drop = + pending->entry.ttl > 0 && + pending->entry.ttl < + atomic_load_explicit(&cf->db->cached_current_time, memory_order_relaxed); + + if (!sd_pair_drop && !tombstone_drop && !ttl_drop) + { + if (bloom) + { + bloom_filter_add(bloom, pending->key, pending->entry.key_size); + } + + uint64_t vlog_offset = 0; + if (pending->entry.value_size >= cf->config.klog_value_threshold && pending->value) + { + const uint8_t *final_data = pending->value; + size_t final_size = pending->entry.value_size; + uint8_t *compressed = NULL; + + if (sst->config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + compressed = + compress_data(pending->value, pending->entry.value_size, + &compressed_size, sst->config->compression_algorithm); + if (compressed) + { + final_data = compressed; + final_size = compressed_size; + } + } + + block_manager_block_t *vlog_block = + block_manager_block_create(final_size, final_data); + if (vlog_block) + { + const int64_t block_offset = block_manager_block_write(vlog_bm, vlog_block); + if (block_offset >= 0) + { + vlog_offset = (uint64_t)block_offset; + vlog_block_num++; + } + block_manager_block_release(vlog_block); + } + free(compressed); + } + + const uint8_t *value_to_store = (vlog_offset > 0) ? NULL : pending->value; + const size_t value_size_to_store = + (vlog_offset > 0) ? 
0 : pending->entry.value_size; + const uint8_t entry_flags = + pending->entry.flags & (TDB_KV_FLAG_TOMBSTONE | TDB_KV_FLAG_SINGLE_DELETE); + + if (btree_builder_add(builder, pending->key, pending->entry.key_size, + value_to_store, value_size_to_store, vlog_offset, + pending->entry.seq, pending->entry.ttl, entry_flags) != 0) + { + abort_io = 1; + } + else + { + if (pending->entry.seq > max_seq) max_seq = pending->entry.seq; + + if (!sst->min_key) + { + sst->min_key = malloc(pending->entry.key_size); + if (sst->min_key) + { + memcpy(sst->min_key, pending->key, pending->entry.key_size); + sst->min_key_size = pending->entry.key_size; + } + } + + free(sst->max_key); + sst->max_key = malloc(pending->entry.key_size); + if (sst->max_key) + { + memcpy(sst->max_key, pending->key, pending->entry.key_size); + sst->max_key_size = pending->entry.key_size; + } + + entry_count++; + if (pending->entry.flags & TDB_KV_FLAG_TOMBSTONE) tombstone_count++; + } + } + + tidesdb_kv_pair_free(pending); + pending = NULL; + + if (abort_io) + { + if (kv) tidesdb_kv_pair_free(kv); + btree_builder_free(builder); + return TDB_ERR_IO; + } + } + + if (!kv) break; + + pending = kv; + pending_is_single_delete = (kv->entry.flags & TDB_KV_FLAG_SINGLE_DELETE) != 0; + pending_sd_paired_with_put = 0; + } + + btree_t *tree = NULL; + if (btree_builder_finish(builder, &tree) != 0 || !tree) + { + btree_builder_free(builder); + return TDB_ERR_IO; + } + + sst->btree_root_offset = tree->root_offset; + sst->btree_first_leaf = tree->first_leaf_offset; + sst->btree_last_leaf = tree->last_leaf_offset; + sst->btree_node_count = tree->node_count; + sst->btree_height = tree->height; + sst->num_entries = entry_count; + sst->tombstone_count = tombstone_count; + sst->max_seq = max_seq; + sst->num_vlog_blocks = vlog_block_num; + + block_manager_get_size(klog_bm, &sst->klog_data_end_offset); + block_manager_get_size(klog_bm, &sst->klog_size); + block_manager_get_size(vlog_bm, &sst->vlog_size); + + /* write the bloom footer 
blob (chunk-aware, no index block in btree format) */ + tidesdb_sstable_write_footer_aux(sst, klog_bm, NULL, bloom, 0); + + uint8_t *metadata = NULL; + size_t metadata_size = 0; + if (sstable_metadata_serialize(sst, &metadata, &metadata_size) == 0 && metadata) + { + block_manager_block_t *metadata_block = block_manager_block_create(metadata_size, metadata); + if (metadata_block) + { + block_manager_block_write(klog_bm, metadata_block); + block_manager_block_release(metadata_block); + } + free(metadata); + } + + btree_free(tree); + btree_builder_free(builder); + + if (klog_bm) block_manager_escalate_fsync(klog_bm); + if (vlog_bm) block_manager_escalate_fsync(vlog_bm); + + return TDB_SUCCESS; +} + +/** + * tidesdb_sstable_write_from_memtable_ex + * write a memtable (or one cf's prefix segment of the shared unified memtable) to an sstable. + * when seg_prefix is non-NULL the cursor seeks to that cf_index prefix and each key has the prefix + * stripped before it is written, and the walk stops at the first key outside the prefix -- so a + * single cf's run inside the unified skip list is written straight to its sstable with no + * intermediate per-cf skip list. seg_entry_count sizes the bloom/index for the segment, since + * skip_list_count_entries would count the whole unified skip list. 
+ * @param db database instance + * @param sst sstable to write to + * @param memtable memtable to write from + * @param seg_prefix cf_index prefix to restrict to, or NULL for the whole memtable + * @param seg_prefix_len length of seg_prefix in bytes (0 when seg_prefix is NULL) + * @param seg_entry_count entry-count hint for sizing, used only when seg_prefix is non-NULL + * @return 0 on success, -1 on error + */ +static int tidesdb_sstable_write_from_memtable_ex(tidesdb_t *db, tidesdb_column_family_t *cf, + tidesdb_sstable_t *sst, skip_list_t *memtable, + const uint8_t *seg_prefix, size_t seg_prefix_len, + int seg_entry_count) +{ + if (!db || !cf || !sst || !memtable) return TDB_ERR_INVALID_ARGS; + + const int num_entries = seg_prefix ? seg_entry_count : skip_list_count_entries(memtable); + TDB_DEBUG_LOG(TDB_LOG_INFO, + "SSTable %" PRIu64 " writing from memtable (sorted run to disk) (%d entries)", + sst->id, num_entries); + + /* we ensure sstable is open and get block managers */ + if (tidesdb_sstable_ensure_open(db, sst) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable %" PRIu64 " failed to ensure open", sst->id); + return TDB_ERR_IO; + } + + tidesdb_block_managers_t bms; + if (tidesdb_sstable_get_block_managers(db, sst, &bms) != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable %" PRIu64 " failed to get block managers", sst->id); + return TDB_ERR_IO; + } + + /* we create bloom filter and block indexes */ + int result = TDB_SUCCESS; + bloom_filter_t *bloom = NULL; + tidesdb_block_index_t *block_indexes = NULL; + tidesdb_klog_block_t *current_klog_block = NULL; + skip_list_cursor_t *cursor = NULL; + uint8_t *first_key = NULL; + uint8_t *last_key = NULL; + uint8_t *block_first_key = NULL; + uint8_t *block_last_key = NULL; + + /* we resolve comparator once for the entire flush operation */ + skip_list_comparator_fn comparator_fn = NULL; + void *comparator_ctx = NULL; + tidesdb_resolve_comparator(sst->db, sst->config, &comparator_fn, &comparator_ctx); + + if 
(sst->config->enable_bloom_filter) + { + if (bloom_filter_new(&bloom, sst->config->bloom_fpr, num_entries) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable %" PRIu64 " failed to create bloom filter", + sst->id); + return TDB_ERR_MEMORY; + } + TDB_DEBUG_LOG(TDB_LOG_INFO, + "SSTable %" PRIu64 " bloom filter created (fpr: %.4f, entries: %d)", sst->id, + sst->config->bloom_fpr, num_entries); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "SSTable %" PRIu64 " bloom filter disabled", sst->id); + } + + if (sst->config->enable_block_indexes && !sst->config->use_btree) + { + uint32_t initial_capacity = (num_entries / sst->config->index_sample_ratio) + 1; + block_indexes = compact_block_index_create( + initial_capacity, sst->config->block_index_prefix_len, comparator_fn, comparator_ctx); + if (!block_indexes) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable %" PRIu64 " failed to create block indexes", + sst->id); + result = TDB_ERR_MEMORY; + goto cleanup; + } + TDB_DEBUG_LOG(TDB_LOG_INFO, "SSTable %" PRIu64 " block indexes enabled (sample ratio: %d)", + sst->id, sst->config->index_sample_ratio); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "SSTable %" PRIu64 " block indexes disabled", sst->id); + } + + /* we initialize klog block and cursor */ + current_klog_block = tidesdb_klog_block_create(); + if (!current_klog_block) + { + result = TDB_ERR_MEMORY; + goto cleanup; + } + + if (skip_list_cursor_init(&cursor, memtable) != 0) + { + result = TDB_ERR_MEMORY; + goto cleanup; + } + + /* we iterate memtable and write entries */ + uint64_t klog_block_num = 0; + uint64_t vlog_block_num = 0; + size_t first_key_size = 0; + size_t last_key_size = 0; + uint64_t entry_count = 0; + uint64_t tombstone_count = 0; + uint64_t max_seq = 0; + size_t block_first_key_size = 0; + size_t block_last_key_size = 0; + + /* seek into the cf's prefix run for a unified segment, else start at the first key */ + const int positioned = seg_prefix + ? 
(skip_list_cursor_seek_ge(cursor, seg_prefix, seg_prefix_len) == 0) + : (skip_list_cursor_goto_first(cursor) == 0); + if (positioned) + { + size_t block_first_key_capacity = 0; + size_t block_last_key_capacity = 0; + size_t first_key_capacity = 0; + size_t last_key_capacity = 0; + /* we use stack-allocated KV pair to avoid malloc/free per entry */ + tidesdb_kv_pair_t kv_stack = {0}; + int segment_done = 0; /* set when a unified segment's prefix run ends */ + + /* snapshot floor -- see tidesdb_sstable_write_from_memtable_btree for rationale */ + const uint64_t min_snapshot_seq = tidesdb_min_active_snapshot_seq(db); + + do + { + /* flush progress heartbeat -- lets backpressure tell a slow flush from a wedged one */ + atomic_fetch_add_explicit(&db->flush_heartbeat, 1, memory_order_relaxed); + + /* flushes only abort on a real CF drop, never on cancel_background_work -- + * a flush is the durability path and must complete */ + if (atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire)) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' aborting flush write for SSTable %" PRIu64, + cf->name, sst->id); + result = TDB_SUCCESS; + goto cleanup; + } + + /* inner loop walks the version chain on the current node so each version + * still needed by an active snapshot lands on disk. stops after the first + * version <= floor */ + while (1) + { + uint8_t *key, *value; + size_t key_size, value_size; + int64_t ttl; + uint8_t deleted; + uint64_t seq; + + if (skip_list_cursor_get_with_seq(cursor, &key, &key_size, &value, &value_size, + &ttl, &deleted, &seq) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Skipping entry during flush - cursor read failed (entry %" PRIu64 + ")", + entry_count); + break; + } + + /* unified segment -- a key outside the cf_index prefix ends this cf's run; + * otherwise strip the prefix so the cf sstable stores the real user key. all + * versions on a node share the key, so one check per node decides the whole node. 
+ */ + if (seg_prefix) + { + if (key_size < seg_prefix_len || memcmp(key, seg_prefix, seg_prefix_len) != 0) + { + segment_done = 1; + break; + } + key += seg_prefix_len; + key_size -= seg_prefix_len; + } + + /* we populate stack-allocated KV pair (no malloc needed) */ + kv_stack.key = key; + kv_stack.value = value; + kv_stack.entry.key_size = (uint32_t)key_size; + kv_stack.entry.value_size = (uint32_t)value_size; + kv_stack.entry.ttl = ttl; + kv_stack.entry.seq = seq; + kv_stack.entry.flags = tidesdb_sl_flags_to_kv_flags(deleted); + if (ttl != 0) kv_stack.entry.flags |= TDB_KV_FLAG_HAS_TTL; + kv_stack.entry.vlog_offset = 0; + + /* we write large values to vlog */ + if (value_size >= sst->config->klog_value_threshold && !deleted && value) + { + result = tidesdb_write_vlog_entry(sst, bms.vlog_bm, &kv_stack, &vlog_block_num); + if (result != TDB_SUCCESS) + { + goto cleanup; + } + } + + /* we track first key of block */ + const int is_first_entry_in_block = (current_klog_block->num_entries == 0); + tidesdb_klog_block_add_entry(current_klog_block, &kv_stack, sst->config, + comparator_fn, comparator_ctx); + + /* we reuse block_first_key buffer with capacity tracking */ + if (is_first_entry_in_block) + { + if (key_size > block_first_key_capacity) + { + free(block_first_key); + block_first_key = malloc(key_size); + block_first_key_capacity = block_first_key ? key_size : 0; + } + if (block_first_key) + { + memcpy(block_first_key, key, key_size); + block_first_key_size = key_size; + } + } + + /* we reuse block_last_key buffer with capacity tracking */ + if (key_size > block_last_key_capacity) + { + free(block_last_key); + block_last_key = malloc(key_size); + block_last_key_capacity = block_last_key ? 
key_size : 0; + } + if (block_last_key) + { + memcpy(block_last_key, key, key_size); + block_last_key_size = key_size; + } + + /* we flush full klog block */ + if (tidesdb_klog_block_is_full(current_klog_block, TDB_KLOG_BLOCK_SIZE)) + { + result = tidesdb_flush_klog_block( + sst, bms.klog_bm, current_klog_block, block_indexes, block_first_key, + block_first_key_size, block_last_key, block_last_key_size, &klog_block_num); + if (result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable %" PRIu64 " klog block flush failed", + sst->id); + goto cleanup; + } + + tidesdb_klog_block_reset(current_klog_block); + + /* we reset sizes but keep buffers for reuse */ + block_first_key_size = 0; + block_last_key_size = 0; + } + + /* we track max sequence */ + if (seq > max_seq) max_seq = seq; + + if (bloom) bloom_filter_add(bloom, key, key_size); + + /* we reuse first_key buffer with capacity tracking */ + if (first_key_size == 0) + { + if (key_size > first_key_capacity) + { + free(first_key); + first_key = malloc(key_size); + first_key_capacity = first_key ? key_size : 0; + } + if (first_key) + { + memcpy(first_key, key, key_size); + first_key_size = key_size; + } + } + + /* we reuse last_key buffer with capacity tracking */ + if (key_size > last_key_capacity) + { + free(last_key); + last_key = malloc(key_size); + last_key_capacity = last_key ? 
key_size : 0; + } + if (last_key) + { + memcpy(last_key, key, key_size); + last_key_size = key_size; + } + + sst->num_entries++; + entry_count++; + if (kv_stack.entry.flags & TDB_KV_FLAG_TOMBSTONE) tombstone_count++; + + if (seq <= min_snapshot_seq) break; + if (skip_list_cursor_advance_in_node(cursor) != 0) break; + } + if (segment_done) break; + } while (skip_list_cursor_next(cursor) == 0); + } + + skip_list_cursor_free(cursor); + cursor = NULL; + + /* we flush remaining klog block */ + if (current_klog_block && current_klog_block->num_entries > 0) + { + result = tidesdb_flush_klog_block(sst, bms.klog_bm, current_klog_block, block_indexes, + block_first_key, block_first_key_size, block_last_key, + block_last_key_size, &klog_block_num); + if (result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable %" PRIu64 " final klog block flush failed", + sst->id); + goto cleanup; + } + } + + free(block_first_key); + free(block_last_key); + block_first_key = NULL; + block_last_key = NULL; + + tidesdb_klog_block_free(current_klog_block); + current_klog_block = NULL; + + /* we finalize sstable metadata */ + sst->num_entries = entry_count; + sst->tombstone_count = tombstone_count; + sst->num_klog_blocks = klog_block_num; + sst->num_vlog_blocks = vlog_block_num; + sst->min_key = first_key; + sst->min_key_size = first_key_size; + sst->max_key = last_key; + sst->max_key_size = last_key_size; + sst->max_seq = max_seq; + + /* ownership transferred to sst */ + first_key = NULL; + last_key = NULL; + + /* we write footer (index, bloom, metadata) */ + result = tidesdb_sstable_write_footer(sst, bms.klog_bm, bms.vlog_bm, block_indexes, bloom); + + /* ownership transferred to sst via footer */ + block_indexes = NULL; + bloom = NULL; + + return result; + +cleanup: + if (cursor) skip_list_cursor_free(cursor); + if (current_klog_block) tidesdb_klog_block_free(current_klog_block); + if (bloom) bloom_filter_free(bloom); + if (block_indexes) compact_block_index_free(block_indexes); + 
free(first_key); + free(last_key); + free(block_first_key); + free(block_last_key); + return result; +} + +/** + * tidesdb_sstable_write_from_memtable + * write a whole memtable to an sstable (the common per-cf flush path) + * @param db database instance + * @param sst sstable to write to + * @param memtable memtable to write from + * @return 0 on success, -1 on error + */ +static int tidesdb_sstable_write_from_memtable(tidesdb_t *db, tidesdb_column_family_t *cf, + tidesdb_sstable_t *sst, skip_list_t *memtable) +{ + return tidesdb_sstable_write_from_memtable_ex(db, cf, sst, memtable, NULL, 0, 0); +} + +/** + * tidesdb_sstable_get_btree + * get a key-value pair from a btree-based sstable + * @param db the database + * @param sst the sstable + * @param key the key + * @param key_size the size of the key + * @param seq_ceiling highest sequence number to consider (UINT64_MAX = newest) + * @param kv the key-value pair + */ +static int tidesdb_sstable_get_btree(tidesdb_t *db, tidesdb_sstable_t *sst, const uint8_t *key, + const size_t key_size, const uint64_t seq_ceiling, + tidesdb_kv_pair_t **kv) +{ + if (tidesdb_sstable_ensure_open(db, sst) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "SSTable %" PRIu64 " failed to ensure open (btree)", sst->id); + return TDB_ERR_IO; + } + + tidesdb_block_managers_t bms; + if (tidesdb_sstable_get_block_managers(db, sst, &bms) != TDB_SUCCESS) + { + return TDB_ERR_IO; + } + + if (!sst->min_key || !sst->max_key) + { + return TDB_ERR_NOT_FOUND; + } + + /* we use cached comparator from sstable (resolved at load/create time) */ + skip_list_comparator_fn comparator_fn = sst->cached_comparator_fn; + void *comparator_ctx = sst->cached_comparator_ctx; + if (TDB_UNLIKELY(!comparator_fn)) + { + tidesdb_resolve_comparator(sst->db, sst->config, &comparator_fn, &comparator_ctx); + } + + const int min_cmp = + comparator_fn(key, key_size, sst->min_key, sst->min_key_size, comparator_ctx); + const int max_cmp = + comparator_fn(key, key_size, sst->max_key, 
sst->max_key_size, comparator_ctx); + + /* mirror the klog get path, a reverse comparator stores min_key/max_key in reverse + * user order, so the range gate must invert or a reverse-sorted btree sstable + * rejects every in-range key */ + if (sst->is_reverse) + { + if (min_cmp > 0 || max_cmp < 0) return TDB_ERR_NOT_FOUND; + } + else + { + if (min_cmp < 0 || max_cmp > 0) return TDB_ERR_NOT_FOUND; + } + + if (sst->bloom_filter) + { + PROFILE_INC(db, bloom_checks); + if (!bloom_filter_contains(sst->bloom_filter, key, key_size)) + { + return TDB_ERR_NOT_FOUND; + } + PROFILE_INC(db, bloom_hits); + } + + btree_t tree = {.bm = bms.klog_bm, + .root_offset = sst->btree_root_offset, + .first_leaf_offset = sst->btree_first_leaf, + .last_leaf_offset = sst->btree_last_leaf, + .config = {.target_node_size = BTREE_DEFAULT_NODE_SIZE, + .value_threshold = sst->config->klog_value_threshold, + .comparator = (btree_comparator_fn)comparator_fn, + .comparator_ctx = comparator_ctx, + .cmp_type = comparator_fn ? 
BTREE_CMP_CUSTOM : BTREE_CMP_MEMCMP, + .compression_algo = sst->config->compression_algorithm}, + .node_cache = db->btree_node_cache, + .cache_key_prefix = sst->cache_key_prefix}; + + uint8_t *value = NULL; + size_t value_size = 0; + uint64_t vlog_offset = 0; + uint64_t seq = 0; + int64_t ttl = 0; + uint8_t deleted = 0; + + const int result = btree_get_at_seq(&tree, key, key_size, seq_ceiling, &value, &value_size, + &vlog_offset, &seq, &ttl, &deleted); + if (result != 0) + { + return TDB_ERR_NOT_FOUND; + } + + /* we return tombstones so caller can handle cross-level masking + * the caller (tidesdb_txn_get) needs to see tombstones to properly + * mask keys in lower levels */ + if (deleted) + { + *kv = tidesdb_kv_pair_create(key, key_size, NULL, 0, ttl, seq, 1); + free(value); + if (!*kv) return TDB_ERR_MEMORY; + return TDB_SUCCESS; + } + + /* we check TTL */ + if (ttl > 0) + { + const int64_t now = (int64_t)atomic_load(&db->cached_current_time); + if (now > ttl) + { + free(value); + return TDB_ERR_NOT_FOUND; + } + } + + /* if value is in vlog, read it */ + if (vlog_offset > 0) + { + free(value); /* free placeholder if any */ + value = NULL; + + block_manager_cursor_t vlog_cursor; + if (block_manager_cursor_init_stack(&vlog_cursor, bms.vlog_bm) != 0) + { + return TDB_ERR_IO; + } + + uint8_t *vlog_value = NULL; + size_t vlog_value_size = 0; + if (tidesdb_btree_read_vlog_value(&vlog_cursor, vlog_offset, sst->config, &vlog_value, + &vlog_value_size, value_size) != 0) + { + return TDB_ERR_IO; + } + value = vlog_value; + value_size = vlog_value_size; + } + + /* we create kv pair */ + tidesdb_kv_pair_t *pair = malloc(sizeof(tidesdb_kv_pair_t)); + if (!pair) + { + free(value); + return TDB_ERR_MEMORY; + } + + pair->key = malloc(key_size); + if (!pair->key) + { + free(value); + free(pair); + return TDB_ERR_MEMORY; + } + memcpy(pair->key, key, key_size); + pair->entry.key_size = (uint32_t)key_size; + pair->value = value; + pair->entry.value_size = (uint32_t)value_size; + 
pair->entry.ttl = ttl; + pair->entry.seq = seq; + pair->entry.vlog_offset = vlog_offset; + pair->entry.flags = 0; + + *kv = pair; + return TDB_SUCCESS; +} + +/* thread-local used by seq-only mode in tidesdb_sstable_get (kv=NULL). + * avoids struct changes and heap allocation for conflict detection. */ +static _Thread_local uint64_t tdb_sst_get_seq_out; + +/** + * tidesdb_sstable_get + * get a key-value pair from an sstable. + * when kv is NULL, operates in seq-only mode finds the key and stores + * its sequence number in tdb_sst_get_seq_out without allocating a kv pair + * or reading the value from vlog. used by conflict detection. + * @param db the database + * @param sst the sstable + * @param key the key + * @param key_size the size of the key + * @param seq_ceiling highest sequence number to consider (UINT64_MAX = newest) + * @param kv the key-value pair (NULL for seq-only mode) + * @param skip_bloom if nonzero, skip bloom filter check + */ +static int tidesdb_sstable_get(tidesdb_t *db, tidesdb_sstable_t *sst, const uint8_t *key, + const size_t key_size, const uint64_t seq_ceiling, + tidesdb_kv_pair_t **kv, const int skip_bloom) +{ + /* we branch based on sstable type. + * btree path does not support seq-only mode (kv=NULL), so fall back + * to full get + extract for btree sstables. 
*/ + if (sst->use_btree) + { + if (!kv) + { + tidesdb_kv_pair_t *tmp_kv = NULL; + const int rc = tidesdb_sstable_get_btree(db, sst, key, key_size, seq_ceiling, &tmp_kv); + if (rc == TDB_SUCCESS && tmp_kv) + { + tdb_sst_get_seq_out = tmp_kv->entry.seq; + tidesdb_kv_pair_free(tmp_kv); + } + return rc; + } + return tidesdb_sstable_get_btree(db, sst, key, key_size, seq_ceiling, kv); + } + + if (!sst->min_key || !sst->max_key) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "SSTable %" PRIu64 " has no min/max keys", sst->id); + return TDB_ERR_NOT_FOUND; + } + + /* we use cached comparator from sstable (resolved at load/create time) */ + skip_list_comparator_fn comparator_fn = sst->cached_comparator_fn; + void *comparator_ctx = sst->cached_comparator_ctx; + if (TDB_UNLIKELY(!comparator_fn)) + { + tidesdb_resolve_comparator(sst->db, sst->config, &comparator_fn, &comparator_ctx); + } + + const int min_cmp = + comparator_fn(key, key_size, sst->min_key, sst->min_key_size, comparator_ctx); + const int max_cmp = + comparator_fn(key, key_size, sst->max_key, sst->max_key_size, comparator_ctx); + + if (sst->is_reverse) + { + if (min_cmp > 0 || max_cmp < 0) return TDB_ERR_NOT_FOUND; + } + else + { + if (min_cmp < 0 || max_cmp > 0) return TDB_ERR_NOT_FOUND; + } + + /* we check bloom filter for early exit (after range check since bloom is more expensive). + * skip_bloom is set when boundary search at L1+ already identified this sstable, + * making the bloom check redundant. */ + if (sst->bloom_filter && !skip_bloom) + { + PROFILE_INC(db, bloom_checks); + if (!bloom_filter_contains(sst->bloom_filter, key, key_size)) + { + return TDB_ERR_NOT_FOUND; + } + PROFILE_INC(db, bloom_hits); + } + + /* we use cached CF name from sst struct to avoid repeated path parsing */ + const char *cf_name = sst->cf_name; + const int has_cf_name = (cf_name[0] != '\0'); + + /* we utilize block indexes to find the target klog block. 
+ * when block index covers all blocks (index_sample_ratio == 1), we do a + * single-block lookup -- no scan loop needed. this eliminates the O(N) scan + * that was the #1 source of slow reads (each scanned block triggers decompress + * + deserialize + cache_put). */ + uint64_t start_file_position = 0; + int block_index_definitive = 0; + uint64_t block_index_run_len = 0; + if (sst->block_indexes && sst->block_indexes->count > 0) + { + int64_t start_slot = 0; + if (compact_block_index_find_slot(sst->block_indexes, key, key_size, &start_slot) == 0) + { + start_file_position = sst->block_indexes->file_positions[start_slot]; + /* the prefix index is lossy -- keys sharing a prefix longer than + * prefix_len span multiple blocks with identical min/max prefixes. + * the run length is how many consecutive blocks the lookup must + * scan to be definitive, not just the first */ + block_index_run_len = + compact_block_index_run_length(sst->block_indexes, key, key_size, start_slot); + /* block index covers all blocks when count matches num_klog_blocks + * (index_sample_ratio == 1). in this case the lookup is definitive: + * scanning the prefix-colliding run is enough -- if the key isn't in + * any block of the run, it's not in the sstable. */ + block_index_definitive = + (sst->block_indexes->count >= sst->num_klog_blocks && start_file_position > 0); + } + } + + /* when the file is frozen (not local) and the block index gives us a definitive + * single-block position, use range_get to fetch just that one block from the + * object store instead of downloading the entire sstable file. this turns a + * multi-second full-file download into a single ~50ms HTTP range request for + * 64KB. only valid when the prefix-colliding run is a single block -- a longer + * run needs the full-download scan path below to cover every candidate block. 
*/ + if (db->object_store && block_index_definitive && start_file_position > 0 && + block_index_run_len <= 1 && !sst->klog_bm) + { + struct stat local_st; + if (stat(sst->klog_path, &local_st) != 0) + { + /* file not local -- we use range_get for this single block */ + block_manager_block_t *remote_block = NULL; + if (tidesdb_sstable_range_get_block(db, sst, start_file_position, &remote_block) != 0) + { + /* range_get failed, fall through to full download path */ + goto full_download_path; + } + + const uint8_t *search_data = remote_block->data; + size_t search_data_size = remote_block->size; + + /* we cache the block for future lookups */ + if (db->clock_cache && has_cf_name) + { + char cache_key[TDB_CACHE_KEY_SIZE]; + const size_t ck_len = tidesdb_block_cache_key( + cf_name, sst->klog_filename, start_file_position, cache_key, sizeof(cache_key)); + if (ck_len > 0) + { + uint8_t *indexed_data = NULL; + size_t indexed_size = 0; + if (tidesdb_build_indexed_block_data(search_data, search_data_size, + &indexed_data, &indexed_size) == 0) + { + tidesdb_cache_raw_block_put(db, cf_name, sst->klog_filename, + start_file_position, indexed_data, + indexed_size); + free(indexed_data); + } + else + { + tidesdb_cache_raw_block_put(db, cf_name, sst->klog_filename, + start_file_position, search_data, + search_data_size); + } + } + } + + tidesdb_klog_entry_t found_entry = {0}; + const uint8_t *found_key = NULL; + const uint8_t *found_value = NULL; + + const int search_rc = tidesdb_klog_block_search_raw( + search_data, search_data_size, key, key_size, seq_ceiling, comparator_fn, + comparator_ctx, &found_entry, &found_key, &found_value); + + if (search_rc != 0) + { + block_manager_block_release(remote_block); + return TDB_ERR_NOT_FOUND; + } + + const int is_tombstone = (found_entry.flags & TDB_KV_FLAG_TOMBSTONE); + + if (!is_tombstone && found_entry.ttl > 0) + { + const int64_t now = (int64_t)atomic_load(&db->cached_current_time); + if (now > found_entry.ttl) + { + 
block_manager_block_release(remote_block); + return TDB_ERR_NOT_FOUND; + } + } + + /* seq-only mode for remote path */ + if (!kv) + { + block_manager_block_release(remote_block); + tdb_sst_get_seq_out = found_entry.seq; + return TDB_SUCCESS; + } + + if (is_tombstone) + { + *kv = tidesdb_kv_pair_create(found_key, found_entry.key_size, NULL, 0, + found_entry.ttl, found_entry.seq, 1); + } + else if (found_entry.vlog_offset > 0) + { + *kv = tidesdb_kv_pair_create(found_key, found_entry.key_size, NULL, 0, + found_entry.ttl, found_entry.seq, 0); + if (*kv) + { + (*kv)->entry = found_entry; + (*kv)->entry.flags |= TDB_KV_FLAG_ARENA; + uint8_t *vlog_val = NULL; + if (tidesdb_vlog_range_get_value(db, sst, found_entry.vlog_offset, + found_entry.value_size, + &vlog_val) == TDB_SUCCESS) + { + (*kv)->value = vlog_val; + } + else + { + tidesdb_kv_pair_free(*kv); + *kv = NULL; + } + } + } + else + { + *kv = tidesdb_kv_pair_create(found_key, found_entry.key_size, found_value, + found_entry.value_size, found_entry.ttl, + found_entry.seq, 0); + if (*kv) + { + const uint8_t arena_flag = (*kv)->entry.flags & TDB_KV_FLAG_ARENA; + (*kv)->entry = found_entry; + (*kv)->entry.flags |= arena_flag; + } + } + + block_manager_block_release(remote_block); + + if (!*kv) return is_tombstone ? 
TDB_SUCCESS : TDB_ERR_MEMORY; + + PROFILE_INC(db, sstable_hits); + return TDB_SUCCESS; + } + } + +full_download_path: + /* file is local or range_get not applicable -- use standard ensure_open path */ + if (tidesdb_sstable_ensure_open(db, sst) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "SSTable %" PRIu64 " failed to ensure open", sst->id); + return TDB_ERR_IO; + } + + tidesdb_block_managers_t bms; + if (tidesdb_sstable_get_block_managers(db, sst, &bms) != TDB_SUCCESS) + { + return TDB_ERR_IO; + } + + /* we initialize cursor using stack allocation */ + block_manager_cursor_t klog_cursor_stack; + block_manager_cursor_t *klog_cursor = &klog_cursor_stack; + + if (block_manager_cursor_init_stack(klog_cursor, bms.klog_bm) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable %" PRIu64 " failed to initialize klog cursor", + sst->id); + return TDB_ERR_IO; + } + + if (start_file_position > 0) + { + block_manager_cursor_goto(klog_cursor, start_file_position); + } + else + { + block_manager_cursor_goto_first(klog_cursor); + } + + if (sst->klog_data_end_offset > 0 && klog_cursor->current_pos >= sst->klog_data_end_offset) + { + return TDB_ERR_NOT_FOUND; + } + + /* when block index is definitive we scan the prefix-colliding run -- one + * block for unique prefixes, a short contiguous run when keys share a prefix + * longer than prefix_len. still O(1) disk reads in the common case instead + * of the O(N) full scan. */ + const uint64_t max_blocks_to_scan = + block_index_definitive ? 
block_index_run_len : sst->num_klog_blocks; + + uint64_t blocks_scanned = 0; + + while (blocks_scanned < max_blocks_to_scan) + { + if (sst->klog_data_end_offset > 0 && klog_cursor->current_pos >= sst->klog_data_end_offset) + { + break; + } + + const uint64_t block_position = klog_cursor->current_pos; + + /* we get decompressed block bytes (zero-copy from cache, or from disk) */ + const uint8_t *search_data = NULL; + size_t search_data_size = 0; + clock_cache_entry_t *pinned_entry = NULL; /* zero-copy pin */ + block_manager_block_t *raw_block = NULL; + + if (db->clock_cache && has_cf_name) + { + char cache_key[TDB_CACHE_KEY_SIZE]; + const size_t ck_len = tidesdb_block_cache_key( + cf_name, sst->klog_filename, block_position, cache_key, sizeof(cache_key)); + if (ck_len > 0) + { + search_data = clock_cache_get_zero_copy(db->clock_cache, cache_key, ck_len, + &search_data_size, &pinned_entry); + if (search_data) + { + PROFILE_INC(db, cache_block_hits); + } + } + } + + if (!search_data) + { + /* cache miss -- read from disk. + * tidesdb_read_block_and_advance already decompresses internally, + * so raw_block->data is decompressed and ready for search+caching. 
*/ + PROFILE_INC(db, cache_block_misses); + PROFILE_INC(db, disk_reads); + + raw_block = tidesdb_read_block_and_advance(db, sst, klog_cursor); + if (!raw_block) + { + break; + } + PROFILE_INC(db, blocks_read); + + search_data = raw_block->data; + search_data_size = raw_block->size; + + if (db->clock_cache && has_cf_name) + { + uint8_t *indexed_data = NULL; + size_t indexed_size = 0; + if (tidesdb_build_indexed_block_data(search_data, search_data_size, &indexed_data, + &indexed_size) == 0) + { + tidesdb_cache_raw_block_put(db, cf_name, sst->klog_filename, block_position, + indexed_data, indexed_size); + free(indexed_data); + } + else + { + tidesdb_cache_raw_block_put(db, cf_name, sst->klog_filename, block_position, + search_data, search_data_size); + } + } + } + + /* we search the raw bytes directly (zero-copy -- no memcpy needed) */ + tidesdb_klog_entry_t found_entry = {0}; + const uint8_t *found_key = NULL; + const uint8_t *found_value = NULL; + + const int search_rc = tidesdb_klog_block_search_raw( + search_data, search_data_size, key, key_size, seq_ceiling, comparator_fn, + comparator_ctx, &found_entry, &found_key, &found_value); + + if (search_rc == 0) + { + /* found -- we build kv pair from the single entry. + * pointers (found_key, found_value) point into cache memory (pinned), + * so tidesdb_kv_pair_create copies them before we release the pin. */ + const int is_tombstone = (found_entry.flags & TDB_KV_FLAG_TOMBSTONE); + + /* we check TTL before allocating anything */ + if (!is_tombstone && found_entry.ttl > 0) + { + const int64_t now = (int64_t)atomic_load(&db->cached_current_time); + if (now > found_entry.ttl) + { + if (pinned_entry) clock_cache_release(pinned_entry); + if (raw_block) block_manager_block_release(raw_block); + return TDB_ERR_NOT_FOUND; + } + } + + /* in seq-only mode caller passed kv=NULL to signal they only need + * the entry metadata (seq, flags). skip value allocation, vlog reads, + * and kv_pair_create entirely. 
used by conflict detection. + * the seq is returned via tdb_sst_get_seq_out (file-scope thread-local). */ + if (!kv) + { + if (pinned_entry) clock_cache_release(pinned_entry); + if (raw_block) block_manager_block_release(raw_block); + tdb_sst_get_seq_out = found_entry.seq; + return TDB_SUCCESS; + } + + if (is_tombstone) + { + *kv = tidesdb_kv_pair_create(found_key, found_entry.key_size, NULL, 0, + found_entry.ttl, found_entry.seq, 1); + } + else if (found_entry.vlog_offset > 0) + { + /* vlog value -- we create kv without value, load from vlog */ + *kv = tidesdb_kv_pair_create(found_key, found_entry.key_size, NULL, 0, + found_entry.ttl, found_entry.seq, 0); + if (*kv) + { + (*kv)->entry = found_entry; + (*kv)->entry.flags |= TDB_KV_FLAG_ARENA; + uint8_t *vlog_val = NULL; + if (tidesdb_vlog_read_value(db, sst, found_entry.vlog_offset, + found_entry.value_size, &vlog_val) == TDB_SUCCESS) + { + (*kv)->value = vlog_val; + } + else + { + tidesdb_kv_pair_free(*kv); + *kv = NULL; + } + } + } + else + { + /* inline value -- tidesdb_kv_pair_create copies key+value from pinned memory */ + *kv = tidesdb_kv_pair_create(found_key, found_entry.key_size, found_value, + found_entry.value_size, found_entry.ttl, + found_entry.seq, 0); + if (*kv) + { + const uint8_t arena_flag = (*kv)->entry.flags & TDB_KV_FLAG_ARENA; + (*kv)->entry = found_entry; + (*kv)->entry.flags |= arena_flag; + } + } + + /* we release cache pin and disk block after kv_pair_create has copied the data */ + if (pinned_entry) clock_cache_release(pinned_entry); + if (raw_block) block_manager_block_release(raw_block); + + if (*kv) return TDB_SUCCESS; + return TDB_ERR_MEMORY; + } + + /* not found in this block -- release and try next */ + if (pinned_entry) clock_cache_release(pinned_entry); + + const int cursor_was_advanced = (raw_block != NULL); + if (raw_block) block_manager_block_release(raw_block); + + /* if search returned corruption, stop */ + if (search_rc == -2) + { + break; + } + + blocks_scanned++; + if 
(!cursor_was_advanced && block_manager_cursor_next(klog_cursor) != 0) + { + break; + } + } + + return TDB_ERR_NOT_FOUND; +} + +/** + * tidesdb_sstable_get_seq + * lightweight variant of tidesdb_sstable_get for conflict detection. + * returns only the sequence number of the matching key without allocating + * a kv pair or copying the value. this avoids the malloc+memcpy+free overhead + * that dominates the commit-time conflict check path. + * @param db database instance + * @param sst sstable to search + * @param key key to look up + * @param key_size key size + * @param out_seq output sequence number (set on success) + * @return TDB_SUCCESS if key found, TDB_ERR_NOT_FOUND otherwise + */ +static int tidesdb_sstable_get_seq(tidesdb_t *db, tidesdb_sstable_t *sst, const uint8_t *key, + const size_t key_size, uint64_t *out_seq) +{ + /* we call tidesdb_sstable_get with kv=NULL to trigger seq-only mode. + * this skips kv_pair_create, value memcpy, and vlog reads. + * the seq is returned via the file-scope thread-local tdb_sst_get_seq_out. + * seq-only mode feeds conflict detection, which needs the true newest + * version, so the ceiling is unbounded. 
*/ + const int result = tidesdb_sstable_get(db, sst, key, key_size, UINT64_MAX, NULL, 0); + if (result == TDB_SUCCESS) + { + *out_seq = tdb_sst_get_seq_out; + return TDB_SUCCESS; + } + return TDB_ERR_NOT_FOUND; +} + +/** + * tidesdb_sstable_load + * load an sstable from disk + * @param db database instance (can be NULL during startup) + * @param sst the sstable to load + * @return 0 on success, non-zero on failure + */ +static int tidesdb_sstable_load(tidesdb_t *db, tidesdb_sstable_t *sst) +{ + /* we open block managers temporarily for loading; they'll be managed by cache later */ + block_manager_t *klog_bm = NULL; + block_manager_t *vlog_bm = NULL; + + if (block_manager_open(&klog_bm, sst->klog_path, convert_sync_mode(sst->config->sync_mode)) != + 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Failed to open klog file %s (may be leftover from incomplete cleanup)", + sst->klog_path); + return -1; + } + + /* we validate klog file (strict mode -- reject any corruption) */ + if (block_manager_validate_last_block(klog_bm, BLOCK_MANAGER_STRICT_BLOCK_VALIDATION) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable klog file %s is corrupted", sst->klog_path); + block_manager_close(klog_bm); + return TDB_ERR_CORRUPTION; + } + + if (block_manager_open(&vlog_bm, sst->vlog_path, convert_sync_mode(sst->config->sync_mode)) != + 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Failed to open vlog file %s (may be leftover from incomplete cleanup)", + sst->vlog_path); + block_manager_close(klog_bm); + return -1; + } + + /* we validate vlog file (strict mode -- reject any corruption) */ + if (block_manager_validate_last_block(vlog_bm, BLOCK_MANAGER_STRICT_BLOCK_VALIDATION) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable vlog file %s is corrupted", sst->vlog_path); + block_manager_close(klog_bm); + block_manager_close(vlog_bm); + return TDB_ERR_CORRUPTION; + } + + block_manager_get_size(klog_bm, &sst->klog_size); + block_manager_get_size(vlog_bm, &sst->vlog_size); + + /* we check for empty or 
corrupted files */ + if (sst->klog_size == 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Empty klog file %s (corrupted or incomplete SSTable)", + sst->klog_path); + block_manager_close(klog_bm); + block_manager_close(vlog_bm); + return TDB_ERR_CORRUPTION; + } + + /* we read metadata from last block */ + block_manager_cursor_t *metadata_cursor; + int metadata_corrupt = 0; + if (block_manager_cursor_init(&metadata_cursor, klog_bm) == 0) + { + if (block_manager_cursor_goto_last(metadata_cursor) == 0) + { + block_manager_block_t *metadata_block = block_manager_cursor_read(metadata_cursor); + if (metadata_block && metadata_block->size > 0) + { + if (sstable_metadata_deserialize(metadata_block->data, metadata_block->size, sst) == + 0) + { + block_manager_block_release(metadata_block); + block_manager_cursor_free(metadata_cursor); + + if (sst->klog_data_end_offset > 0) + { + if (sst->klog_data_end_offset > sst->klog_size) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "SSTable %s metadata invalid: klog_data_end_offset " + "(%" PRIu64 ") > klog_size (%" PRIu64 ")", + sst->klog_path, sst->klog_data_end_offset, + sst->klog_size); + block_manager_close(klog_bm); + block_manager_close(vlog_bm); + return TDB_ERR_CORRUPTION; + } + + /* we must have at least block manager header before data */ + if (sst->klog_data_end_offset < BLOCK_MANAGER_HEADER_SIZE) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "SSTable %s metadata invalid: klog_data_end_offset " + "(%" PRIu64 ") < header size (%d)", + sst->klog_path, sst->klog_data_end_offset, + BLOCK_MANAGER_HEADER_SIZE); + block_manager_close(klog_bm); + block_manager_close(vlog_bm); + return TDB_ERR_CORRUPTION; + } + } + + /* we validate num_klog_blocks is reasonable */ + if (sst->num_klog_blocks > 0) + { + /* for sanity each block needs at least header + footer */ + uint64_t min_size_per_block = + BLOCK_MANAGER_BLOCK_HEADER_SIZE + BLOCK_MANAGER_FOOTER_SIZE; + uint64_t min_required_size = + BLOCK_MANAGER_HEADER_SIZE + (sst->num_klog_blocks * min_size_per_block); + 
+ if (sst->klog_data_end_offset > 0 && + sst->klog_data_end_offset < min_required_size) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "SSTable %s metadata invalid: claims %" PRIu64 + " blocks but klog_data_end_offset (%" PRIu64 + ") too small (min %" PRIu64 ")", + sst->klog_path, sst->num_klog_blocks, + sst->klog_data_end_offset, min_required_size); + block_manager_close(klog_bm); + block_manager_close(vlog_bm); + return TDB_ERR_CORRUPTION; + } + + /* we validate first data block is readable to detect incomplete ssts */ + block_manager_cursor_t *validate_cursor; + if (block_manager_cursor_init(&validate_cursor, klog_bm) == 0) + { + if (block_manager_cursor_goto_first(validate_cursor) == 0) + { + block_manager_block_t *first_block = + block_manager_cursor_read(validate_cursor); + if (!first_block || first_block->size == 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "SSTable %s first data block unreadable or empty", + sst->klog_path); + if (first_block) block_manager_block_release(first_block); + block_manager_cursor_free(validate_cursor); + block_manager_close(klog_bm); + block_manager_close(vlog_bm); + return TDB_ERR_CORRUPTION; + } + block_manager_block_release(first_block); + } + block_manager_cursor_free(validate_cursor); + } + } + + /* metadata loaded successfully, we skip reading min/max from blocks */ + goto load_bloom_and_index; + } + metadata_corrupt = 1; + block_manager_block_release(metadata_block); + } + } + block_manager_cursor_free(metadata_cursor); + } + + /* if metadata was found but corrupted, or if no metadata block exists, fail immediately */ + if (metadata_corrupt) + { + TDB_DEBUG_LOG(TDB_LOG_FATAL, "SSTable metadata corrupted for %s", sst->klog_path); + block_manager_close(klog_bm); + block_manager_close(vlog_bm); + return TDB_ERR_CORRUPTION; + } + + block_manager_close(klog_bm); + block_manager_close(vlog_bm); + return TDB_ERR_CORRUPTION; + +load_bloom_and_index:; /* empty statement for C89/C90 compatibility */ + /* load bloom filter and index from last 
blocks */ + /* [klog blocks...] [index block] [bloom filter block] [metadata block] */ + + if (sst->aux_chunked) + { + /* chunked footer -- bloom and index are located by explicit offset+size + * and may span multiple blocks. reassemble each via read_aux_blob, which + * refuses (NULL) an oversized blob rather than risking OOM. an unreadable + * blob degrades gracefully (no bloom -> full block scan; no index -> + * sequential scan). */ + if (sst->config && sst->config->enable_bloom_filter && sst->bloom_blob_size > 0) + { + uint8_t *bloom_buf = tidesdb_sstable_read_aux_blob(db, klog_bm, sst->bloom_blob_offset, + sst->bloom_blob_size); + if (bloom_buf) + { + sst->bloom_filter = bloom_filter_deserialize(bloom_buf, sst->bloom_blob_size); + free(bloom_buf); + } + else + { + sst->bloom_filter = NULL; + } + } + else + { + sst->bloom_filter = NULL; + } + + if (sst->config && !sst->config->use_btree && sst->index_blob_size > 0) + { + uint8_t *index_buf = tidesdb_sstable_read_aux_blob(db, klog_bm, sst->index_blob_offset, + sst->index_blob_size); + if (index_buf) + { + sst->block_indexes = + compact_block_index_deserialize(index_buf, sst->index_blob_size); + if (sst->block_indexes) + { + sst->block_indexes->comparator = sst->config->comparator_fn_cached; + sst->block_indexes->comparator_ctx = sst->config->comparator_ctx_cached; + } + free(index_buf); + } + } + } + else + { + block_manager_cursor_t *cursor; + if (block_manager_cursor_init(&cursor, klog_bm) != 0) + { + block_manager_close(klog_bm); + block_manager_close(vlog_bm); + return TDB_ERR_IO; + } + + /* we go to last block (metadata) and skip it */ + if (block_manager_cursor_goto_last(cursor) == 0) + { + /* we skip metadata block, go to bloom filter */ + if (block_manager_cursor_prev(cursor) == 0) + { + block_manager_block_t *bloom_block = block_manager_cursor_read(cursor); + if (bloom_block) + { + if (bloom_block->size > 0 && sst->config && sst->config->enable_bloom_filter) + { + sst->bloom_filter = + 
bloom_filter_deserialize(bloom_block->data, bloom_block->size); + } + else + { + sst->bloom_filter = NULL; + } + block_manager_block_release(bloom_block); + } + + /* go to index block -- skip for btree mode which uses its own + * B+tree traversal and does not need block indexes */ + if (block_manager_cursor_prev(cursor) == 0) + { + block_manager_block_t *index_block = block_manager_cursor_read(cursor); + if (index_block) + { + if (index_block->size > 0 && !sst->config->use_btree) + { + sst->block_indexes = compact_block_index_deserialize(index_block->data, + index_block->size); + + /* we use cached comparator from config (already resolved during CF + * creation) this avoids hash table lookup for every sst during + * recovery */ + if (sst->block_indexes) + { + sst->block_indexes->comparator = sst->config->comparator_fn_cached; + sst->block_indexes->comparator_ctx = + sst->config->comparator_ctx_cached; + } + } + block_manager_block_release(index_block); + } + } + } + } + + block_manager_cursor_free(cursor); + } + + /* we keep block managers open and store them in the sstable + * they will be managed by the cache and closed when the sstable is evicted or freed */ + sst->klog_bm = klog_bm; + sst->vlog_bm = vlog_bm; + + /* we cache resolved comparator on the sstable to avoid per-lookup resolution */ + sst->cached_comparator_fn = NULL; + sst->cached_comparator_ctx = NULL; + sst->is_reverse = 0; + if (db && sst->config) + { + tidesdb_resolve_comparator(db, sst->config, &sst->cached_comparator_fn, + &sst->cached_comparator_ctx); + + /* we cache is_reverse to avoid recomputing on every klog get */ + if (sst->cached_comparator_fn && sst->min_key && sst->max_key) + { + const int min_max_cmp = + sst->cached_comparator_fn(sst->min_key, sst->min_key_size, sst->max_key, + sst->max_key_size, sst->cached_comparator_ctx); + sst->is_reverse = (min_max_cmp > 0); + } + } + + /* we track that this file is now open */ + if (db) + { + atomic_store(&sst->last_access_time, + 
atomic_load_explicit(&db->cached_current_time, memory_order_relaxed)); + atomic_fetch_add(&db->num_open_sstables, 1); + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_level_create + * create a new level + * @param level_num level number + * @param capacity capacity of level + * @return level on success, NULL on failure + */ +static tidesdb_level_t *tidesdb_level_create(const int level_num, size_t capacity) +{ + TDB_DEBUG_LOG(TDB_LOG_INFO, "Creating level %d with capacity %zu", level_num, capacity); + + tidesdb_level_t *level = calloc(1, sizeof(tidesdb_level_t)); + if (!level) return NULL; + + level->level_num = level_num; + atomic_init(&level->capacity, capacity); + atomic_init(&level->current_size, 0); + + tidesdb_sstable_t **sstables = + calloc(TDB_MIN_LEVEL_SSTABLES_INITIAL_CAPACITY + 1, sizeof(tidesdb_sstable_t *)); + if (!sstables) + { + free(level); + return NULL; + } + + atomic_init(&level->sstables, sstables); + atomic_init(&level->num_sstables, 0); + atomic_init(&level->sstables_capacity, TDB_MIN_LEVEL_SSTABLES_INITIAL_CAPACITY); + atomic_init(&level->num_boundaries, 0); + atomic_init(&level->retired_sstables_arr, NULL); + atomic_init(&level->array_readers, 0); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Level %d created with capacity %zu", level_num, capacity); + + return level; +} + +/** + * tidesdb_level_free + * free a level + * @param db database + * @param level level to free + */ +static void tidesdb_level_free(const tidesdb_t *db, tidesdb_level_t *level) +{ + if (!level) return; + + int num_ssts = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + tidesdb_sstable_t **ssts = atomic_load_explicit(&level->sstables, memory_order_acquire); + + for (int i = 0; i < num_ssts; i++) + { + if (ssts[i]) + { + /* freeing the level drops these sstables without going through + * tidesdb_level_remove_sstable, so decrement the aux memory total here */ + if (db) + atomic_fetch_sub_explicit(&((tidesdb_t *)db)->sstable_aux_memory_bytes, + 
tidesdb_sstable_aux_memory_bytes(ssts[i]), + memory_order_relaxed); + tidesdb_sstable_unref(db, ssts[i]); + } + } + + free(ssts); + + /* we free any retired array that was deferred */ + tidesdb_sstable_t **retired = + atomic_load_explicit(&level->retired_sstables_arr, memory_order_acquire); + free(retired); + int num_boundaries = atomic_load_explicit(&level->num_boundaries, memory_order_acquire); + uint8_t **file_boundaries = atomic_load_explicit(&level->file_boundaries, memory_order_acquire); + size_t *boundary_sizes = atomic_load_explicit(&level->boundary_sizes, memory_order_acquire); + + for (int i = 0; i < num_boundaries; i++) + { + free(file_boundaries[i]); /* free individual boundary entries */ + } + + free(file_boundaries); /* then free the array itself */ + free(boundary_sizes); + + free(level); +} + +/** + * tidesdb_deferred_free_enqueue + * push a retired array onto the lock-free deferred free list + * called by flush/compaction workers when they cannot immediately free a retired array + * @param db the database + * @param ptr pointer to free when safe + * @param level level whose array_readers must reach 0 before freeing ptr + */ +static void tidesdb_deferred_free_enqueue(tidesdb_t *db, void *ptr, tidesdb_level_t *level) +{ + tidesdb_deferred_free_node_t *node = malloc(sizeof(tidesdb_deferred_free_node_t)); + if (!node) + { + /* last resort spin-wait and free inline if allocation fails */ + while (atomic_load_explicit(&level->array_readers, memory_order_acquire) > 0) + { + cpu_yield(); + } + free(ptr); + return; + } + + node->ptr = ptr; + node->level = level; + node->sst_unrefs = NULL; + node->sst_unrefs_count = 0; + node->db = NULL; + + /* we push onto head of singly-linked list */ + tidesdb_deferred_free_node_t *old_head = + atomic_load_explicit(&db->deferred_free_list, memory_order_acquire); + do + { + node->next = old_head; + } while (!atomic_compare_exchange_weak_explicit(&db->deferred_free_list, &old_head, node, + memory_order_release, 
memory_order_acquire)); +} + +/** + * tidesdb_deferred_free_sweep + * sweep the deferred free list, freeing entries whose level has no active readers + * entries that still have active readers are re-enqueued for the next sweep + * called periodically by the reaper thread + * @param db the database + */ +static void tidesdb_deferred_free_sweep(tidesdb_t *db) +{ + /* atomically steal the entire list */ + tidesdb_deferred_free_node_t *list = + atomic_exchange_explicit(&db->deferred_free_list, NULL, memory_order_acq_rel); + + if (!list) return; + + tidesdb_deferred_free_node_t *current = list; + while (current) + { + tidesdb_deferred_free_node_t *next = current->next; + + if (atomic_load_explicit(¤t->level->array_readers, memory_order_acquire) == 0) + { + /* safe to free -- also unref any deferred sstables */ + if (current->sst_unrefs_count > 0 && current->sst_unrefs) + { + for (int i = 0; i < current->sst_unrefs_count; i++) + { + tidesdb_sstable_unref(current->db, current->sst_unrefs[i]); + } + free(current->sst_unrefs); + } + free(current->ptr); + free(current); + } + else + { + /* still has readers, re-enqueue */ + tidesdb_deferred_free_node_t *old_head = + atomic_load_explicit(&db->deferred_free_list, memory_order_acquire); + do + { + current->next = old_head; + } while (!atomic_compare_exchange_weak_explicit(&db->deferred_free_list, &old_head, + current, memory_order_release, + memory_order_acquire)); + } + + current = next; + } +} + +/** + * tidesdb_deferred_free_drain + * force-drain all entries in the deferred free list (used during shutdown) + * spins briefly on each entry until array_readers reaches 0 + * @param db the database + */ +static void tidesdb_deferred_free_drain(tidesdb_t *db) +{ + tidesdb_deferred_free_node_t *list = + atomic_exchange_explicit(&db->deferred_free_list, NULL, memory_order_acq_rel); + + while (list) + { + tidesdb_deferred_free_node_t *next = list->next; + + while (atomic_load_explicit(&list->level->array_readers, 
memory_order_acquire) > 0) + { + cpu_yield(); + } + + if (list->sst_unrefs_count > 0 && list->sst_unrefs) + { + for (int i = 0; i < list->sst_unrefs_count; i++) + { + tidesdb_sstable_unref(list->db, list->sst_unrefs[i]); + } + free(list->sst_unrefs); + } + free(list->ptr); + free(list); + list = next; + } +} + +/** + * tidesdb_deferred_free_drain_for_cf + * drain deferred free entries whose level belongs to the given cf so the cf's + * levels can be released without the reaper later dereferencing a freed level. + * walks the list once, frees items pointing at any of cf->levels[i], and + * re-enqueues everything else for the regular reaper sweep to handle. + * + * the caller must hold db->reaper_thread_mutex around this call -- otherwise + * the reaper could be mid-walk holding items for this cf in its locally-stolen + * list and UAF on level->array_readers once tidesdb_column_family_free + * releases the level structs. + * + * @param db database handle + * @param cf column family whose pending items should be drained now + */ +static void tidesdb_deferred_free_drain_for_cf(tidesdb_t *db, tidesdb_column_family_t *cf) +{ + tidesdb_deferred_free_node_t *list = + atomic_exchange_explicit(&db->deferred_free_list, NULL, memory_order_acq_rel); + tidesdb_deferred_free_node_t *keep = NULL; + + while (list) + { + tidesdb_deferred_free_node_t *next = list->next; + + int is_ours = 0; + for (int i = 0; i < TDB_MAX_LEVELS; i++) + { + if (cf->levels[i] && cf->levels[i] == list->level) + { + is_ours = 1; + break; + } + } + + if (is_ours) + { + /* drop has already drained is_compacting, writers, and + * active_mt_readers, and the caller contract is that no + * iterators/gets remain on a dropped cf, so array_readers + * should be 0. 
spin defensively in case the contract was + * violated -- better to hang drop than UAF */ + while (atomic_load_explicit(&list->level->array_readers, memory_order_acquire) > 0) + { + cpu_yield(); + } + if (list->sst_unrefs_count > 0 && list->sst_unrefs) + { + for (int i = 0; i < list->sst_unrefs_count; i++) + { + tidesdb_sstable_unref(list->db, list->sst_unrefs[i]); + } + free(list->sst_unrefs); + } + free(list->ptr); + free(list); + } + else + { + list->next = keep; + keep = list; + } + list = next; + } + + /* re-enqueue items we kept onto the lock-free list */ + while (keep) + { + tidesdb_deferred_free_node_t *next = keep->next; + tidesdb_deferred_free_node_t *old_head = + atomic_load_explicit(&db->deferred_free_list, memory_order_acquire); + do + { + keep->next = old_head; + } while (!atomic_compare_exchange_weak_explicit( + &db->deferred_free_list, &old_head, keep, memory_order_release, memory_order_acquire)); + keep = next; + } +} + +/** + * tidesdb_retire_array + * retire an old sstable array pointer, attempting a brief spin before deferring + * @param db the database (NULL to force inline spin-wait) + * @param prev_retired the previously retired array to free + * @param level the level whose array_readers guards this pointer + */ +static void tidesdb_retire_array(tidesdb_t *db, void *prev_retired, tidesdb_level_t *level) +{ + if (!prev_retired) return; + + /* brief spin, handles the common case where readers finish quickly */ + for (int i = 0; i < TDB_DEFERRED_FREE_SPIN_ATTEMPTS; i++) + { + if (atomic_load_explicit(&level->array_readers, memory_order_acquire) == 0) + { + free(prev_retired); + return; + } + cpu_pause(); + } + + /* readers still active after brief spin -- we defer to reaper thread */ + if (db) + { + tidesdb_deferred_free_enqueue(db, prev_retired, level); + } + else + { + /* no db handle, must spin (should not happen in practice) */ + while (atomic_load_explicit(&level->array_readers, memory_order_acquire) > 0) + { + cpu_yield(); + } + 
free(prev_retired);
+    }
+}
+
+/**
+ * tidesdb_defer_removed_sst_unref
+ * defer the unref of a removed sstable until array_readers drains to 0
+ * prevents use-after-free when readers hold raw pointers from the old array
+ *
+ * three outcomes, tried in order:
+ *   1. array_readers is observed at 0 within TDB_DEFERRED_FREE_SPIN_ATTEMPTS
+ *      checks -- the sstable is unreffed immediately
+ *   2. otherwise a deferred-free node (ptr == NULL, one-element sst_unrefs) is
+ *      pushed onto db->deferred_free_list with a compare-exchange loop
+ *   3. if either allocation for that node fails, the call waits until
+ *      array_readers is 0 and unrefs inline
+ * @param db the database (dereferenced for deferred_free_list when deferring)
+ * @param level the level whose array_readers guards reader access
+ * @param sst the removed sstable to defer unreffing
+ */
+static void tidesdb_defer_removed_sst_unref(tidesdb_t *db, tidesdb_level_t *level,
+                                            tidesdb_sstable_t *sst)
+{
+    /* brief spin -- handles common case where readers finish quickly */
+    for (int i = 0; i < TDB_DEFERRED_FREE_SPIN_ATTEMPTS; i++)
+    {
+        if (atomic_load_explicit(&level->array_readers, memory_order_acquire) == 0)
+        {
+            tidesdb_sstable_unref(db, sst);
+            return;
+        }
+        cpu_pause();
+    }
+
+    /* readers still active, we defer to reaper thread.
+     * the one-element unrefs array is only allocated when the node allocation
+     * succeeded, so at most one of the two can be live on the failure path */
+    tidesdb_deferred_free_node_t *node = malloc(sizeof(tidesdb_deferred_free_node_t));
+    tidesdb_sstable_t **unrefs = node ? malloc(sizeof(tidesdb_sstable_t *)) : NULL;
+
+    if (!node || !unrefs)
+    {
+        /* allocation failed, we must spin-wait until no reader is counted, then
+         * drop the reference inline; free() on whichever pointer is NULL is a no-op */
+        while (atomic_load_explicit(&level->array_readers, memory_order_acquire) > 0) cpu_yield();
+        tidesdb_sstable_unref(db, sst);
+        free(node);
+        free(unrefs);
+        return;
+    }
+
+    unrefs[0] = sst;
+    node->ptr = NULL;
+    node->level = level;
+    node->sst_unrefs = unrefs;
+    node->sst_unrefs_count = 1;
+    node->db = db;
+
+    tidesdb_deferred_free_node_t *old_head =
+        atomic_load_explicit(&db->deferred_free_list, memory_order_acquire);
+    do
+    {
+        node->next = old_head;
+    } while (!atomic_compare_exchange_weak_explicit(&db->deferred_free_list, &old_head, node,
+                                                    memory_order_release, memory_order_acquire));
+}
+
+/**
+ * tidesdb_sstable_aux_memory_bytes
+ * resident bloom filter + block index memory of one sstable. bloom filters and
+ * block indexes are immutable for the sstable's lifetime, so this is stable
+ * between level add and level remove and the running total stays exact.
+ * @param sst sstable + * @return bloom filter + block index bytes + */ +static int64_t tidesdb_sstable_aux_memory_bytes(const tidesdb_sstable_t *sst) +{ + int64_t bytes = 0; + if (sst->bloom_filter) + { + bytes += + (int64_t)(sst->bloom_filter->size_in_words * sizeof(uint64_t) + sizeof(bloom_filter_t)); + } + if (sst->block_indexes) + { + bytes += (int64_t)((size_t)sst->block_indexes->count * + (sst->block_indexes->prefix_len * 2 + sizeof(uint64_t)) + + sizeof(tidesdb_block_index_t)); + } + return bytes; +} + +/** + * tidesdb_level_add_sstable + * add an sstable to a level + * @param level level to add sstable to + * @param sst sstable to add + * @return 0 on success, non-zero on failure + */ +static int tidesdb_level_add_sstable(tidesdb_level_t *level, tidesdb_sstable_t *sst) +{ + /* we upload sstable files synchronously before tracking in the local cache. + * this ensures the object store has a copy before cache eviction can delete + * the local file (the eviction path unlinks cold files from disk). + * a replica never creates sstables -- its adds come from sync/cold-start and + * already exist remotely, so re-uploading wastes bandwidth and, on an incomplete + * local copy, could clobber good remote data. it can also always re-fetch on + * eviction, so only primaries push to the store. 
*/ + if (sst->db && sst->db->object_store) + { + if (!atomic_load_explicit(&sst->db->replica_mode, memory_order_acquire)) + { + tdb_objstore_upload_file_sync(sst->db, sst->klog_path); + tdb_objstore_upload_file_sync(sst->db, sst->vlog_path); + } + if (sst->db->local_cache) + { + tdb_local_cache_track(sst->db->local_cache, sst->klog_path); + tdb_local_cache_track(sst->db->local_cache, sst->vlog_path); + } + } + + tidesdb_sstable_ref(sst); + + while (1) + { + /* we hold array_readers while accessing the sstables array to prevent + * tidesdb_retire_array from freeing the array under us */ + atomic_fetch_add_explicit(&level->array_readers, 1, memory_order_acq_rel); + + int old_num = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + int old_capacity = atomic_load_explicit(&level->sstables_capacity, memory_order_acquire); + tidesdb_sstable_t **old_arr = atomic_load_explicit(&level->sstables, memory_order_acquire); + + /* we spin until (old_arr, old_num) are consistent + * another writer may have CAS'd a new array but not yet updated num_sstables */ + if (old_arr[old_num] != NULL) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + cpu_pause(); + continue; + } + if (old_num > 0 && old_arr[old_num - 1] == NULL) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + cpu_pause(); + continue; + } + + /* we check if we need to grow the array */ + if (old_num >= old_capacity) + { + int new_capacity = + old_capacity == 0 ? 
TDB_MIN_LEVEL_SSTABLES_INITIAL_CAPACITY : old_capacity * 2; + tidesdb_sstable_t **new_arr = calloc(new_capacity + 1, sizeof(tidesdb_sstable_t *)); + if (!new_arr) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + tidesdb_sstable_unref(sst->db, sst); + return TDB_ERR_MEMORY; + } + + memcpy(new_arr, old_arr, old_num * sizeof(tidesdb_sstable_t *)); + + new_arr[old_num] = sst; + + if (atomic_compare_exchange_strong_explicit(&level->sstables, &old_arr, new_arr, + memory_order_release, memory_order_acquire)) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + + atomic_store_explicit(&level->sstables_capacity, new_capacity, + memory_order_release); + atomic_store_explicit(&level->num_sstables, old_num + 1, memory_order_release); + + atomic_fetch_add_explicit(&level->current_size, sst->klog_size + sst->vlog_size, + memory_order_relaxed); + atomic_fetch_add_explicit(&sst->db->sstable_aux_memory_bytes, + tidesdb_sstable_aux_memory_bytes(sst), + memory_order_relaxed); + + tidesdb_sstable_t **prev_retired = atomic_exchange_explicit( + &level->retired_sstables_arr, old_arr, memory_order_acq_rel); + tidesdb_retire_array(sst->db, prev_retired, level); + + return TDB_SUCCESS; + } + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + free(new_arr); + } + else + { + int expected = old_num; + + if (expected >= old_capacity) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + continue; + } + + tidesdb_sstable_t **new_arr = calloc(old_capacity + 1, sizeof(tidesdb_sstable_t *)); + if (!new_arr) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + tidesdb_sstable_unref(sst->db, sst); + return TDB_ERR_MEMORY; + } + + memcpy(new_arr, old_arr, old_num * sizeof(tidesdb_sstable_t *)); + new_arr[old_num] = sst; + + if (atomic_compare_exchange_strong_explicit(&level->sstables, &old_arr, new_arr, + memory_order_release, memory_order_acquire)) + { + 
atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + + atomic_thread_fence(memory_order_seq_cst); + atomic_store_explicit(&level->num_sstables, old_num + 1, memory_order_release); + + atomic_fetch_add_explicit(&level->current_size, sst->klog_size + sst->vlog_size, + memory_order_relaxed); + atomic_fetch_add_explicit(&sst->db->sstable_aux_memory_bytes, + tidesdb_sstable_aux_memory_bytes(sst), + memory_order_relaxed); + + tidesdb_sstable_t **prev_retired = atomic_exchange_explicit( + &level->retired_sstables_arr, old_arr, memory_order_acq_rel); + tidesdb_retire_array(sst->db, prev_retired, level); + + return TDB_SUCCESS; + } + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + free(new_arr); + } + } +} + +/** + * tidesdb_level_remove_sstable + * remove an sstable from a level + * @param db database instance (for cache removal) + * @param level level to remove sstable from + * @param sst sstable to remove + * @return 0 on success, non-zero on failure + */ +static int tidesdb_level_remove_sstable(const tidesdb_t *db, tidesdb_level_t *level, + tidesdb_sstable_t *sst) +{ + while (1) + { + /* we hold array_readers while accessing the sstables array to prevent + * tidesdb_retire_array from freeing the array under us. without this, + * a concurrent remove on the same level could CAS a new array, retire + * the old one, see array_readers==0, and free it while we still hold + * a raw pointer -- causing a use-after-free crash. 
*/ + atomic_fetch_add_explicit(&level->array_readers, 1, memory_order_acq_rel); + + int old_num = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + const int old_capacity = + atomic_load_explicit(&level->sstables_capacity, memory_order_acquire); + tidesdb_sstable_t **old_arr = atomic_load_explicit(&level->sstables, memory_order_acquire); + + /* we spin until (old_arr, old_num) are consistent + * another writer may have CAS'd a new array but not yet updated num_sstables */ + if (old_arr[old_num] != NULL) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + cpu_pause(); + continue; + } + if (old_num > 0 && old_arr[old_num - 1] == NULL) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + cpu_pause(); + continue; + } + + int found_idx = -1; + for (int i = 0; i < old_num; i++) + { + if (old_arr[i] == sst) + { + found_idx = i; + break; + } + } + + if (found_idx == -1) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + return TDB_ERR_NOT_FOUND; + } + + tidesdb_sstable_t **new_arr = calloc(old_capacity + 1, sizeof(tidesdb_sstable_t *)); + if (!new_arr) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + return TDB_ERR_MEMORY; + } + + int new_idx = 0; + for (int i = 0; i < old_num; i++) + { + if (i != found_idx) + { + new_arr[new_idx] = old_arr[i]; + tidesdb_sstable_ref(new_arr[new_idx]); + new_idx++; + } + } + + /* for remove -- swap array first, then update count + * readers use pattern -- load array, load count, re-load count, use min(count1, count2) + * this handles both add-with-resize (array changes, count increases) and + * remove (array changes, count decreases) races safely */ + if (atomic_compare_exchange_strong_explicit(&level->sstables, &old_arr, new_arr, + memory_order_release, memory_order_acquire)) + { + /* array swapped.. 
we release reader count before retiring since + * retire_array checks array_readers==0 to decide whether to free */ + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + + /* we now update count */ + atomic_thread_fence(memory_order_seq_cst); + atomic_store_explicit(&level->num_sstables, new_idx, memory_order_release); + + /* success! we update size */ + atomic_fetch_sub_explicit(&level->current_size, sst->klog_size + sst->vlog_size, + memory_order_relaxed); + atomic_fetch_sub_explicit(&((tidesdb_t *)db)->sstable_aux_memory_bytes, + tidesdb_sstable_aux_memory_bytes(sst), memory_order_relaxed); + + /* we unref old array's surviving sstables immediately (safe--new array holds refs) + * but skip the removed sstable -- readers may still hold raw pointers from old array + * and would hit use-after-free if we unref it to 0 before they call try_ref */ + for (int i = 0; i < old_num; i++) + { + if (i == found_idx) continue; + tidesdb_sstable_unref(db, old_arr[i]); + } + + tidesdb_sstable_t **prev_retired = atomic_exchange_explicit( + &level->retired_sstables_arr, old_arr, memory_order_acq_rel); + tidesdb_retire_array((tidesdb_t *)db, prev_retired, level); + + /* we defer the removed sstables unref until array_readers drains to 0 */ + tidesdb_defer_removed_sst_unref((tidesdb_t *)db, level, sst); + + return TDB_SUCCESS; + } + /* CAS failed, we release reader count, cleanup and retry */ + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + for (int i = 0; i < new_idx; i++) + { + tidesdb_sstable_unref(db, new_arr[i]); + } + free(new_arr); + } +} + +/** + * tidesdb_level_remove_sstables_batch + * excise every sstable in the to_remove set that is currently in this level, in a single + * atomic array swap. 
removing a merge's same-level inputs one at a time lets a concurrent + * point get observe a level holding an input's older put without its tombstone -- the get + * stops at the first level that has the key, so it returns the orphaned put and a deleted + * key reappears until compaction settles. one swap means a reader sees all of this level's + * merged inputs or none of them. + * @param db database instance + * @param level level to remove from + * @param to_remove set of sstables to remove + * @param to_remove_count size of the set + * @param out_removed per-entry flags, set to 1 for each to_remove[j] excised here + * @return TDB_SUCCESS if any were removed, TDB_ERR_NOT_FOUND if none, TDB_ERR_MEMORY on alloc fail + */ +static int tidesdb_level_remove_sstables_batch(const tidesdb_t *db, tidesdb_level_t *level, + tidesdb_sstable_t **to_remove, int to_remove_count, + uint8_t *out_removed) +{ + while (1) + { + atomic_fetch_add_explicit(&level->array_readers, 1, memory_order_acq_rel); + + int old_num = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + const int old_capacity = + atomic_load_explicit(&level->sstables_capacity, memory_order_acquire); + tidesdb_sstable_t **old_arr = atomic_load_explicit(&level->sstables, memory_order_acquire); + + /* we spin until (old_arr, old_num) are consistent -- see tidesdb_level_remove_sstable */ + if (old_arr[old_num] != NULL) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + cpu_pause(); + continue; + } + if (old_num > 0 && old_arr[old_num - 1] == NULL) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + cpu_pause(); + continue; + } + + tidesdb_sstable_t **new_arr = calloc(old_capacity + 1, sizeof(tidesdb_sstable_t *)); + if (!new_arr) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + return TDB_ERR_MEMORY; + } + + int new_idx = 0; + int removed_here = 0; + for (int i = 0; i < old_num; i++) + { + int rm = 0; + for 
(int j = 0; j < to_remove_count; j++) + { + if (old_arr[i] == to_remove[j]) + { + rm = 1; + break; + } + } + if (rm) + { + removed_here++; + continue; + } + new_arr[new_idx] = old_arr[i]; + tidesdb_sstable_ref(new_arr[new_idx]); + new_idx++; + } + + if (removed_here == 0) + { + /* none of the targets are in this level -- new_arr already holds a ref on every + * survivor from the build loop above; drop those before discarding new_arr or the + * level's sstables leak a ref each (cleanup calls this for every level in range) */ + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + for (int i = 0; i < new_idx; i++) + { + tidesdb_sstable_unref(db, new_arr[i]); + } + free(new_arr); + return TDB_ERR_NOT_FOUND; + } + + if (atomic_compare_exchange_strong_explicit(&level->sstables, &old_arr, new_arr, + memory_order_release, memory_order_acquire)) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + + atomic_thread_fence(memory_order_seq_cst); + atomic_store_explicit(&level->num_sstables, new_idx, memory_order_release); + + /* old_arr must be fully consumed before the atomic_exchange below puts it into + * retired_sstables_arr -- once retired, a concurrent tidesdb_level_add_sstable + * can retire it again and free it, so reading old_arr afterward is a use after + * free. we unref the survivors and resolve every removed sstable here first. 
*/ + for (int i = 0; i < old_num; i++) + { + int rm = 0; + for (int j = 0; j < to_remove_count; j++) + { + if (old_arr[i] == to_remove[j]) + { + rm = 1; + break; + } + } + if (!rm) tidesdb_sstable_unref(db, old_arr[i]); + } + + /* for each sstable excised in this swap we account the freed space and defer its + * unref until array_readers drains, since readers may still hold raw pointers + * from the old array */ + for (int j = 0; j < to_remove_count; j++) + { + int was_here = 0; + for (int i = 0; i < old_num; i++) + { + if (old_arr[i] == to_remove[j]) + { + was_here = 1; + break; + } + } + if (!was_here) continue; + out_removed[j] = 1; + atomic_fetch_sub_explicit(&level->current_size, + to_remove[j]->klog_size + to_remove[j]->vlog_size, + memory_order_relaxed); + atomic_fetch_sub_explicit(&((tidesdb_t *)db)->sstable_aux_memory_bytes, + tidesdb_sstable_aux_memory_bytes(to_remove[j]), + memory_order_relaxed); + tidesdb_defer_removed_sst_unref((tidesdb_t *)db, level, to_remove[j]); + } + + tidesdb_sstable_t **prev_retired = atomic_exchange_explicit( + &level->retired_sstables_arr, old_arr, memory_order_acq_rel); + tidesdb_retire_array((tidesdb_t *)db, prev_retired, level); + + return TDB_SUCCESS; + } + + /* CAS failed, we release reader count, cleanup and retry */ + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + for (int i = 0; i < new_idx; i++) + { + tidesdb_sstable_unref(db, new_arr[i]); + } + free(new_arr); + } +} + +/** + * tidesdb_level_sort_by_min_key + * reorder a level's sstable array ascending by min_key via a single CAS swap. + * the spooky 4.3 skew optimization leaves skipped largest-level files at their + * old slots while merged partitions append new files, so the array can fall out + * of key order -- the next partitioned merge derives its partition boundaries + * from this array and needs it sorted. the caller holds the cf compaction lock + * so no other writer races; concurrent readers are safe across the CAS. 
+ * @param db database instance + * @param level level whose sstable array is reordered + * @param cmp resolved comparator + * @param cmp_ctx resolved comparator context + */ +static int tidesdb_level_sort_by_min_key(tidesdb_t *db, tidesdb_level_t *level, + skip_list_comparator_fn cmp, void *cmp_ctx) +{ + while (1) + { + atomic_fetch_add_explicit(&level->array_readers, 1, memory_order_acq_rel); + + const int num = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + const int capacity = atomic_load_explicit(&level->sstables_capacity, memory_order_acquire); + tidesdb_sstable_t **old_arr = atomic_load_explicit(&level->sstables, memory_order_acquire); + + /* we spin until (old_arr, num) are consistent -- a concurrent writer + * may have CAS'd a new array but not yet updated num_sstables */ + if (old_arr[num] != NULL || (num > 0 && old_arr[num - 1] == NULL)) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + cpu_pause(); + continue; + } + + if (num < 2) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + return TDB_SUCCESS; + } + + tidesdb_sstable_t **new_arr = calloc(capacity + 1, sizeof(tidesdb_sstable_t *)); + if (!new_arr) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + return TDB_ERR_MEMORY; + } + memcpy(new_arr, old_arr, num * sizeof(tidesdb_sstable_t *)); + + /* insertion sort -- a level holds few sstables and most are already in + * order, so this is effectively linear */ + for (int i = 1; i < num; i++) + { + tidesdb_sstable_t *cur = new_arr[i]; + int j = i - 1; + while (j >= 0 && new_arr[j] && cur && new_arr[j]->min_key && cur->min_key && + cmp(new_arr[j]->min_key, new_arr[j]->min_key_size, cur->min_key, + cur->min_key_size, cmp_ctx) > 0) + { + new_arr[j + 1] = new_arr[j]; + j--; + } + new_arr[j + 1] = cur; + } + + for (int i = 0; i < num; i++) tidesdb_sstable_ref(new_arr[i]); + + if (atomic_compare_exchange_strong_explicit(&level->sstables, 
&old_arr, new_arr, + memory_order_release, memory_order_acquire)) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + + /* the sstable set is unchanged -- num_sstables and current_size stay + * the same, only the order differs */ + for (int i = 0; i < num; i++) tidesdb_sstable_unref(db, old_arr[i]); + + tidesdb_sstable_t **prev_retired = atomic_exchange_explicit( + &level->retired_sstables_arr, old_arr, memory_order_acq_rel); + tidesdb_retire_array(db, prev_retired, level); + return TDB_SUCCESS; + } + + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + for (int i = 0; i < num; i++) tidesdb_sstable_unref(db, new_arr[i]); + free(new_arr); + } +} + +/** + * tidesdb_bump_sstable_layout_version + * atomically increments the sstable layout version to signal iterators to rebuild caches + * @param cf column family + */ +static void tidesdb_bump_sstable_layout_version(tidesdb_column_family_t *cf) +{ + atomic_fetch_add_explicit(&cf->sstable_layout_version, 1, memory_order_release); +} + +/** + * tidesdb_level_update_boundaries + * update the boundaries of a level + * @param level level to update boundaries for + * @param largest_level largest level + * @return 0 on success, non-zero on failure + */ +static int tidesdb_level_update_boundaries(tidesdb_level_t *level, tidesdb_level_t *largest_level) +{ + uint8_t **file_boundaries = atomic_load_explicit(&level->file_boundaries, memory_order_acquire); + int num_boundaries = atomic_load_explicit(&level->num_boundaries, memory_order_acquire); + size_t *boundary_sizes = atomic_load_explicit(&level->boundary_sizes, memory_order_acquire); + + if (file_boundaries) + { + for (int i = 0; i < num_boundaries; i++) + { + if (file_boundaries[i] == NULL) continue; + free(file_boundaries[i]); + } + + free(file_boundaries); /* already inside if (file_boundaries) block */ + } + + if (boundary_sizes) + { + free(boundary_sizes); + } + + int num_ssts = 
atomic_load_explicit(&largest_level->num_sstables, memory_order_relaxed); + tidesdb_sstable_t **sstables = + atomic_load_explicit(&largest_level->sstables, memory_order_relaxed); + + if (num_ssts > 0) + { + file_boundaries = malloc(num_ssts * sizeof(uint8_t *)); + boundary_sizes = malloc(num_ssts * sizeof(size_t)); + + if (!file_boundaries || !boundary_sizes) + { + free(file_boundaries); + free(boundary_sizes); + /* we must NULL out level pointers since we already freed the old ones above + * leaving stale pointers could cause use-after-free in dividing_merge */ + atomic_store_explicit(&level->file_boundaries, NULL, memory_order_relaxed); + atomic_store_explicit(&level->boundary_sizes, NULL, memory_order_relaxed); + atomic_store_explicit(&level->num_boundaries, 0, memory_order_relaxed); + return TDB_ERR_MEMORY; + } + + for (int i = 0; i < num_ssts; i++) + { + tidesdb_sstable_t *sst = sstables[i]; + + boundary_sizes[i] = sst->min_key_size; + + file_boundaries[i] = malloc(sst->min_key_size); + if (!file_boundaries[i]) + { + /* we cleanup partially allocated boundaries */ + for (int j = 0; j < i; j++) + { + free(file_boundaries[j]); + } + free(file_boundaries); + free(boundary_sizes); + atomic_store_explicit(&level->file_boundaries, NULL, memory_order_relaxed); + atomic_store_explicit(&level->boundary_sizes, NULL, memory_order_relaxed); + atomic_store_explicit(&level->num_boundaries, 0, memory_order_relaxed); + return TDB_ERR_MEMORY; + } + if (sst->min_key && sst->min_key_size > 0) + { + memcpy(file_boundaries[i], sst->min_key, sst->min_key_size); + } + } + } + else + { + file_boundaries = NULL; + boundary_sizes = NULL; + } + atomic_store_explicit(&level->file_boundaries, file_boundaries, memory_order_relaxed); + atomic_store_explicit(&level->boundary_sizes, boundary_sizes, memory_order_relaxed); + atomic_store_explicit(&level->num_boundaries, num_ssts, memory_order_relaxed); + return TDB_SUCCESS; +} + +/** + * heap_swap + * swap two elements in a heap + * @param a 
first element + * @param b second element + */ +static void heap_swap(tidesdb_merge_source_t **a, tidesdb_merge_source_t **b) +{ + tidesdb_merge_source_t *temp = *a; + *a = *b; + *b = temp; +} + +/** + * heap_compare + * compare two elements in a heap + * @param heap heap to compare + * @param i index of first element + * @param j index of second element + * @return comparison result + */ +static int heap_compare(const tidesdb_merge_heap_t *heap, const int i, const int j) +{ + tidesdb_kv_pair_t *a = heap->sources[i]->current_kv; + tidesdb_kv_pair_t *b = heap->sources[j]->current_kv; + + if (!a && !b) return 0; + if (!a) return 1; /* a is greater, push to end */ + if (!b) return -1; /* b is greater, push to end */ + + const int cmp = heap->comparator(a->key, a->entry.key_size, b->key, b->entry.key_size, + heap->comparator_ctx); + + if (cmp == 0) + { + /* same key, we prefer higher sequence number (newer) */ + if (a->entry.seq > b->entry.seq) return -1; + if (a->entry.seq < b->entry.seq) return 1; + } + + return cmp; +} + +/** + * heap_compare_max + * compare two elements in a max-heap + * for equal keys, prefer higher sequence number (newer) on top + * this ensures tombstones (seq=UINT64_MAX) are popped before committed values + * @param heap heap containing elements + * @param i index of first element + * @param j index of second element + * @return comparison result + */ +static int heap_compare_max(const tidesdb_merge_heap_t *heap, const int i, const int j) +{ + tidesdb_kv_pair_t *a = heap->sources[i]->current_kv; + tidesdb_kv_pair_t *b = heap->sources[j]->current_kv; + + if (!a && !b) return 0; + if (!a) return -1; /* a is smaller, push to end in max-heap */ + if (!b) return 1; /* b is smaller, push to end in max-heap */ + + const int cmp = heap->comparator(a->key, a->entry.key_size, b->key, b->entry.key_size, + heap->comparator_ctx); + + if (cmp == 0) + { + /* same key, we prefer higher sequence number (newer) on top of max-heap */ + if (a->entry.seq > 
b->entry.seq) return 1; + if (a->entry.seq < b->entry.seq) return -1; + } + + return cmp; +} + +/** + * heap_sift_down + * sift down an element in a heap + * @param heap heap to sift down + * @param idx index of element to sift down + */ +static void heap_sift_down(const tidesdb_merge_heap_t *heap, int idx) +{ + while (idx * 2 + 1 < heap->num_sources) + { + const int left = idx * 2 + 1; + const int right = idx * 2 + 2; + int smallest = idx; + + if (left < heap->num_sources && heap_compare(heap, left, smallest) < 0) + { + smallest = left; + } + if (right < heap->num_sources && heap_compare(heap, right, smallest) < 0) + { + smallest = right; + } + + if (smallest == idx) break; + + heap_swap(&heap->sources[idx], &heap->sources[smallest]); + idx = smallest; + } +} + +/** + * heap_sift_up + * sift up an element in a heap + * @param heap heap to sift up + * @param idx index of element to sift up + */ +static void heap_sift_up(const tidesdb_merge_heap_t *heap, int idx) +{ + while (idx > 0) + { + const int parent = (idx - 1) / 2; + if (heap_compare(heap, idx, parent) >= 0) break; + + heap_swap(&heap->sources[idx], &heap->sources[parent]); + idx = parent; + } +} + +/** + * heap_sift_down_max + * sift down an element in a max-heap (largest on top) + * @param heap heap to sift down + * @param idx index of element to sift down + */ +static void heap_sift_down_max(const tidesdb_merge_heap_t *heap, int idx) +{ + while (idx * 2 + 1 < heap->num_sources) + { + const int left = idx * 2 + 1; + const int right = idx * 2 + 2; + int largest = idx; + + /* for max-heap, we want largest element on top */ + if (left < heap->num_sources && heap_compare_max(heap, left, largest) > 0) + { + largest = left; + } + if (right < heap->num_sources && heap_compare_max(heap, right, largest) > 0) + { + largest = right; + } + + if (largest == idx) break; + + heap_swap(&heap->sources[idx], &heap->sources[largest]); + idx = largest; + } +} + +/** + * tidesdb_merge_heap_pop_max + * pop the largest element 
from a max-heap + * @param heap heap to pop from + * @return pointer to the largest kv pair + */ +static tidesdb_kv_pair_t *tidesdb_merge_heap_pop_max(tidesdb_merge_heap_t *heap) +{ + if (heap->num_sources == 0) return NULL; + + tidesdb_merge_source_t *top = heap->sources[0]; + if (!top->current_kv) + { + /* top source exhausted, remove it */ + if (!top->is_cached) + { + tidesdb_merge_source_free(top); + } + heap->sources[0] = heap->sources[heap->num_sources - 1]; + heap->num_sources--; + if (heap->num_sources > 1) heap_sift_down_max(heap, 0); + return NULL; + } + + /* we transfer ownership instead of cloning (same as pop). + * for borrowed (inline) kv pairs, materialize an owned copy since the source + * struct may be freed if retreat fails. */ + tidesdb_kv_pair_t *result = top->current_kv; + if (result && (result->entry.flags & TDB_KV_FLAG_BORROWED)) + { + const uint32_t ks = result->entry.key_size; + const uint32_t vs = (result->value) ? result->entry.value_size : 0; + const size_t needed = sizeof(tidesdb_kv_pair_t) + ks + vs; + + /* we use pre-allocated pop buffer when available to avoid malloc */ + if (heap->pop_buf[0]) + { + const int slot = heap->pop_buf_slot; + if (heap->pop_buf_cap[slot] < needed) + { + const size_t new_cap = (needed > TDB_MERGE_POP_BUF_INITIAL_CAP) + ? 
needed + : TDB_MERGE_POP_BUF_INITIAL_CAP; + uint8_t *nb = realloc(heap->pop_buf[slot], new_cap); + if (nb) + { + heap->pop_buf[slot] = nb; + heap->pop_buf_cap[slot] = new_cap; + } + } + + if (heap->pop_buf_cap[slot] >= needed) + { + uint8_t *buf = heap->pop_buf[slot]; + tidesdb_kv_pair_t *bkv = (tidesdb_kv_pair_t *)buf; + + bkv->entry = result->entry; + bkv->entry.flags = + (result->entry.flags & TDB_KV_TOMBSTONE_FLAG_MASK) | TDB_KV_FLAG_POP_BUF; + bkv->key = buf + sizeof(tidesdb_kv_pair_t); + memcpy(bkv->key, result->key, ks); + if (vs > 0) + { + bkv->value = bkv->key + ks; + memcpy(bkv->value, result->value, vs); + } + else + { + bkv->value = NULL; + } + result = bkv; + } + else + { + result = tidesdb_kv_pair_create(result->key, result->entry.key_size, result->value, + result->entry.value_size, result->entry.ttl, + result->entry.seq, + result->entry.flags & TDB_KV_TOMBSTONE_FLAG_MASK); + } + } + else + { + result = tidesdb_kv_pair_create(result->key, result->entry.key_size, result->value, + result->entry.value_size, result->entry.ttl, + result->entry.seq, + result->entry.flags & TDB_KV_TOMBSTONE_FLAG_MASK); + } + } + top->current_kv = NULL; + + /* the source to get its previous entry */ + if (tidesdb_merge_source_retreat(top) != TDB_SUCCESS) + { + /* source exhausted, we remove it */ + if (!top->is_cached) + { + tidesdb_merge_source_free(top); + } + heap->sources[0] = heap->sources[heap->num_sources - 1]; + heap->num_sources--; + } + + /* restore max-heap property */ + if (heap->num_sources > 1) heap_sift_down_max(heap, 0); + + return result; +} + +/** + * tidesdb_merge_heap_create + * create a new merge heap + * @param comparator comparator function + * @param comparator_ctx comparator context + * @return pointer to the new merge heap + */ +static tidesdb_merge_heap_t *tidesdb_merge_heap_create(const skip_list_comparator_fn comparator, + void *comparator_ctx) +{ + tidesdb_merge_heap_t *heap = calloc(1, sizeof(tidesdb_merge_heap_t)); + if (!heap) return NULL; + + 
heap->capacity = TDB_INITIAL_MERGE_HEAP_CAPACITY; + heap->sources = malloc(heap->capacity * sizeof(tidesdb_merge_source_t *)); + if (!heap->sources) + { + free(heap); + return NULL; + } + + heap->comparator = comparator; + heap->comparator_ctx = comparator_ctx; + + return heap; +} + +/** + * tidesdb_merge_heap_free + * free a merge heap + * @param heap merge heap to free + */ +static void tidesdb_merge_heap_free(tidesdb_merge_heap_t *heap) +{ + if (!heap) return; + + for (int i = 0; i < heap->num_sources; i++) + { + /* we skip freeing cached sources -- they're owned by the iterator */ + if (!heap->sources[i]->is_cached) + { + tidesdb_merge_source_free(heap->sources[i]); + } + } + + free(heap->sources); + free(heap->pop_buf[0]); + free(heap->pop_buf[1]); + free(heap); +} + +/** + * tidesdb_merge_heap_add_source + * add a source to a merge heap + * @param heap merge heap to add source to + * @param source source to add + * @return 0 on success, non-zero on failure + */ +static int tidesdb_merge_heap_add_source(tidesdb_merge_heap_t *heap, tidesdb_merge_source_t *source) +{ + if (heap->num_sources >= heap->capacity) + { + const int new_capacity = heap->capacity * 2; + tidesdb_merge_source_t **new_sources = + realloc(heap->sources, new_capacity * sizeof(tidesdb_merge_source_t *)); + if (!new_sources) return TDB_ERR_MEMORY; + heap->sources = new_sources; + heap->capacity = new_capacity; + } + + heap->sources[heap->num_sources] = source; + heap->num_sources++; + + heap_sift_up(heap, heap->num_sources - 1); + + return TDB_SUCCESS; +} + +/** + * tidesdb_merge_heap_pop + * pop the smallest element from a merge heap + * @param heap merge heap to pop from + * @param corrupted_sst output parameter for corrupted sst (NULL if none) + * @return smallest element + */ +static tidesdb_kv_pair_t *tidesdb_merge_heap_pop(tidesdb_merge_heap_t *heap, + tidesdb_sstable_t **corrupted_sst) +{ + if (corrupted_sst) *corrupted_sst = NULL; + if (heap->num_sources == 0) return NULL; + + 
tidesdb_merge_source_t *top = heap->sources[0]; + if (!top->current_kv) return NULL; + + /* we transfer ownership of current_kv instead of cloning. + ** advance() starts with kv_pair_free(current_kv) which is a no-op on NULL. + *** eliminates 1 malloc + 1 free + 2 memcpy per pop. + **** for borrowed (inline) kv pairs, we must materialize an owned copy + ***** since the source struct (which contains inline_kv) may be freed below. */ + tidesdb_kv_pair_t *result = top->current_kv; + if (result && (result->entry.flags & TDB_KV_FLAG_BORROWED)) + { + const uint32_t ks = result->entry.key_size; + const uint32_t vs = (result->value) ? result->entry.value_size : 0; + const size_t needed = sizeof(tidesdb_kv_pair_t) + ks + vs; + + /* we use pre-allocated pop buffer when available to avoid malloc. + * the iterator enables this via heap->pop_buf; compaction leaves it NULL. */ + if (heap->pop_buf[0]) + { + const int slot = heap->pop_buf_slot; + if (heap->pop_buf_cap[slot] < needed) + { + const size_t new_cap = (needed > TDB_MERGE_POP_BUF_INITIAL_CAP) + ? 
needed + : TDB_MERGE_POP_BUF_INITIAL_CAP; + uint8_t *nb = realloc(heap->pop_buf[slot], new_cap); + if (nb) + { + heap->pop_buf[slot] = nb; + heap->pop_buf_cap[slot] = new_cap; + } + } + + if (heap->pop_buf_cap[slot] >= needed) + { + uint8_t *buf = heap->pop_buf[slot]; + tidesdb_kv_pair_t *bkv = (tidesdb_kv_pair_t *)buf; + + bkv->entry = result->entry; + bkv->entry.flags = + (result->entry.flags & TDB_KV_TOMBSTONE_FLAG_MASK) | TDB_KV_FLAG_POP_BUF; + bkv->key = buf + sizeof(tidesdb_kv_pair_t); + memcpy(bkv->key, result->key, ks); + if (vs > 0) + { + bkv->value = bkv->key + ks; + memcpy(bkv->value, result->value, vs); + } + else + { + bkv->value = NULL; + } + result = bkv; + } + else + { + /* realloc failed, we fall back to malloc */ + result = tidesdb_kv_pair_create(result->key, result->entry.key_size, result->value, + result->entry.value_size, result->entry.ttl, + result->entry.seq, + result->entry.flags & TDB_KV_TOMBSTONE_FLAG_MASK); + } + } + else + { + result = tidesdb_kv_pair_create(result->key, result->entry.key_size, result->value, + result->entry.value_size, result->entry.ttl, + result->entry.seq, + result->entry.flags & TDB_KV_TOMBSTONE_FLAG_MASK); + } + } + top->current_kv = NULL; + + const int advance_result = tidesdb_merge_source_advance(top); + if (advance_result != 0) + { + /* the source is exhausted or corrupted */ + if (advance_result == TDB_ERR_CORRUPTION && top->type == MERGE_SOURCE_SSTABLE && + corrupted_sst) + { + /* return corrupted sst for deletion */ + *corrupted_sst = top->source.sstable.sst; + tidesdb_sstable_ref(*corrupted_sst); + } + + /* we remove from heap */ + heap->sources[0] = heap->sources[heap->num_sources - 1]; + heap->num_sources--; + + /* we only free if not cached for reuse */ + if (!top->is_cached) + { + tidesdb_merge_source_free(top); + } + } + + if (heap->num_sources > 1) + { + heap_sift_down(heap, 0); + } + + return result; +} + +/** + * tidesdb_merge_heap_pop_discard + * advance the top source without materializing a popped 
kv pair. + * + * equivalent to tidesdb_merge_heap_pop followed by tidesdb_kv_pair_free, but + * avoids the pop_buf memcpy for BORROWED sources. used by the tombstone-skip + * loop in tidesdb_iter_find_visible_entry where the popped entry is discarded + * immediately. + * + * returns 0 on success, -1 if the heap was empty. + */ +static int tidesdb_merge_heap_pop_discard(tidesdb_merge_heap_t *heap) +{ + if (heap->num_sources == 0) return -1; + tidesdb_merge_source_t *top = heap->sources[0]; + if (!top->current_kv) return -1; + + /* for ARENA/POP_BUF kvs we must free before the advance overwrites the + * pointer; for BORROWED (inline_kv) the free is a no-op, so we skip both + * the materialization and the free. */ + if (!(top->current_kv->entry.flags & (TDB_KV_FLAG_BORROWED | TDB_KV_FLAG_POP_BUF))) + { + tidesdb_kv_pair_free(top->current_kv); + } + top->current_kv = NULL; + + const int advance_result = tidesdb_merge_source_advance(top); + if (advance_result != 0) + { + heap->sources[0] = heap->sources[heap->num_sources - 1]; + heap->num_sources--; + if (!top->is_cached) tidesdb_merge_source_free(top); + } + + if (heap->num_sources > 1) heap_sift_down(heap, 0); + return 0; +} + +/** + * tidesdb_iter_skip_tombstone_versions + * skip all heap entries whose key equals the just-popped tombstone's key. + * + * used by every tombstone-skip loop in the iterator (next, prev, seek_to_first, + * seek_to_last, find_visible_entry). copies the tombstone key into a stable + * buffer first, because kv->key may point into pop_buf which is reused by + * subsequent heap_pop calls for BORROWED sources. forward direction uses the + * pop_discard variant to avoid the pop_buf memcpy on every skip. + * + * returns TDB_SUCCESS, or TDB_ERR_MEMORY if a very large tombstone key cannot + * be copied to the fallback buffer. 
+ */ +static int tidesdb_iter_skip_tombstone_versions(tidesdb_iter_t *iter, const tidesdb_kv_pair_t *kv, + const int direction) +{ + uint8_t tombstone_key_stack[TDB_PREFIXED_KEY_STACK_MAX]; + uint8_t *tombstone_key; + uint8_t *tombstone_key_heap = NULL; /* set only when we snapshot via malloc */ + const size_t tombstone_key_size = kv->entry.key_size; + + if (direction > 0) + { + /* forward skip pops via pop_discard, which never rewrites the pop_buf slot backing + * kv, so kv->key stays valid for the whole loop -- compare against it directly with + * no copy, and thus no allocation that could fail on a >256-byte tombstone key. + * (the previous unconditional copy made forward iteration able to OOM here and then + * silently surface a stale superseded version.) */ + tombstone_key = (uint8_t *)kv->key; + } + else if (tombstone_key_size <= sizeof(tombstone_key_stack)) + { + memcpy(tombstone_key_stack, kv->key, tombstone_key_size); + tombstone_key = tombstone_key_stack; + } + else + { + /* backward skip pops via pop_max, which rewrites the pop_buf slot backing kv, so the + * key must be snapshotted first. this malloc can still fail for a >256-byte tombstone + * key under memory pressure (rare); callers treat a non-success return as "stop". 
*/ + tombstone_key_heap = malloc(tombstone_key_size); + if (!tombstone_key_heap) return TDB_ERR_MEMORY; + memcpy(tombstone_key_heap, kv->key, tombstone_key_size); + tombstone_key = tombstone_key_heap; + } + + while (!tidesdb_merge_heap_empty(iter->heap)) + { + tidesdb_kv_pair_t *peek = iter->heap->sources[0]->current_kv; + if (!peek) break; + + const int cmp = iter->heap->comparator(peek->key, peek->entry.key_size, tombstone_key, + tombstone_key_size, iter->heap->comparator_ctx); + if (cmp != 0) break; + + if (direction > 0) + { + tidesdb_merge_heap_pop_discard(iter->heap); + } + else + { + tidesdb_kv_pair_t *dup = tidesdb_merge_heap_pop_max(iter->heap); + tidesdb_kv_pair_free(dup); + } + } + + free(tombstone_key_heap); /* NULL-safe; only the backward large-key path allocates */ + return TDB_SUCCESS; +} + +/** + * tidesdb_merge_heap_empty + * check if a merge heap is empty + * @param heap merge heap to check + * @return 1 if empty, 0 otherwise + */ +static int tidesdb_merge_heap_empty(const tidesdb_merge_heap_t *heap) +{ + return heap->num_sources == 0; +} + +/** + * tidesdb_memtable_source_set_inline_borrowed + * populate source->inline_kv with borrowed pointers into the skip-list node + * and set source->current_kv = &inline_kv with TDB_KV_FLAG_BORROWED. + * + * this avoids the per-advance malloc+memcpy+free of tidesdb_kv_pair_create + * on the memtable read path. heap_pop materializes a stable owned copy into + * pop_buf when the caller keeps the kv; tombstone-skip discards in + * tidesdb_iter_find_visible_entry free (no-op on borrowed) without a copy. + * + * key/value pointers are stable while the cursor holds this position, which + * is the same invariant the sstable inline_kv path already relies on. the + * iterator pins the memtable (active via try_ref, immutable via refcount) + * for its lifetime, so the node memory is not reclaimed under us. 
+ */ +static inline void tidesdb_memtable_source_set_inline_borrowed(tidesdb_merge_source_t *source, + const uint8_t *key, size_t key_size, + const uint8_t *value, + size_t value_size, int64_t ttl, + uint64_t seq, uint8_t sl_flags) +{ + tidesdb_kv_pair_t *ikv = &source->inline_kv; + ikv->entry.flags = tidesdb_sl_flags_to_kv_flags(sl_flags) | TDB_KV_FLAG_BORROWED; + ikv->entry.key_size = (uint32_t)key_size; /* NOTE(review): size_t narrowed to uint32_t; assumes keys fit -- confirm */ + ikv->entry.value_size = (uint32_t)value_size; /* NOTE(review): same narrowing for the value size */ + ikv->entry.ttl = ttl; + ikv->entry.seq = seq; + ikv->entry.vlog_offset = 0; /* the value pointer is supplied directly, so no vlog offset is recorded */ + ikv->key = (uint8_t *)key; + ikv->value = (value_size > 0) ? (uint8_t *)value : NULL; /* zero-length values are normalized to NULL */ + source->current_kv = ikv; +} + +/** + * tidesdb_merge_source_from_memtable + * create a merge source from a memtable and try to position it on the first entry + * @param memtable memtable to create merge source from + * @param config column family config + * @param imm immutable memtable wrapper (NULL for active memtable); referenced while the source exists + * @return merge source (current_kv stays NULL if the cursor cannot be positioned or read), NULL on failure + */ +static tidesdb_merge_source_t *tidesdb_merge_source_from_memtable( + skip_list_t *memtable, tidesdb_column_family_config_t *config, + tidesdb_immutable_memtable_t *imm) +{ + tidesdb_merge_source_t *source = calloc(1, sizeof(tidesdb_merge_source_t)); + if (!source) return NULL; + + source->type = MERGE_SOURCE_MEMTABLE; + source->config = config; + source->source.memtable.imm = imm; + source->is_cached = 0; /* memtable sources are not cached */ + + if (imm) /* take a reference on the immutable memtable; dropped again on the failure path below */ + { + tidesdb_immutable_memtable_ref(imm); + } + + if (skip_list_cursor_init(&source->source.memtable.cursor, memtable) != 0) + { + if (imm) tidesdb_immutable_memtable_unref(imm); + free(source); + return NULL; + } + + const int goto_result = skip_list_cursor_goto_first(source->source.memtable.cursor); + + if (goto_result == 0) + { + uint8_t *key, *value; + size_t key_size, value_size; + int64_t ttl; + uint8_t deleted; + uint64_t seq; + + if (skip_list_cursor_get_with_seq(source->source.memtable.cursor, &key, &key_size, &value, + &value_size, &ttl, &deleted, &seq) == 0) + { + 
tidesdb_memtable_source_set_inline_borrowed(source, key, key_size, value, value_size, + ttl, seq, deleted); + } + } + + return source; +} + +/** + * tidesdb_unified_source_advance_to_cf + * starting at the cursor's current entry, step (next if forward, else prev) to a key carrying this CF's prefix and publish it prefix-stripped. + * returns 1 if a matching entry was found, 0 if the cursor read or step fails or the keys have passed this CF's prefix. + */ +static int tidesdb_unified_source_advance_to_cf(tidesdb_merge_source_t *source, const int forward) +{ + skip_list_cursor_t *cursor = source->source.unified.cursor; + const uint8_t *prefix = source->source.unified.prefix; + + while (1) + { + uint8_t *key, *value; + size_t key_size, value_size; + int64_t ttl; + uint8_t deleted; + uint64_t seq; + + if (skip_list_cursor_get_with_seq(cursor, &key, &key_size, &value, &value_size, &ttl, + &deleted, &seq) != 0) + { + return 0; + } + + /* we check if key starts with our CF prefix */ + if (key_size >= TDB_UNIFIED_CF_PREFIX_SIZE && + memcmp(key, prefix, TDB_UNIFIED_CF_PREFIX_SIZE) == 0) + { + /* we strip the prefix by borrowing a pointer past it -- no copy */ + const uint8_t *real_key = key + TDB_UNIFIED_CF_PREFIX_SIZE; + const size_t real_key_size = key_size - TDB_UNIFIED_CF_PREFIX_SIZE; + tidesdb_memtable_source_set_inline_borrowed(source, real_key, real_key_size, value, + value_size, ttl, seq, deleted); + return 1; + } + + /* if key prefix > our prefix and we are going forward, no more entries for this CF */ + if (forward && key_size >= TDB_UNIFIED_CF_PREFIX_SIZE && + memcmp(key, prefix, TDB_UNIFIED_CF_PREFIX_SIZE) > 0) + { + return 0; + } + + /* if key prefix < our prefix and we are going backward, no more entries for this CF */ + if (!forward && key_size >= TDB_UNIFIED_CF_PREFIX_SIZE && + memcmp(key, prefix, TDB_UNIFIED_CF_PREFIX_SIZE) < 0) + { + return 0; + } + + /* we advance cursor past this non-matching entry (also reached for keys shorter than the prefix) */ + int rc = forward ? 
skip_list_cursor_next(cursor) : skip_list_cursor_prev(cursor); + if (rc != 0) return 0; + } +} + +/** + * tidesdb_merge_source_from_unified_memtable + * create a merge source from a unified memtable filtered to a specific CF. + * keys in the unified skip list are prefixed with the big-endian CF index; the source seeks to that range and strips the prefix. + * returns the source (current_kv stays NULL when no entry for this CF is found), or NULL on allocation or cursor-init failure. + */ +static tidesdb_merge_source_t *tidesdb_merge_source_from_unified_memtable( + skip_list_t *memtable, tidesdb_column_family_config_t *config, + tidesdb_immutable_memtable_t *imm, uint32_t cf_index) +{ + tidesdb_merge_source_t *source = calloc(1, sizeof(tidesdb_merge_source_t)); + if (!source) return NULL; + + source->type = MERGE_SOURCE_UNIFIED_MEMTABLE; + source->config = config; + source->source.unified.imm = imm; + source->source.unified.cf_index = cf_index; + tdb_encode_be32(cf_index, source->source.unified.prefix); + source->is_cached = 0; + + if (imm) /* take a reference on the immutable memtable; dropped again on the failure path below */ + { + tidesdb_immutable_memtable_ref(imm); + } + + if (skip_list_cursor_init(&source->source.unified.cursor, memtable) != 0) + { + if (imm) tidesdb_immutable_memtable_unref(imm); + free(source); + return NULL; + } + + /* we seek to the start of this CF's key range. + * seek_ge lands on the first key >= the CF prefix and is robust to a concurrent put + * splicing a sub-target node into forward[0]; advance_to_cf then filters to our CF. 
*/ + if (skip_list_cursor_seek_ge(source->source.unified.cursor, source->source.unified.prefix, + TDB_UNIFIED_CF_PREFIX_SIZE) == 0) + { + tidesdb_unified_source_advance_to_cf(source, 1); /* return value unused; when nothing matches, current_kv stays NULL */ + } + + return source; +} + +/** + * tidesdb_txn_ops_sort_ctx_t + * context handed to the qsort comparator (through thread-local storage) when sorting transaction ops indices + * @param ops pointer to the transaction ops array + * @param comparator key comparator function + * @param comparator_ctx comparator context + */ +typedef struct +{ + tidesdb_txn_op_t *ops; + skip_list_comparator_fn comparator; + void *comparator_ctx; +} tidesdb_txn_ops_sort_ctx_t; + +/* thread-local context for qsort comparator (cross-platform alternative to qsort_r); NULL outside a sort */ +static _Thread_local const tidesdb_txn_ops_sort_ctx_t *tidesdb_txn_ops_sort_ctx_tls = NULL; + +/** + * tidesdb_txn_ops_index_cmp + * qsort comparator that orders two indices into the txn ops array by key + * reads its context from tidesdb_txn_ops_sort_ctx_tls, which must be set before qsort runs (dereferenced unchecked) + * @param a pointer to first index + * @param b pointer to second index + * @return <0 if a < b, 0 if equal, >0 if a > b + */ +static int tidesdb_txn_ops_index_cmp(const void *a, const void *b) +{ + const int ia = *(const int *)a; + const int ib = *(const int *)b; + const tidesdb_txn_ops_sort_ctx_t *c = tidesdb_txn_ops_sort_ctx_tls; + + return c->comparator(c->ops[ia].key, c->ops[ia].key_size, c->ops[ib].key, c->ops[ib].key_size, + c->comparator_ctx); +} + +/** + * tidesdb_merge_source_from_txn_ops + * create a merge source from transaction pending writes for read-your-own-writes + * + * filters txn->ops for the target column family, deduplicates (last write per + * key wins by scanning in reverse), sorts by key using the cf comparator, and + * positions at the first entry. + * + * entries use seq=UINT64_MAX so they always win over committed data with the + * same key in the merge heap. 
+ * + * @param txn transaction handle + * @param cf column family to filter for + * @param config column family configuration + * @return merge source or NULL if no ops for this cf (or on error) + */ +static tidesdb_merge_source_t *tidesdb_merge_source_from_txn_ops( + tidesdb_txn_t *txn, tidesdb_column_family_t *cf, tidesdb_column_family_config_t *config) +{ + if (!txn || !cf || txn->num_ops == 0) return NULL; + + /* we resolve the comparator for this column family, falling back to memcmp ordering when none is resolved */ + skip_list_comparator_fn comparator_fn = NULL; + void *comparator_ctx = NULL; + tidesdb_resolve_comparator(cf->db, &cf->config, &comparator_fn, &comparator_ctx); + if (!comparator_fn) comparator_fn = skip_list_comparator_memcmp; + + /* we collect indices of ops belonging to this CF + * we scan in reverse so the first occurrence of each key is the newest write */ + int *candidate_indices = malloc(txn->num_ops * sizeof(int)); + if (!candidate_indices) return NULL; + + int candidate_count = 0; + + /* we deduplicate with a linear scan over the candidates kept so far (quadratic in this CF's op count) + * for each key we only keep the latest (highest index) op */ + for (int i = txn->num_ops - 1; i >= 0; i--) + { + const tidesdb_txn_op_t *op = &txn->ops[i]; + + /* quick CF check (pointer comparison) */ + if (op->cf != cf) continue; + + /* we check if we already have a newer op for this key */ + int already_seen = 0; + for (int j = 0; j < candidate_count; j++) + { + const tidesdb_txn_op_t *existing = &txn->ops[candidate_indices[j]]; + if (existing->key_size == op->key_size && + comparator_fn(existing->key, existing->key_size, op->key, op->key_size, + comparator_ctx) == 0) + { + already_seen = 1; + break; + } + } + + if (!already_seen) + { + candidate_indices[candidate_count++] = i; + } + } + + if (candidate_count == 0) + { + free(candidate_indices); + return NULL; + } + + /* we shrink to actual size */ + int *sorted_indices = realloc(candidate_indices, candidate_count * sizeof(int)); + if (!sorted_indices) + sorted_indices = candidate_indices; /* if realloc fails the original block is 
still valid, so keep using it */ + + /* we sort by key using the column family comparator; qsort takes no context argument, so it is passed via TLS and cleared after */ + tidesdb_txn_ops_sort_ctx_t sort_ctx = { + .ops = txn->ops, .comparator = comparator_fn, .comparator_ctx = comparator_ctx}; + + tidesdb_txn_ops_sort_ctx_tls = &sort_ctx; + qsort(sorted_indices, candidate_count, sizeof(int), tidesdb_txn_ops_index_cmp); + tidesdb_txn_ops_sort_ctx_tls = NULL; + + /* we create the merge source; from here on it owns sorted_indices */ + tidesdb_merge_source_t *source = calloc(1, sizeof(tidesdb_merge_source_t)); + if (!source) + { + free(sorted_indices); + return NULL; + } + + source->type = MERGE_SOURCE_TXN_OPS; + source->config = config; + source->is_cached = 0; + source->source.txn_ops.txn = txn; + source->source.txn_ops.cf = cf; + source->source.txn_ops.sorted_indices = sorted_indices; + source->source.txn_ops.count = candidate_count; + source->source.txn_ops.pos = 0; + + /* we set current_kv from the first sorted entry. NOTE(review): the tidesdb_kv_pair_create result is not checked, so a NULL current_kv is returned as-is */ + const tidesdb_txn_op_t *first_op = &txn->ops[sorted_indices[0]]; + source->current_kv = tidesdb_kv_pair_create(first_op->key, first_op->key_size, first_op->value, + first_op->value_size, first_op->ttl, UINT64_MAX, + tidesdb_txn_op_kv_flags(first_op)); + + return source; +} + +/** + * tidesdb_merge_source_from_sstable_klog + * create a merge source from a klog-based sstable, positioned on the first entry of its first data block + * @param db database instance + * @param sst sstable + * @return merge source, or NULL on error or when the sstable yields no first entry (no data blocks / empty first block) + */ +static tidesdb_merge_source_t *tidesdb_merge_source_from_sstable_klog(tidesdb_t *db, + tidesdb_sstable_t *sst) +{ + tidesdb_merge_source_t *source = malloc(sizeof(tidesdb_merge_source_t)); + if (!source) return NULL; + + source->type = MERGE_SOURCE_SSTABLE; + source->source.sstable.sst = sst; + source->source.sstable.db = db; /* store db for later vlog reads */ + source->is_cached = 0; /* will be set to 1 if cached by iterator */ + + tidesdb_sstable_ref(sst); + + /* scan sources open the klog only; the vlog is opened on demand by + * tidesdb_vlog_read_value when a value misses the inline klog payload */ 
+ if (tidesdb_sstable_ensure_klog_open(db, sst) != 0) + { + tidesdb_sstable_unref(db, sst); + free(source); + return NULL; + } + + tidesdb_block_managers_t bms; + if (tidesdb_sstable_get_block_managers(db, sst, &bms) != TDB_SUCCESS) + { + tidesdb_sstable_unref(db, sst); + free(source); + return NULL; + } + + if (block_manager_cursor_init(&source->source.sstable.klog_cursor, bms.klog_bm) != 0) + { + tidesdb_sstable_unref(db, sst); + free(source); + return NULL; + } + + /* the klog source reads values via tidesdb_vlog_read_value (sst->vlog_bm), never via a + * source-held vlog cursor; leave it NULL so cleanup's cursor_free is a no-op */ + source->source.sstable.vlog_cursor = NULL; + + /* we hint to OS that this is streaming read (data will be accessed only once) + * this helps prevent cache pollution during compaction */ + set_file_noreuse_hint(bms.klog_bm->fd, 0, 0); + + source->source.sstable.current_block_data = NULL; /* no block data yet */ + source->source.sstable.current_rc_block = NULL; /* no ref-counted block yet */ + source->source.sstable.decompressed_data = NULL; /* no decompressed data yet */ + source->source.sstable.cache_pin = NULL; /* no cache pin yet */ + memset(source->source.sstable.block_stash, 0, sizeof(source->source.sstable.block_stash)); + memset(&source->source.sstable.lazy, 0, sizeof(source->source.sstable.lazy)); + source->source.sstable.current_block = NULL; /* no current block yet */ + source->current_kv = NULL; /* no current kv yet */ + source->config = sst->config; + + /* we only read data blocks, not the metadata block at the end */ + if (sst->num_klog_blocks == 0) + { + /* empty sstable, no data blocks to read */ + tidesdb_sstable_unref(db, sst); + block_manager_cursor_free(source->source.sstable.klog_cursor); + block_manager_cursor_free(source->source.sstable.vlog_cursor); + free(source); + return NULL; + } + + if (block_manager_cursor_goto_first(source->source.sstable.klog_cursor) == 0) + { + /* we check cursor is within data 
region (before index/bloom/metadata blocks) */ + if (sst->klog_data_end_offset > 0 && + source->source.sstable.klog_cursor->current_pos >= sst->klog_data_end_offset) + { + /* cursor is at or past data end offset */ + tidesdb_sstable_unref(db, sst); + block_manager_cursor_free(source->source.sstable.klog_cursor); + block_manager_cursor_free(source->source.sstable.vlog_cursor); + free(source); + return NULL; + } + + block_manager_block_t *block = + tidesdb_read_block(db, sst, source->source.sstable.klog_cursor); + if (!block) + { + tidesdb_sstable_unref(db, sst); + block_manager_cursor_free(source->source.sstable.klog_cursor); + block_manager_cursor_free(source->source.sstable.vlog_cursor); + free(source); + return NULL; + } + + const uint8_t *data = block->data; + const size_t data_size = block->size; + + tidesdb_klog_block_t *klog_block = NULL; + if (tidesdb_klog_block_deserialize(data, data_size, &klog_block, 0) != 0) + { + block_manager_block_release(block); + tidesdb_sstable_unref(db, sst); + block_manager_cursor_free(source->source.sstable.klog_cursor); + block_manager_cursor_free(source->source.sstable.vlog_cursor); + free(source); + return NULL; + } + + if (klog_block && klog_block->num_entries > 0) + { + source->source.sstable.current_block = klog_block; + source->source.sstable.current_block_data = block; + source->source.sstable.current_entry_idx = 0; + + const uint8_t *value = klog_block->inline_values[0]; + uint8_t *vlog_value = NULL; + if (klog_block->entries[0].vlog_offset > 0) + { + tidesdb_vlog_read_value(source->source.sstable.db, sst, + klog_block->entries[0].vlog_offset, + klog_block->entries[0].value_size, &vlog_value); + value = vlog_value; /* NOTE(review): the read result is not checked; vlog_value is still NULL if the callee never set it */ + } + + source->current_kv = + tidesdb_kv_pair_create(klog_block->keys[0], klog_block->entries[0].key_size, value, + klog_block->entries[0].value_size, + klog_block->entries[0].ttl, klog_block->entries[0].seq, + klog_block->entries[0].flags & TDB_KV_TOMBSTONE_FLAG_MASK); + free(vlog_value); + + if 
(!source->current_kv) + { + tidesdb_klog_block_free(klog_block); + block_manager_block_release(block); + tidesdb_sstable_unref(db, sst); + block_manager_cursor_free(source->source.sstable.klog_cursor); + block_manager_cursor_free(source->source.sstable.vlog_cursor); + free(source); + return NULL; + } + + return source; + } + + if (klog_block) tidesdb_klog_block_free(klog_block); /* reached when the first block deserialized but holds no entries */ + if (block) block_manager_block_release(block); + tidesdb_sstable_unref(db, sst); + block_manager_cursor_free(source->source.sstable.klog_cursor); + block_manager_cursor_free(source->source.sstable.vlog_cursor); + free(source); + return NULL; + } + + /* cursor_goto_first failed, we clean up and return NULL */ + tidesdb_sstable_unref(db, sst); + block_manager_cursor_free(source->source.sstable.klog_cursor); + block_manager_cursor_free(source->source.sstable.vlog_cursor); + free(source); + return NULL; +} + +/** + * tidesdb_btree_read_vlog_value + * read a value from the vlog via a btree vlog cursor, decompressing it when the config enables compression. + * wraps the cursor_goto, cursor_read and decompression sequence shared by the btree vlog read sites. 
+ * @param vlog_cursor block manager cursor positioned on the vlog file + * @param vlog_offset byte offset of the vlog block + * @param config column family config (for compression algorithm); NULL is treated as uncompressed + * @param value_out receives the (decompressed) value data (caller must free) + * @param value_size_out receives the value size + * @param expected_value_size size the produced value must have; 0 disables the check + * @return 0 on success, -1 on failure (value_out and value_size_out are left untouched on failure) + */ +static int tidesdb_btree_read_vlog_value(block_manager_cursor_t *vlog_cursor, + const uint64_t vlog_offset, + const tidesdb_column_family_config_t *config, + uint8_t **value_out, size_t *value_size_out, + const size_t expected_value_size) +{ + block_manager_cursor_goto(vlog_cursor, vlog_offset); /* NOTE(review): result not checked */ + block_manager_block_t *vlog_block = block_manager_cursor_read(vlog_cursor); + if (!vlog_block) return -1; + + const uint8_t *data = vlog_block->data; + const size_t data_size = vlog_block->size; + + /* we decompress if the column family uses compression */ + if (config && config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t decompressed_size; + uint8_t *decompressed = + decompress_data(data, data_size, &decompressed_size, config->compression_algorithm); + block_manager_block_free(vlog_block); + if (!decompressed) return -1; + + /* verify the produced size matches the klog entry's recorded value_size, mirroring + * tidesdb_vlog_read_value -- a truncated/corrupt vlog block must not silently return + * surviving bytes (expected_value_size == 0 means the caller opts out of the check) */ + if (expected_value_size != 0 && decompressed_size != expected_value_size) + { + free(decompressed); + return -1; + } + + *value_out = decompressed; + *value_size_out = decompressed_size; + return 0; + } + + /* uncompressed, we copy raw block data -- same size verification as the compressed path */ + if (expected_value_size != 0 && data_size != expected_value_size) + { + block_manager_block_free(vlog_block); + return -1; + } + uint8_t *copy = malloc(data_size); + if (!copy) + { + block_manager_block_free(vlog_block); + 
return -1; + } + memcpy(copy, data, data_size); + *value_out = copy; + *value_size_out = data_size; + block_manager_block_free(vlog_block); + return 0; +} + +/** + * tidesdb_merge_source_from_btree + * create a merge source from a btree-based sstable, positioned on its first entry + * @param db database instance + * @param sst sstable with btree index + * @return merge source or NULL on error (a failed vlog read for the first entry yields an empty value, not NULL) + */ +static tidesdb_merge_source_t *tidesdb_merge_source_from_btree(tidesdb_t *db, + tidesdb_sstable_t *sst) +{ + tidesdb_merge_source_t *source = malloc(sizeof(tidesdb_merge_source_t)); + if (!source) return NULL; + + source->type = MERGE_SOURCE_BTREE; + source->source.btree.sst = sst; + source->source.btree.db = db; + source->is_cached = 0; + + tidesdb_sstable_ref(sst); + + if (tidesdb_sstable_ensure_open(db, sst) != 0) + { + tidesdb_sstable_unref(db, sst); + free(source); + return NULL; + } + + tidesdb_block_managers_t bms; + if (tidesdb_sstable_get_block_managers(db, sst, &bms) != TDB_SUCCESS) + { + tidesdb_sstable_unref(db, sst); + free(source); + return NULL; + } + + /* resolve comparator; if none is resolved, BTREE_CMP_MEMCMP is selected below */ + skip_list_comparator_fn comparator_fn = NULL; + void *comparator_ctx = NULL; + tidesdb_resolve_comparator(db, sst->config, &comparator_fn, &comparator_ctx); + + /* we create btree handle (tidesdb_merge_source_free later frees cursor->tree, presumably this handle) */ + btree_t *tree = malloc(sizeof(btree_t)); + if (!tree) + { + tidesdb_sstable_unref(db, sst); + free(source); + return NULL; + } + + memset(tree, 0, sizeof(btree_t)); + tree->bm = bms.klog_bm; + tree->root_offset = sst->btree_root_offset; + tree->first_leaf_offset = sst->btree_first_leaf; + tree->last_leaf_offset = sst->btree_last_leaf; + tree->config.target_node_size = BTREE_DEFAULT_NODE_SIZE; + tree->config.value_threshold = sst->config->klog_value_threshold; + tree->config.comparator = (btree_comparator_fn)comparator_fn; + tree->config.comparator_ctx = comparator_ctx; + tree->config.cmp_type = comparator_fn ? 
BTREE_CMP_CUSTOM : BTREE_CMP_MEMCMP; + tree->config.compression_algo = sst->config->compression_algorithm; + tree->node_cache = db->btree_node_cache; + tree->cache_key_prefix = sst->cache_key_prefix; + + btree_cursor_t *cursor = NULL; + if (btree_cursor_init(&cursor, tree) != 0) + { + free(tree); + tidesdb_sstable_unref(db, sst); + free(source); + return NULL; + } + + if (btree_cursor_goto_first(cursor) != 0) + { + btree_cursor_free(cursor); + free(tree); + tidesdb_sstable_unref(db, sst); + free(source); + return NULL; + } + + source->source.btree.cursor = cursor; + + /* we init vlog cursor */ + if (block_manager_cursor_init(&source->source.btree.vlog_cursor, bms.vlog_bm) != 0) + { + btree_cursor_free(cursor); + free(tree); + tidesdb_sstable_unref(db, sst); + free(source); + return NULL; + } + + source->current_kv = NULL; + source->config = sst->config; + + /* we get first entry */ + uint8_t *key = NULL, *value = NULL; + size_t key_size = 0, value_size = 0; + uint64_t vlog_offset = 0, seq = 0; + int64_t ttl = 0; + uint8_t deleted = 0; + + if (btree_cursor_get(cursor, &key, &key_size, &value, &value_size, &vlog_offset, &seq, &ttl, + &deleted) != 0) + { + block_manager_cursor_free(source->source.btree.vlog_cursor); + btree_cursor_free(cursor); + free(tree); + tidesdb_sstable_unref(db, sst); + free(source); + return NULL; + } + + const uint8_t *actual_value = value; + size_t actual_value_size = value_size; + uint8_t *vlog_value = NULL; + if (vlog_offset > 0) + { + if (tidesdb_btree_read_vlog_value(source->source.btree.vlog_cursor, vlog_offset, + source->config, &vlog_value, &actual_value_size, + value_size) == 0) + { + actual_value = vlog_value; + } + else + { + actual_value = NULL; + actual_value_size = 0; /* vlog read failed: the entry is kept, with an empty value */ + } + } + + source->current_kv = + tidesdb_kv_pair_create(key, key_size, actual_value, actual_value_size, ttl, seq, deleted); + free(vlog_value); + + if (!source->current_kv) + { + block_manager_cursor_free(source->source.btree.vlog_cursor); + 
btree_cursor_free(cursor); + free(tree); + tidesdb_sstable_unref(db, sst); + free(source); + return NULL; + } + + return source; +} + +/** + * tidesdb_merge_source_from_sstable + * create a merge source from an sstable (btree constructor when sst->use_btree is set, klog constructor otherwise) + * @param db database instance + * @param sst sstable + * @return merge source or NULL on error + */ +static tidesdb_merge_source_t *tidesdb_merge_source_from_sstable(tidesdb_t *db, + tidesdb_sstable_t *sst) +{ + /* we use sst->use_btree which is set from metadata, not config */ + if (sst->use_btree) + { + return tidesdb_merge_source_from_btree(db, sst); + } + return tidesdb_merge_source_from_sstable_klog(db, sst); +} + +/** + * tidesdb_merge_source_from_sstable_lazy + * creates a klog SST merge source without reading the first block from disk. + * the source starts with current_kv=NULL; the first seek() call will read blocks on demand. + * only used by the iterator path since compaction needs the initial block read. + * btree sstables (sst->use_btree) are not lazy here: they go through tidesdb_merge_source_from_btree. + * @param db database instance + * @param sst sstable + * @return merge source or NULL on error + */ +static tidesdb_merge_source_t *tidesdb_merge_source_from_sstable_lazy(tidesdb_t *db, + tidesdb_sstable_t *sst) +{ + if (sst->use_btree) + { + return tidesdb_merge_source_from_btree(db, sst); + } + + tidesdb_merge_source_t *source = malloc(sizeof(tidesdb_merge_source_t)); + if (!source) return NULL; + + source->type = MERGE_SOURCE_SSTABLE; + source->source.sstable.sst = sst; + source->source.sstable.db = db; + source->is_cached = 0; + + tidesdb_sstable_ref(sst); + + /* scan sources open the klog only; the vlog is opened on demand by + * tidesdb_vlog_read_value when a value misses the inline klog payload */ + if (tidesdb_sstable_ensure_klog_open(db, sst) != 0) + { + tidesdb_sstable_unref(db, sst); + free(source); + return NULL; + } + + tidesdb_block_managers_t bms; + if (tidesdb_sstable_get_block_managers(db, sst, &bms) != TDB_SUCCESS) + { + tidesdb_sstable_unref(db, sst); + free(source); + return NULL; 
+ } + + if (block_manager_cursor_init(&source->source.sstable.klog_cursor, bms.klog_bm) != 0) + { + tidesdb_sstable_unref(db, sst); + free(source); + return NULL; + } + + /* the klog source reads values via tidesdb_vlog_read_value (sst->vlog_bm), never via a + * source-held vlog cursor; leave it NULL so cleanup's cursor_free is a no-op */ + source->source.sstable.vlog_cursor = NULL; + + source->source.sstable.current_block_data = NULL; + source->source.sstable.current_rc_block = NULL; + source->source.sstable.decompressed_data = NULL; + source->source.sstable.cache_pin = NULL; + memset(source->source.sstable.block_stash, 0, sizeof(source->source.sstable.block_stash)); + memset(&source->source.sstable.lazy, 0, sizeof(source->source.sstable.lazy)); + source->source.sstable.current_block = NULL; + source->current_kv = NULL; /* lazy, no initial block read */ + source->config = sst->config; + + if (sst->num_klog_blocks == 0) + { + tidesdb_sstable_unref(db, sst); + block_manager_cursor_free(source->source.sstable.klog_cursor); + block_manager_cursor_free(source->source.sstable.vlog_cursor); + free(source); + return NULL; + } + + /* we position cursor at first data block but don't read it; a cursor at or past klog_data_end_offset is rejected */ + if (block_manager_cursor_goto_first(source->source.sstable.klog_cursor) != 0 || + (sst->klog_data_end_offset > 0 && + source->source.sstable.klog_cursor->current_pos >= sst->klog_data_end_offset)) + { + tidesdb_sstable_unref(db, sst); + block_manager_cursor_free(source->source.sstable.klog_cursor); + block_manager_cursor_free(source->source.sstable.vlog_cursor); + free(source); + return NULL; + } + + return source; +} + +/** + * tidesdb_merge_source_free + * free a merge source together with its cursors, its memtable/sstable reference and its current_kv + * @param source merge source to free (NULL is a no-op) + */ +static void tidesdb_merge_source_free(tidesdb_merge_source_t *source) +{ + if (!source) return; + + if (source->type == MERGE_SOURCE_MEMTABLE) + { + skip_list_cursor_free(source->source.memtable.cursor); + if (source->source.memtable.imm) + { + 
tidesdb_immutable_memtable_unref(source->source.memtable.imm); + } + } + else if (source->type == MERGE_SOURCE_BTREE) + { + if (source->source.btree.cursor) + { + btree_t *tree = source->source.btree.cursor->tree; /* read the tree pointer before the cursor is freed */ + btree_cursor_free(source->source.btree.cursor); + free(tree); + } + block_manager_cursor_free(source->source.btree.vlog_cursor); + tidesdb_sstable_unref(NULL, source->source.btree.sst); /* NOTE(review): db is NULL here while the constructors pass db -- confirm unref accepts it */ + } + else if (source->type == MERGE_SOURCE_TXN_OPS) + { + /* we only free the sorted index array + * txn and cf are borrowed pointers, not owned */ + free(source->source.txn_ops.sorted_indices); + } + else if (source->type == MERGE_SOURCE_UNIFIED_MEMTABLE) + { + skip_list_cursor_free(source->source.unified.cursor); + if (source->source.unified.imm) + { + tidesdb_immutable_memtable_unref(source->source.unified.imm); + } + } + else /* every remaining type is handled through the source.sstable fields */ + { + if (source->source.sstable.current_rc_block) + { + tidesdb_block_release(source->source.sstable.current_rc_block); + } + else if (source->source.sstable.current_block) + { + tidesdb_klog_block_free(source->source.sstable.current_block); + } + if (source->source.sstable.cache_pin) + { + clock_cache_release(source->source.sstable.cache_pin); + } + tidesdb_iter_clear_block_stash(source); + tidesdb_iter_clear_lazy(source); + if (source->source.sstable.decompressed_data) + { + free(source->source.sstable.decompressed_data); + } + if (source->source.sstable.current_block_data) + { + block_manager_block_release(source->source.sstable.current_block_data); + } + block_manager_cursor_free(source->source.sstable.klog_cursor); + block_manager_cursor_free(source->source.sstable.vlog_cursor); + tidesdb_sstable_unref(NULL, source->source.sstable.sst); /* NOTE(review): db passed as NULL, as in the btree branch */ + } + + tidesdb_kv_pair_free(source->current_kv); + free(source); +} + +/** + * tidesdb_merge_source_advance + * advance a merge source + * @param source merge source to advance + * @return 0 on success, -1 on failure + */ +static int tidesdb_merge_source_advance(tidesdb_merge_source_t *source) +{ + 
tidesdb_kv_pair_free(source->current_kv); + source->current_kv = NULL; + + if (source->type == MERGE_SOURCE_MEMTABLE) + { + /* walk the version chain on the current node before moving to the next key + * so mvcc readers can fall back to an older visible version when the newest + * one is filtered out by snapshot_seq */ + if (skip_list_cursor_advance_in_node(source->source.memtable.cursor) == 0 || + skip_list_cursor_next(source->source.memtable.cursor) == 0) + { + uint8_t *key, *value; + size_t key_size, value_size; + int64_t ttl; + uint8_t deleted; + uint64_t seq; + + if (skip_list_cursor_get_with_seq(source->source.memtable.cursor, &key, &key_size, + &value, &value_size, &ttl, &deleted, &seq) == 0) + { + tidesdb_memtable_source_set_inline_borrowed(source, key, key_size, value, + value_size, ttl, seq, deleted); + return TDB_SUCCESS; + } + } + } + else if (source->type == MERGE_SOURCE_UNIFIED_MEMTABLE) + { + /* in-node version chain stays on the same key so the CF prefix still matches. + * once the chain is exhausted, fall back to advancing to the next key with the + * cf-prefix filter */ + if (skip_list_cursor_advance_in_node(source->source.unified.cursor) == 0) + { + uint8_t *key, *value; + size_t key_size, value_size; + int64_t ttl; + uint8_t deleted; + uint64_t seq; + if (skip_list_cursor_get_with_seq(source->source.unified.cursor, &key, &key_size, + &value, &value_size, &ttl, &deleted, &seq) == 0) + { + const uint8_t *real_key = key + TDB_UNIFIED_CF_PREFIX_SIZE; + const size_t real_key_size = key_size - TDB_UNIFIED_CF_PREFIX_SIZE; + tidesdb_memtable_source_set_inline_borrowed(source, real_key, real_key_size, value, + value_size, ttl, seq, deleted); + return TDB_SUCCESS; + } + } + if (skip_list_cursor_next(source->source.unified.cursor) == 0) + { + if (tidesdb_unified_source_advance_to_cf(source, 1)) + { + return TDB_SUCCESS; + } + } + } + else if (source->type == MERGE_SOURCE_BTREE) + { + if (btree_cursor_next(source->source.btree.cursor) == 0) + { + uint8_t 
*key = NULL, *value = NULL; + size_t key_size = 0, value_size = 0; + uint64_t vlog_offset = 0, seq = 0; + int64_t ttl = 0; + uint8_t deleted = 0; + + if (btree_cursor_get(source->source.btree.cursor, &key, &key_size, &value, &value_size, + &vlog_offset, &seq, &ttl, &deleted) == 0) + { + const uint8_t *actual_value = value; + size_t actual_value_size = value_size; + uint8_t *vlog_value = NULL; + if (vlog_offset > 0) + { + if (tidesdb_btree_read_vlog_value(source->source.btree.vlog_cursor, vlog_offset, + source->config, &vlog_value, + &actual_value_size, value_size) == 0) + { + actual_value = vlog_value; + } + else + { + /* surface the silent data-integrity event, a failed vlog read here + * writes an empty value into the merge output. with F6 this also + * fires on a value-size mismatch. */ + TDB_DEBUG_LOG(TDB_LOG_WARN, + "merge btree vlog read failed (offset=%" PRIu64 + "), value treated as empty in merged output", + vlog_offset); + actual_value = NULL; + actual_value_size = 0; + } + } + + source->current_kv = tidesdb_kv_pair_create(key, key_size, actual_value, + actual_value_size, ttl, seq, deleted); + free(vlog_value); + return TDB_SUCCESS; + } + } + } + else if (source->type == MERGE_SOURCE_TXN_OPS) + { + /* we advance to next entry in sorted txn ops index */ + source->source.txn_ops.pos++; + if (source->source.txn_ops.pos < source->source.txn_ops.count) + { + const int op_idx = source->source.txn_ops.sorted_indices[source->source.txn_ops.pos]; + const tidesdb_txn_op_t *op = &source->source.txn_ops.txn->ops[op_idx]; + + source->current_kv = + tidesdb_kv_pair_create(op->key, op->key_size, op->value, op->value_size, op->ttl, + UINT64_MAX, tidesdb_txn_op_kv_flags(op)); + return TDB_SUCCESS; + } + return TDB_ERR_NOT_FOUND; + } + else + { + /* if we have a lazy (not-yet-deserialized) block with an index, + * parse only the next entry from raw data instead of deserializing + * the entire block. 
this replaces the O(N) full deserialization + * with O(1) per-entry parsing using the pre-built index. */ + if (source->source.sstable.lazy.data && !source->source.sstable.current_block) + { + /* we parse one entry at a time from raw bytes */ + if (source->source.sstable.lazy.idx_count > 0 && source->source.sstable.lazy.idx_base) + { + source->source.sstable.lazy.entry_idx++; + const int idx = source->source.sstable.lazy.entry_idx; + const int count = (int)source->source.sstable.lazy.idx_count; + + if (idx < count) + { + /* we parse single entry from index */ + const uint8_t *idx_base = source->source.sstable.lazy.idx_base; + const uint8_t *bdata = source->source.sstable.lazy.block_data; + const size_t bdata_size = source->source.sstable.lazy.block_data_size; + const uint8_t *fie = idx_base + idx * TDB_BLOCK_INDEX_ENTRY_STRIDE; + const uint32_t e_off = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_ENTRY_OFF); + const uint32_t mk_off = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_KEY_OFF); + const uint32_t mk_sz = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_KEY_SIZE); + const uint32_t sq_lo = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_SEQ_LO); + const uint32_t sq_hi = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_SEQ_HI); + + /* validate the index-sourced offsets against the block before use; a + * malformed entry falls through to clear-lazy + advance (skips the block) */ + if (e_off < bdata_size && mk_off <= bdata_size && mk_sz <= bdata_size - mk_off) + { + const uint8_t *eptr = bdata + e_off; + size_t erem = bdata_size - e_off; + const uint8_t flags = *eptr++; + erem--; + + uint64_t ks, vs; + int br = decode_varint(eptr, &ks, (int)erem); + eptr += br; + erem -= br; + br = decode_varint(eptr, &vs, (int)erem); + eptr += br; + erem -= br; + + /* we skip seq varint to reach ttl/vlog */ + uint64_t seq_skip; + br = decode_varint(eptr, &seq_skip, (int)erem); + eptr += br; + erem -= br; + + int64_t ttl = 0; + if (flags & TDB_KV_FLAG_HAS_TTL) + { + if (erem >= sizeof(int64_t)) 
+ { + ttl = decode_int64_le_compat(eptr); + eptr += sizeof(int64_t); + erem -= sizeof(int64_t); + } + } + + uint64_t vlog_offset = 0; + if (flags & TDB_KV_FLAG_HAS_VLOG) + { + uint64_t vo; + decode_varint(eptr, &vo, (int)erem); + vlog_offset = vo; + } + + const uint64_t abs_seq = ((uint64_t)sq_hi << TDB_U64_HI_LO_SHIFT) | sq_lo; + const uint8_t *key_ptr = bdata + mk_off; + + if (vlog_offset > 0) + { + uint8_t *vlog_value = NULL; + tidesdb_vlog_read_value(source->source.sstable.db, + source->source.sstable.sst, vlog_offset, + (size_t)vs, &vlog_value); + source->current_kv = + tidesdb_kv_pair_create(key_ptr, mk_sz, vlog_value, (size_t)vs, ttl, + abs_seq, flags & TDB_KV_TOMBSTONE_FLAG_MASK); + free(vlog_value); + } + else + { + /* the inline value sits at [mk_off+mk_sz, +vs); if vs runs past the + * block (malformed) treat it as empty rather than over-read */ + const int val_ok = vs <= bdata_size - mk_off - mk_sz; + tidesdb_kv_pair_t *ikv = &source->inline_kv; + ikv->entry.flags = + (flags & TDB_KV_TOMBSTONE_FLAG_MASK) | TDB_KV_FLAG_BORROWED; + ikv->entry.key_size = mk_sz; + ikv->entry.value_size = val_ok ? (uint32_t)vs : 0; + ikv->entry.seq = abs_seq; + ikv->entry.ttl = ttl; + ikv->entry.vlog_offset = 0; + ikv->key = (uint8_t *)key_ptr; + ikv->value = + (vs > 0 && val_ok) ? 
(uint8_t *)(bdata + mk_off + mk_sz) : NULL; + source->current_kv = ikv; + } + + /* we prefetch next block when we reach the last entry */ + if (idx + 1 >= count) + { + const tidesdb_sstable_t *sst = source->source.sstable.sst; + block_manager_cursor_t *kc = source->source.sstable.klog_cursor; + if (sst->klog_bm && kc && + (sst->klog_data_end_offset == 0 || + kc->current_pos < sst->klog_data_end_offset)) + { + prefetch_file_region(sst->klog_bm->fd, (off_t)kc->current_pos, + (off_t)TDB_KLOG_BLOCK_SIZE); + } + } + + return TDB_SUCCESS; + } + } + + /* exhausted indexed lazy block, we clear and fall through to next block */ + tidesdb_iter_clear_lazy(source); + goto advance_next_block; + } + + /* non-indexed lazy block, we fall back to full deserialization */ + const uint8_t *deser_ptr = source->source.sstable.lazy.block_data; + size_t deser_size = source->source.sstable.lazy.block_data_size; + + tidesdb_klog_block_t *kb = NULL; + if (tidesdb_klog_block_deserialize(deser_ptr, deser_size, &kb, 1) == 0 && kb) + { + kb->data_ref = NULL; + source->source.sstable.current_block = kb; + source->source.sstable.cache_pin = source->source.sstable.lazy.pin; + source->source.sstable.lazy.pin = NULL; + source->source.sstable.current_block_data = source->source.sstable.lazy.bmblock; + source->source.sstable.lazy.bmblock = NULL; + source->source.sstable.decompressed_data = source->source.sstable.lazy.decompressed; + source->source.sstable.lazy.decompressed = NULL; + tidesdb_iter_clear_lazy(source); + } + else + { + tidesdb_iter_clear_lazy(source); + return TDB_ERR_CORRUPTION; + } + } + + /* we advance to next entry in current block or next block */ + source->source.sstable.current_entry_idx++; + + const tidesdb_klog_block_t *kb = source->source.sstable.current_block; + if (kb && (uint32_t)source->source.sstable.current_entry_idx < kb->num_entries) + { + const int idx = source->source.sstable.current_entry_idx; + const tidesdb_klog_entry_t *e = &kb->entries[idx]; + + if (e->vlog_offset 
> 0) + { + uint8_t *vlog_value = NULL; + tidesdb_vlog_read_value(source->source.sstable.db, source->source.sstable.sst, + e->vlog_offset, e->value_size, &vlog_value); + source->current_kv = + tidesdb_kv_pair_create(kb->keys[idx], e->key_size, vlog_value, e->value_size, + e->ttl, e->seq, e->flags & TDB_KV_TOMBSTONE_FLAG_MASK); + free(vlog_value); + } + else + { + tidesdb_kv_pair_t *ikv = &source->inline_kv; + ikv->entry = *e; + ikv->entry.flags = (e->flags & TDB_KV_TOMBSTONE_FLAG_MASK) | TDB_KV_FLAG_BORROWED; + ikv->key = kb->keys[idx]; + ikv->value = (uint8_t *)kb->inline_values[idx]; + source->current_kv = ikv; + } + + if ((uint32_t)(idx + 1) >= kb->num_entries) + { + const tidesdb_sstable_t *sst = source->source.sstable.sst; + block_manager_cursor_t *kc = source->source.sstable.klog_cursor; + if (sst->klog_bm && kc && + (sst->klog_data_end_offset == 0 || kc->current_pos < sst->klog_data_end_offset)) + { + prefetch_file_region(sst->klog_bm->fd, (off_t)kc->current_pos, + (off_t)TDB_KLOG_BLOCK_SIZE); + } + } + + return TDB_SUCCESS; + } + + advance_next_block: + if (source->source.sstable.current_rc_block) + { + tidesdb_block_release(source->source.sstable.current_rc_block); + source->source.sstable.current_rc_block = NULL; + } + else if (source->source.sstable.current_block) + { + tidesdb_klog_block_free(source->source.sstable.current_block); + } + source->source.sstable.current_block = NULL; + if (source->source.sstable.cache_pin) + { + clock_cache_release(source->source.sstable.cache_pin); + source->source.sstable.cache_pin = NULL; + } + if (source->source.sstable.decompressed_data) + { + free(source->source.sstable.decompressed_data); + source->source.sstable.decompressed_data = NULL; + } + if (source->source.sstable.current_block_data) + { + block_manager_block_release(source->source.sstable.current_block_data); + source->source.sstable.current_block_data = NULL; + } + + /* we loop to handle block read failures by trying next block */ + while 
(block_manager_cursor_next(source->source.sstable.klog_cursor) == 0) + { + if (source->source.sstable.sst->klog_data_end_offset > 0 && + source->source.sstable.klog_cursor->current_pos >= + source->source.sstable.sst->klog_data_end_offset) + { + /* reached end of data blocks */ + return TDB_ERR_NOT_FOUND; + } + + /* we release any previous cache pin before reading the next block */ + if (source->source.sstable.cache_pin) + { + clock_cache_release(source->source.sstable.cache_pin); + source->source.sstable.cache_pin = NULL; + } + + /* we try block cache first to avoid pread syscall during sequential iteration. + * this mirrors the cache-first pattern used in tidesdb_iter_read_klog_block + * and tidesdb_iter_seek_sstable_source_forward. */ + const uint8_t *data = NULL; + size_t data_size = 0; + uint8_t *decompressed = NULL; + block_manager_block_t *block = NULL; + clock_cache_entry_t *pin = NULL; + tidesdb_sstable_t *sst = source->source.sstable.sst; + const char *cf_name = sst->cf_name; + const int has_cf_name = (cf_name[0] != '\0'); + + if (sst->db && sst->db->clock_cache && has_cf_name) + { + size_t cached_size = 0; + const uint8_t *cached_data = tidesdb_cache_raw_block_get_pinned( + sst->db, cf_name, sst->klog_filename, + source->source.sstable.klog_cursor->current_pos, &cached_size, &pin); + if (cached_data) + { + /* when the cached block has an index (from a prior seek), + * we set up lazy state and parse the first entry via the + * index. this avoids the O(N) full klog_block_deserialize + * and instead uses O(1) per-entry incremental parsing -- + * the same path that seek uses. 
*/ + if (cached_size >= TDB_BLOCK_INDEX_HDR_BASE) + { + const uint32_t maybe_magic = decode_uint32_le_compat(cached_data); + if (maybe_magic == TDB_BLOCK_INDEX_MAGIC) + { + const uint32_t hdr_size = decode_uint32_le_compat(cached_data + 4); + const uint32_t idx_count = decode_uint32_le_compat(cached_data + 8); + if (hdr_size < cached_size && idx_count > 0) + { + const uint8_t *idx_base = cached_data + TDB_BLOCK_INDEX_HDR_BASE; + const uint8_t *bdata = cached_data + hdr_size; + const size_t bdata_size = cached_size - hdr_size; + + /* we parse first entry from block index */ + const uint8_t *fie = idx_base; + const uint32_t e_off = + decode_uint32_le_compat(fie + TDB_BLOCK_IDX_ENTRY_OFF); + const uint32_t mk_off = + decode_uint32_le_compat(fie + TDB_BLOCK_IDX_KEY_OFF); + const uint32_t mk_sz = + decode_uint32_le_compat(fie + TDB_BLOCK_IDX_KEY_SIZE); + const uint32_t sq_lo = + decode_uint32_le_compat(fie + TDB_BLOCK_IDX_SEQ_LO); + const uint32_t sq_hi = + decode_uint32_le_compat(fie + TDB_BLOCK_IDX_SEQ_HI); + + if (e_off < bdata_size && mk_off <= bdata_size && + mk_sz <= bdata_size - mk_off) + { + const uint8_t *eptr = bdata + e_off; + size_t erem = bdata_size - e_off; + const uint8_t flags = *eptr++; + erem--; + + uint64_t ks, vs; + int br = decode_varint(eptr, &ks, (int)erem); + eptr += br; + erem -= br; + br = decode_varint(eptr, &vs, (int)erem); + eptr += br; + erem -= br; + uint64_t seq_skip; + br = decode_varint(eptr, &seq_skip, (int)erem); + eptr += br; + erem -= br; + int64_t ttl = 0; + if (flags & TDB_KV_FLAG_HAS_TTL) + { + if (erem >= sizeof(int64_t)) + { + ttl = decode_int64_le_compat(eptr); + eptr += sizeof(int64_t); + erem -= sizeof(int64_t); + } + } + uint64_t vlog_offset = 0; + if (flags & TDB_KV_FLAG_HAS_VLOG) + { + uint64_t vo; + decode_varint(eptr, &vo, (int)erem); + vlog_offset = vo; + } + + const uint64_t abs_seq = + ((uint64_t)sq_hi << TDB_U64_HI_LO_SHIFT) | sq_lo; + const uint8_t *key_ptr = bdata + mk_off; + + if (vlog_offset > 0) + { + uint8_t 
*vlog_value = NULL; + tidesdb_vlog_read_value( + source->source.sstable.db, source->source.sstable.sst, + vlog_offset, (size_t)vs, &vlog_value); + source->current_kv = tidesdb_kv_pair_create( + key_ptr, mk_sz, vlog_value, (size_t)vs, ttl, abs_seq, + flags & TDB_KV_TOMBSTONE_FLAG_MASK); + free(vlog_value); + } + else + { + const int val_ok = vs <= bdata_size - mk_off - mk_sz; + tidesdb_kv_pair_t *ikv = &source->inline_kv; + ikv->entry.flags = (flags & TDB_KV_TOMBSTONE_FLAG_MASK) | + TDB_KV_FLAG_BORROWED; + ikv->entry.key_size = mk_sz; + ikv->entry.value_size = val_ok ? (uint32_t)vs : 0; + ikv->entry.seq = abs_seq; + ikv->entry.ttl = ttl; + ikv->entry.vlog_offset = 0; + ikv->key = (uint8_t *)key_ptr; + ikv->value = (vs > 0 && val_ok) + ? (uint8_t *)(bdata + mk_off + mk_sz) + : NULL; + source->current_kv = ikv; + } + + /* we set up lazy state so subsequent advance() calls + * parse entries incrementally from the index */ + tidesdb_iter_clear_lazy(source); + source->source.sstable.lazy.data = cached_data; + source->source.sstable.lazy.size = cached_size; + source->source.sstable.lazy.pin = pin; + source->source.sstable.lazy.block_data = bdata; + source->source.sstable.lazy.block_data_size = bdata_size; + source->source.sstable.lazy.idx_base = idx_base; + source->source.sstable.lazy.idx_count = idx_count; + source->source.sstable.lazy.entry_idx = 0; + source->source.sstable.lazy.bmblock = NULL; + source->source.sstable.lazy.decompressed = NULL; + source->source.sstable.current_entry_idx = 0; + /* bdata_size is the decompressed size, not + * the on-disk size cursor_next needs. invalidate + * so cursor_next re-reads the size header. 
*/ + source->source.sstable.klog_cursor->block_size_valid = 0; + return TDB_SUCCESS; + } + } + } + } + + /* non-indexed cache hit -- fall through to full deserialize */ + data = cached_data; + data_size = cached_size; + source->source.sstable.cache_pin = pin; + goto advance_deserialize; + } + } + + block = block_manager_cursor_read(source->source.sstable.klog_cursor); + if (!block) + { + /* block read failed, we try next block */ + continue; + } + + /* block is owned by us, we decompress if needed */ + data = block->data; + data_size = block->size; + + if (source->config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t decompressed_size; + decompressed = decompress_data(block->data, block->size, &decompressed_size, + source->config->compression_algorithm); + if (decompressed) + { + data = decompressed; + data_size = decompressed_size; + /* we keep decompressed buffer, deserialized pointers reference it */ + source->source.sstable.decompressed_data = decompressed; + } + } + + /* populate cache for future iterations over this block. + * we cache raw data here (not indexed) because sequential advance + * reads each block once; building the index would be wasted CPU. + * the seek path builds indexed format on its own cache-insert. 
*/ + if (sst->db && sst->db->clock_cache && has_cf_name) + { + tidesdb_cache_raw_block_put(sst->db, cf_name, sst->klog_filename, + source->source.sstable.klog_cursor->current_pos, data, + data_size); + } + + advance_deserialize: + tidesdb_klog_block_free(source->source.sstable.current_block); + source->source.sstable.current_block = NULL; + + const int deserialize_result = tidesdb_klog_block_deserialize( + data, data_size, &source->source.sstable.current_block, 1); + + if (deserialize_result != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Klog block deserialization failed (error=%d), " + "trying next block for SSTable %" PRIu64, + deserialize_result, source->source.sstable.sst->id); + if (decompressed) + { + free(decompressed); + source->source.sstable.decompressed_data = NULL; + } + block_manager_block_release(block); + /* deserialization failed, we try next block */ + continue; + } + + if (source->source.sstable.current_block && + source->source.sstable.current_block->num_entries > 0) + { + source->source.sstable.current_entry_idx = 0; + + const tidesdb_klog_block_t *current_kb = source->source.sstable.current_block; + const tidesdb_klog_entry_t *e0 = ¤t_kb->entries[0]; + + if (e0->vlog_offset > 0) + { + uint8_t *vlog_value = NULL; + tidesdb_vlog_read_value(source->source.sstable.db, source->source.sstable.sst, + e0->vlog_offset, e0->value_size, &vlog_value); + source->current_kv = tidesdb_kv_pair_create( + current_kb->keys[0], e0->key_size, vlog_value, e0->value_size, e0->ttl, + e0->seq, e0->flags & TDB_KV_TOMBSTONE_FLAG_MASK); + free(vlog_value); + } + else + { + tidesdb_kv_pair_t *ikv = &source->inline_kv; + ikv->entry = *e0; + ikv->entry.flags = + (e0->flags & TDB_KV_TOMBSTONE_FLAG_MASK) | TDB_KV_FLAG_BORROWED; + ikv->key = current_kb->keys[0]; + ikv->value = (uint8_t *)current_kb->inline_values[0]; + source->current_kv = ikv; + } + source->source.sstable.current_block_data = block; + return TDB_SUCCESS; + } + + /* empty block or other issue, we clean up and try 
/**
 * tidesdb_merge_source_retreat
 * step a merge source back by one entry and load that entry into source->current_kv.
 * the previous current_kv is always released first, so on any non-success return
 * source->current_kv is NULL.
 * @param source merge source to retreat
 * @return TDB_SUCCESS when the source is positioned on a previous entry,
 *         TDB_ERR_NOT_FOUND when there is no previous entry (or it could not be loaded),
 *         -1 when source is NULL
 */
static int tidesdb_merge_source_retreat(tidesdb_merge_source_t *source)
{
    if (source == NULL) return -1;

    /* drop whatever entry the source was positioned on before moving */
    tidesdb_kv_pair_free(source->current_kv);
    source->current_kv = NULL;

    if (source->type == MERGE_SOURCE_MEMTABLE)
    {
        if (skip_list_cursor_prev(source->source.memtable.cursor) == 0)
        {
            uint8_t *key, *value;
            size_t key_size, value_size;
            int64_t ttl;
            uint8_t deleted;
            uint64_t seq;

            if (skip_list_cursor_get_with_seq(source->source.memtable.cursor, &key, &key_size,
                                              &value, &value_size, &ttl, &deleted, &seq) == 0)
            {
                /* the entry is materialized through tidesdb_kv_pair_create here */
                source->current_kv =
                    tidesdb_kv_pair_create(key, key_size, value, value_size, ttl, seq, deleted);
                return TDB_SUCCESS;
            }
        }
    }
    else if (source->type == MERGE_SOURCE_UNIFIED_MEMTABLE)
    {
        if (skip_list_cursor_prev(source->source.unified.cursor) == 0)
        {
            /* second argument is 0 on this backward path (the forward path passes 1);
             * a non-zero result is treated as "positioned on an entry" */
            if (tidesdb_unified_source_advance_to_cf(source, 0))
            {
                return TDB_SUCCESS;
            }
        }
    }
    else if (source->type == MERGE_SOURCE_BTREE)
    {
        if (btree_cursor_prev(source->source.btree.cursor) == 0)
        {
            uint8_t *key = NULL, *value = NULL;
            size_t key_size = 0, value_size = 0;
            uint64_t vlog_offset = 0, seq = 0;
            int64_t ttl = 0;
            uint8_t deleted = 0;

            if (btree_cursor_get(source->source.btree.cursor, &key, &key_size, &value, &value_size,
                                 &vlog_offset, &seq, &ttl, &deleted) == 0)
            {
                /* start from the inline value; a non-zero vlog_offset replaces it below */
                const uint8_t *actual_value = value;
                size_t actual_value_size = value_size;
                uint8_t *vlog_value = NULL;
                if (vlog_offset > 0)
                {
                    if (tidesdb_btree_read_vlog_value(source->source.btree.vlog_cursor, vlog_offset,
                                                      source->config, &vlog_value,
                                                      &actual_value_size, value_size) == 0)
                    {
                        actual_value = vlog_value;
                    }
                    else
                    {
                        /* surface the silent data-integrity event, a failed vlog read here
                         * writes an empty value into the merge output. with F6 this also
                         * fires on a value-size mismatch. */
                        TDB_DEBUG_LOG(TDB_LOG_WARN,
                                      "merge btree vlog read failed (offset=%" PRIu64
                                      "), value treated as empty in merged output",
                                      vlog_offset);
                        actual_value = NULL;
                        actual_value_size = 0;
                    }
                }

                source->current_kv = tidesdb_kv_pair_create(key, key_size, actual_value,
                                                            actual_value_size, ttl, seq, deleted);
                /* vlog_value is freed right after the pair is created */
                free(vlog_value);
                return TDB_SUCCESS;
            }
        }
    }
    else if (source->type == MERGE_SOURCE_TXN_OPS)
    {
        /* we retreat to previous entry in sorted txn ops index */
        source->source.txn_ops.pos--;
        if (source->source.txn_ops.pos >= 0)
        {
            const int op_idx = source->source.txn_ops.sorted_indices[source->source.txn_ops.pos];
            const tidesdb_txn_op_t *op = &source->source.txn_ops.txn->ops[op_idx];

            /* txn ops are emitted with seq UINT64_MAX */
            source->current_kv =
                tidesdb_kv_pair_create(op->key, op->key_size, op->value, op->value_size, op->ttl,
                                       UINT64_MAX, tidesdb_txn_op_kv_flags(op));
            return TDB_SUCCESS;
        }
        return TDB_ERR_NOT_FOUND;
    }
    else
    {
        /* every other source type is handled as an sstable source:
         * we move to previous entry in current block or previous block */
        const tidesdb_klog_block_t *kb = source->source.sstable.current_block;

        /* we check if we can move to previous entry in current block */
        if (kb && source->source.sstable.current_entry_idx > 0)
        {
            /* we move to previous entry in current block */
            source->source.sstable.current_entry_idx--;
            const int idx = source->source.sstable.current_entry_idx;
            const tidesdb_klog_entry_t *e = &kb->entries[idx];

            if (e->vlog_offset > 0)
            {
                /* NOTE(review): the result of tidesdb_vlog_read_value is not checked; if it
                 * leaves vlog_value NULL the pair is still created with e->value_size --
                 * confirm tidesdb_kv_pair_create tolerates that */
                uint8_t *vlog_value = NULL;
                tidesdb_vlog_read_value(source->source.sstable.db, source->source.sstable.sst,
                                        e->vlog_offset, e->value_size, &vlog_value);
                source->current_kv =
                    tidesdb_kv_pair_create(kb->keys[idx], e->key_size, vlog_value, e->value_size,
                                           e->ttl, e->seq, e->flags & TDB_KV_TOMBSTONE_FLAG_MASK);
                free(vlog_value);
            }
            else
            {
                /* zero-copy borrowed: inline_kv points straight at the block's key and
                 * inline value and is flagged TDB_KV_FLAG_BORROWED */
                tidesdb_kv_pair_t *ikv = &source->inline_kv;
                ikv->entry = *e;
                ikv->entry.flags = (e->flags & TDB_KV_TOMBSTONE_FLAG_MASK) | TDB_KV_FLAG_BORROWED;
                ikv->key = kb->keys[idx];
                ikv->value = (uint8_t *)kb->inline_values[idx];
                source->current_kv = ikv;
            }
            return TDB_SUCCESS;
        }
        /** we check if we can move to a previous block */
        if (!block_manager_cursor_has_prev(source->source.sstable.klog_cursor))
        {
            /* already at first block, we cant go back */
            return TDB_ERR_NOT_FOUND;
        }

        /* release everything tied to the block we are leaving.
         * NOTE(review): tidesdb_merge_source_advance also releases sstable.cache_pin (and
         * clears lazy state) before switching blocks; this path does not -- confirm a pin
         * or lazy block cannot be live when retreat is called */
        if (source->source.sstable.current_rc_block)
        {
            tidesdb_block_release(source->source.sstable.current_rc_block);
            source->source.sstable.current_rc_block = NULL;
        }
        else if (source->source.sstable.current_block)
        {
            tidesdb_klog_block_free(source->source.sstable.current_block);
        }
        source->source.sstable.current_block = NULL;
        if (source->source.sstable.decompressed_data)
        {
            free(source->source.sstable.decompressed_data);
            source->source.sstable.decompressed_data = NULL;
        }
        if (source->source.sstable.current_block_data)
        {
            block_manager_block_release(source->source.sstable.current_block_data);
            source->source.sstable.current_block_data = NULL;
        }

        /* we must loop to handle block read failures by trying previous block */
        while (block_manager_cursor_prev(source->source.sstable.klog_cursor) == 0)
        {
            /* we check if cursor is past data end offset (into auxiliary structures) */
            if (source->source.sstable.sst->klog_data_end_offset > 0 &&
                source->source.sstable.klog_cursor->current_pos >=
                    source->source.sstable.sst->klog_data_end_offset)
            {
                /* reached end of data blocks (moved into auxiliary structures) */
                return TDB_ERR_NOT_FOUND;
            }

            block_manager_block_t *block =
                block_manager_cursor_read(source->source.sstable.klog_cursor);
            if (!block)
            {
                /* block read failed, we try previous block */
                continue;
            }

            /* block is owned by us, we decompress if needed */
            const uint8_t *data = block->data;
            size_t data_size = block->size;
            uint8_t *decompressed = NULL;

            if (source->config->compression_algorithm != TDB_COMPRESS_NONE)
            {
                size_t decompressed_size;
                decompressed = decompress_data(block->data, block->size, &decompressed_size,
                                               source->config->compression_algorithm);
                /* when decompress_data returns NULL, data/data_size keep pointing at the
                 * raw block bytes and deserialization is still attempted on them */
                if (decompressed)
                {
                    data = decompressed;
                    data_size = decompressed_size;
                    /* we keep decompressed buffer, deserialized pointers reference it */
                    source->source.sstable.decompressed_data = decompressed;
                }
            }

            /* a block left over from an earlier loop iteration (deserialized but empty)
             * is freed here before the next deserialize */
            tidesdb_klog_block_free(source->source.sstable.current_block);
            source->source.sstable.current_block = NULL;

            const int deserialize_result = tidesdb_klog_block_deserialize(
                data, data_size, &source->source.sstable.current_block, 1);

            if (deserialize_result != 0)
            {
                TDB_DEBUG_LOG(TDB_LOG_ERROR,
                              "Klog block deserialization failed (error=%d), "
                              "trying previous block for SSTable %" PRIu64,
                              deserialize_result, source->source.sstable.sst->id);
                if (decompressed)
                {
                    free(decompressed);
                    source->source.sstable.decompressed_data = NULL;
                }
                block_manager_block_release(block);
                /* deserialization failed, we try previous block */
                continue;
            }

            if (source->source.sstable.current_block &&
                source->source.sstable.current_block->num_entries > 0)
            {
                /* deserialization succeeded? its now safe to store block */
                source->source.sstable.current_block_data = block;

                /* we start at last entry of previous block */
                source->source.sstable.current_entry_idx =
                    (int)(source->source.sstable.current_block->num_entries - 1);

                const tidesdb_klog_block_t *current_kb = source->source.sstable.current_block;
                const int idx = source->source.sstable.current_entry_idx;
                const uint8_t *value = current_kb->inline_values[idx];

                /* a non-zero vlog_offset replaces the inline value with the vlog read.
                 * NOTE(review): the read result is not checked, value becomes whatever
                 * vlog_value was left as (NULL if untouched) -- confirm this is handled */
                uint8_t *vlog_value = NULL;
                if (current_kb->entries[idx].vlog_offset > 0)
                {
                    tidesdb_vlog_read_value(source->source.sstable.db, source->source.sstable.sst,
                                            current_kb->entries[idx].vlog_offset,
                                            current_kb->entries[idx].value_size, &vlog_value);
                    value = vlog_value;
                }

                /* this path always builds the pair via tidesdb_kv_pair_create, unlike the
                 * in-block step above which borrows inline values */
                source->current_kv = tidesdb_kv_pair_create(
                    current_kb->keys[idx], current_kb->entries[idx].key_size, value,
                    current_kb->entries[idx].value_size, current_kb->entries[idx].ttl,
                    current_kb->entries[idx].seq,
                    current_kb->entries[idx].flags & TDB_KV_TOMBSTONE_FLAG_MASK);

                free(vlog_value);
                return TDB_SUCCESS;
            }

            /* empty block or other issue, clean up and try previous block */
            if (decompressed)
            {
                free(decompressed);
                source->source.sstable.decompressed_data = NULL;
            }
            block_manager_block_release(block);
        }
    }

    return TDB_ERR_NOT_FOUND;
}
+ * @param level_num the level number (1-indexed) + * @param base_capacity the base capacity (typically write_buffer_size) + * @param ratio the size ratio (T) + * @return the capacity of the level + */ +static size_t tidesdb_calculate_level_capacity(const int level_num, const size_t base_capacity, + const size_t ratio) +{ + /*** initial capacity formula + * C_i = base * T^(i-1) for level i + * l1 -- base * T^0 = base + * l2 -- base * T^1 = base * T + * l3 -- base * T^2 = base * T^2 + * will be adjusted by DCA once data is written + * uses overflow checking to prevent wraparound */ + size_t capacity = base_capacity; + const size_t max_capacity = SIZE_MAX / 2; /* cap at half of SIZE_MAX for safety */ + + for (int i = 1; i < level_num; i++) + { + /* we must check for overflow before multiplication */ + if (capacity > max_capacity / ratio) + { + /* would overflow -- saturate at max_capacity */ + TDB_DEBUG_LOG( + TDB_LOG_WARN, + "Level capacity calculation would overflow at level %d, saturating at %zu", + level_num, max_capacity); + return max_capacity; + } + capacity *= ratio; + } + return capacity; +} + +/** + * tidesdb_add_level + * add a new level to the column family + * @param cf the column family + * @return TDB_SUCCESS on success, TDB_ERR_MEMORY on failure + */ +static int tidesdb_add_level(tidesdb_column_family_t *cf) +{ + int old_num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + if (old_num_levels >= TDB_MAX_LEVELS) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Cannot add level - already at max (%d)", TDB_MAX_LEVELS); + return TDB_ERR_INVALID_ARGS; + } + + if (old_num_levels > 0) + { + tidesdb_level_t *largest = cf->levels[old_num_levels - 1]; + size_t largest_size = atomic_load_explicit(&largest->current_size, memory_order_relaxed); + size_t largest_capacity = atomic_load_explicit(&largest->capacity, memory_order_relaxed); + int num_sstables = atomic_load_explicit(&largest->num_sstables, memory_order_acquire); + + /* we recheck if 
largest level still needs expansion */ + if (num_sstables == 0 && largest_size < largest_capacity) + { + return TDB_SUCCESS; + } + } + + /* we calculate capacity for new level */ + size_t new_capacity = tidesdb_calculate_level_capacity( + old_num_levels + 1, cf->config.write_buffer_size, cf->config.level_size_ratio); + + /* a previously removed level may be parked in this slot. reusing it keeps + * the level struct from ever being freed mid-life, so lock-free readers + * iterating cf->levels cannot dereference freed memory. a parked level is + * always empty (remove only parks empty levels) so only capacity needs + * resetting; otherwise we create a fresh level at the next slot. */ + tidesdb_level_t *new_level = cf->levels[old_num_levels]; + if (new_level) + { + atomic_store_explicit(&new_level->capacity, new_capacity, memory_order_release); + atomic_store_explicit(&new_level->current_size, 0, memory_order_release); + } + else + { + new_level = tidesdb_level_create(old_num_levels + 1, new_capacity); + if (!new_level) + { + return TDB_ERR_MEMORY; + } + cf->levels[old_num_levels] = new_level; + } + + /* new level is empty -- data will flow down naturally through compaction. + * old largest level keeps its ssts. + * + * spooky paper (algorithm 1) suggests moving data from old + * largest to new largest during level addition. we intentionally do not do this + * because it causes key loss and breaks the LSM-tree structure. instead, we let + * normal compaction move data down, which is simpler and correct. 
*/ + TDB_DEBUG_LOG(TDB_LOG_INFO, "Added empty level %d, old largest level %d keeps its data", + new_level->level_num, old_num_levels); + + /* we atomically increment active level count -- this publishes the new level + * release ordering ensures the new level is visible to other threads */ + atomic_store_explicit(&cf->num_active_levels, old_num_levels + 1, memory_order_release); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Published %d active levels", old_num_levels + 1); + for (int log_i = 0; log_i < old_num_levels + 1; log_i++) + { + tidesdb_level_t *log_lvl = cf->levels[log_i]; + if (log_lvl) + { + int log_num = atomic_load_explicit(&log_lvl->num_sstables, memory_order_acquire); + TDB_DEBUG_LOG(TDB_LOG_INFO, "Levels[%d] level_num=%d, %d SSTables", log_i, + log_lvl->level_num, log_num); + } + } + + /* we must ensure level addition is visible to all threads */ + atomic_thread_fence(memory_order_release); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Added level %d, now have %d levels", new_level->level_num, + old_num_levels + 1); + + return TDB_SUCCESS; +} + +/** + * tidesdb_remove_level + * remove the last level from the column family + * @param cf the column family + * @return TDB_SUCCESS on success, TDB_ERR_INVALID_ARGS on failure + */ +static int tidesdb_remove_level(tidesdb_column_family_t *cf) +{ + TDB_DEBUG_LOG(TDB_LOG_INFO, "Attempting to remove level from CF '%s'", cf->name); + int old_num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + /* we enforce minimum levels! 
never go below min_levels, the floor */ + if (old_num_levels <= cf->config.min_levels) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "At minimum levels (%d <= %d), not removing", old_num_levels, + cf->config.min_levels); + return TDB_SUCCESS; /* not an error, just at minimum */ + } + + tidesdb_level_t *largest = cf->levels[old_num_levels - 1]; + int num_largest_ssts = atomic_load_explicit(&largest->num_sstables, memory_order_acquire); + + /* we only remove level if it's completely empty */ + if (num_largest_ssts > 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Cannot remove level %d - has %d SSTables", largest->level_num, + num_largest_ssts); + return TDB_SUCCESS; + } + + /** we update capacity of new largest level (was L-1, now L) + * C_new_L = C_old_L / T */ + int new_num_levels = old_num_levels - 1; + if (new_num_levels > 0) + { + tidesdb_level_t *new_largest = cf->levels[new_num_levels - 1]; + size_t old_largest_capacity = + atomic_load_explicit(&largest->capacity, memory_order_relaxed); + size_t new_largest_capacity = old_largest_capacity / cf->config.level_size_ratio; + + if (new_largest_capacity < cf->config.write_buffer_size) + { + new_largest_capacity = cf->config.write_buffer_size; + } + + atomic_store_explicit(&new_largest->capacity, new_largest_capacity, memory_order_release); + TDB_DEBUG_LOG(TDB_LOG_INFO, "Updated new largest level %d capacity to %zu", + new_largest->level_num, new_largest_capacity); + } + + /* we do not free the removed level. lock-free readers iterate cf->levels up + * to a possibly stale num_active_levels, so freeing the struct here would be + * a use after free. instead the empty level stays parked in its slot and + * tidesdb_add_level reuses it, which bounds level structs to TDB_MAX_LEVELS + * per cf; tidesdb_column_family_free frees them all at close. 
*/ + TDB_DEBUG_LOG(TDB_LOG_INFO, "Parking removed empty level %d for reuse", largest->level_num); + + /* we update num_active_levels to reflect removed level + * release ordering ensures the level removal is visible to other threads */ + atomic_store_explicit(&cf->num_active_levels, new_num_levels, memory_order_release); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Removed level, now have %d levels", new_num_levels); + + tidesdb_apply_dca(cf); + + return TDB_SUCCESS; +} + +/** + * tidesdb_apply_dca + * apply dynamic capacity adaptation to the column family + * @param cf the column family + * @return TDB_SUCCESS on success, TDB_ERR_INVALID_ARGS on failure + */ +static int tidesdb_apply_dca(tidesdb_column_family_t *cf) +{ + if (!cf) return TDB_ERR_INVALID_ARGS; + + int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + if (num_levels < 2) + { + return TDB_SUCCESS; + } + + /* we get data size at largest level */ + tidesdb_level_t *largest = cf->levels[num_levels - 1]; + size_t N_L = atomic_load(&largest->current_size); + + /* we update capacities C_i = N_L / T^(L-i) + * paper uses 1-based level numbering (level 1, 2, 3...) + * we use 0-based array indexing (levels[0], levels[1], levels[2]...) + * so we adjust -- for array index i, the level number is i+1 + * formula becomes -- C[i] = N_L / T^(L-(i+1)) = N_L / T^(L-1-i) */ + for (int i = 0; i < num_levels - 1; i++) + { + size_t power = num_levels - 1 - i; /* L - 1 - i (adjusted for 0-based indexing) */ + const size_t ratio = cf->config.level_size_ratio; + size_t divisor = 1; + int divisor_overflow = 0; + /* ratio <= 1 leaves divisor == 1 (no leveling, and avoids a divide-by-zero when + * ratio == 0); otherwise guard the running product against size_t overflow -- with + * ratio 10 and a deep tree, T^power exceeds size_t past ~19 levels and would wrap. 
*/ + for (size_t p = 0; p < power && ratio > 1; p++) + { + if (divisor > SIZE_MAX / ratio) + { + divisor_overflow = 1; + break; + } + divisor *= ratio; + } + + size_t old_capacity = atomic_load_explicit(&cf->levels[i]->capacity, memory_order_acquire); + /* an overflowed divisor means N_L / divisor underflows toward 0; floor to the write + * buffer size, same as the normal small-capacity case below */ + size_t new_capacity = divisor_overflow ? cf->config.write_buffer_size : N_L / divisor; + + if (new_capacity < cf->config.write_buffer_size) + { + new_capacity = cf->config.write_buffer_size; + } + + if (new_capacity != old_capacity) + { + atomic_store_explicit(&cf->levels[i]->capacity, new_capacity, memory_order_release); + } + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_snapshot_sst_ids + * snapshot sstable IDs from a range of levels to prevent race with flush workers + * @param cf the column family + * @param start_level start level (0-indexed) + * @param end_level end level (0-indexed, inclusive) + * @return queue of uint64_t* IDs, or NULL on failure + */ +static queue_t *tidesdb_snapshot_sst_ids(const tidesdb_column_family_t *cf, const int start_level, + const int end_level) +{ + queue_t *snapshot = queue_new(); + if (!snapshot) return NULL; + + for (int level = start_level; level <= end_level; level++) + { + tidesdb_level_t *lvl = cf->levels[level]; + + /* we hold array_readers to prevent retire_array from freeing the array + * while we iterate -- a concurrent flush on L1 can swap the array and + * a second flush would free the one we loaded without this guard */ + atomic_fetch_add_explicit(&lvl->array_readers, 1, memory_order_acq_rel); + + const int num_ssts = atomic_load_explicit(&lvl->num_sstables, memory_order_acquire); + tidesdb_sstable_t **sstables = atomic_load_explicit(&lvl->sstables, memory_order_acquire); + + for (int i = 0; i < num_ssts; i++) + { + tidesdb_sstable_t *sst = sstables[i]; + if (!sst) continue; + + uint64_t *id_copy = 
/**
 * tidesdb_sst_in_snapshot_array
 * check whether an sstable ID appears in an array of snapshotted ID pointers
 * (linear scan)
 * @param ids array of pointers to sstable IDs; NULL slots are skipped and a
 *            NULL array is treated as empty
 * @param count number of slots in ids
 * @param sst_id the sstable ID to look for
 * @return 1 if sst_id is in the array, 0 otherwise
 */
static int tidesdb_sst_in_snapshot_array(const uint64_t *const *ids, size_t count,
                                         const uint64_t sst_id)
{
    /* a missing array holds nothing; guard so a non-zero count cannot dereference NULL */
    if (!ids) return 0;

    for (size_t j = 0; j < count; j++)
    {
        const uint64_t *id = ids[j];
        if (id && *id == sst_id) return 1;
    }
    return 0;
}
(!snap_ids) return TDB_ERR_MEMORY; + + const size_t snap_count = queue_snapshot(snapshot, (void **)snap_ids, snapshot_size); + + tidesdb_sstable_t **ssts_array = malloc(snapshot_size * sizeof(tidesdb_sstable_t *)); + if (!ssts_array) + { + free(snap_ids); + return TDB_ERR_MEMORY; + } + + int sst_idx = 0; + + for (int level = start_level; level <= end_level; level++) + { + tidesdb_level_t *lvl = cf->levels[level]; + + /* we hold array_readers to prevent retire_array from freeing the array + * while we iterate -- a concurrent flush on L1 can swap the array and + * a second flush would free the one we loaded without this guard */ + atomic_fetch_add_explicit(&lvl->array_readers, 1, memory_order_acq_rel); + + const int num_ssts = atomic_load_explicit(&lvl->num_sstables, memory_order_acquire); + tidesdb_sstable_t **sstables = atomic_load_explicit(&lvl->sstables, memory_order_acquire); + + for (int i = 0; i < num_ssts; i++) + { + tidesdb_sstable_t *sst = sstables[i]; + if (!sst) continue; + + if (tidesdb_sst_in_snapshot_array(snap_ids, snap_count, sst->id)) + { + tidesdb_sstable_ref(sst); + ssts_array[sst_idx++] = sst; + } + } + + atomic_fetch_sub_explicit(&lvl->array_readers, 1, memory_order_release); + } + + free(snap_ids); + + *ssts_out = ssts_array; + *count_out = sst_idx; + return TDB_SUCCESS; +} + +/** + * tidesdb_add_ssts_to_merge_heap + * create merge sources from sstables and add to heap + * @param db the database + * @param ssts array of sstables + * @param count number of sstables + * @param heap the merge heap + * @param delete_queue queue to add sstables for later deletion + */ +static void tidesdb_add_ssts_to_merge_heap(tidesdb_t *db, tidesdb_column_family_t *cf, + tidesdb_sstable_t **ssts, const int count, + tidesdb_merge_heap_t *heap, queue_t *delete_queue) +{ + for (int i = 0; i < count; i++) + { + if (cf && tidesdb_cf_abort_requested(cf)) break; + tidesdb_sstable_t *sst = ssts[i]; + + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Creating merge source for SSTable %" 
PRIu64 " (num_klog_blocks=%" PRIu64 + ", klog_data_end_offset=%" PRIu64 ")", + sst->id, sst->num_klog_blocks, sst->klog_data_end_offset); + + tidesdb_merge_source_t *source = tidesdb_merge_source_from_sstable(db, sst); + if (source) + { + if (source->current_kv) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Added merge source for SSTable %" PRIu64, sst->id); + if (tidesdb_merge_heap_add_source(heap, source) != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Failed to add merge source for SSTable %" PRIu64 " to heap", + sst->id); + tidesdb_merge_source_free(source); + } + } + else + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Merge source for SSTable %" PRIu64 " has no current_kv, skipping", + sst->id); + tidesdb_merge_source_free(source); + } + } + else + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to create merge source for SSTable %" PRIu64, + sst->id); + } + + queue_enqueue(delete_queue, sst); + } +} + +/** + * tidesdb_cleanup_merged_sstables + * remove old sstables from levels and manifest after merge + * @param cf the column family + * @param delete_queue queue of sstables to delete + * @param start_level start level (0-indexed) + * @param end_level end level (0-indexed, inclusive) + */ +static void tidesdb_cleanup_merged_sstables(tidesdb_column_family_t *cf, queue_t *delete_queue, + const int start_level, const int end_level) +{ + const int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + const int total = queue_size(delete_queue); + if (total <= 0) return; + + /* we drain the queue into one array so each level's merged inputs are excised in a + * single atomic swap. removing them one at a time leaves a window where a level holds + * an input's older put without its tombstone, and a concurrent point get -- which + * stops at the first level that has the key -- returns that orphaned put, so a deleted + * key reappears until compaction settles. 
*/ + tidesdb_sstable_t **ssts = malloc((size_t)total * sizeof(tidesdb_sstable_t *)); + if (!ssts) + { + /* alloc failed -- last-resort one-at-a-time removal */ + while (!queue_is_empty(delete_queue)) + { + tidesdb_sstable_t *sst = queue_dequeue(delete_queue); + if (!sst) continue; + atomic_store_explicit(&sst->marked_for_deletion, 1, memory_order_release); + if (!tidesdb_cf_abort_requested(cf)) + { + for (int level = start_level; level <= end_level && level < num_levels; level++) + { + if (tidesdb_level_remove_sstable(cf->db, cf->levels[level], sst) == TDB_SUCCESS) + { + tidesdb_bump_sstable_layout_version(cf); + break; + } + } + } + tidesdb_sstable_unref(cf->db, sst); + } + return; + } + + int n = 0; + while (!queue_is_empty(delete_queue)) + { + tidesdb_sstable_t *sst = queue_dequeue(delete_queue); + if (sst) ssts[n++] = sst; + } + + for (int i = 0; i < n; i++) + atomic_store_explicit(&ssts[i]->marked_for_deletion, 1, memory_order_release); + + /* drop_column_family will sweep the cf directory shortly; skip the level/manifest work + * when the CF is on its way out, but still release our queue references below */ + int cleanup_commit_ok = 1; + if (!tidesdb_cf_abort_requested(cf)) + { + uint8_t *removed = calloc((size_t)n, 1); + int *removed_level = malloc((size_t)n * sizeof(int)); + if (removed && removed_level) + { + for (int i = 0; i < n; i++) removed_level[i] = -1; + + /* we remove input levels deepest-first. for any key, its tombstone input sits at + * a level shallower-or-equal to its older put input, so removing deep before + * shallow guarantees that whenever a put input is gone its tombstone input is + * still present (or the merged output is reachable) -- a concurrent get can never + * see the orphaned put alone. */ + int deepest = (end_level < num_levels - 1) ? 
end_level : num_levels - 1; + for (int level = deepest; level >= start_level; level--) + { + tidesdb_level_t *lvl = cf->levels[level]; + tidesdb_level_remove_sstables_batch(cf->db, lvl, ssts, n, removed); + for (int i = 0; i < n; i++) + { + if (removed[i] && removed_level[i] == -1) removed_level[i] = lvl->level_num; + } + } + + int any_removed = 0; + for (int i = 0; i < n; i++) + { + if (removed[i]) + { + any_removed = 1; + tidesdb_manifest_remove_sstable(cf->manifest, removed_level[i], ssts[i]->id); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "SSTable %" PRIu64 " not found in any level", + ssts[i]->id); + } + } + + if (any_removed) + { + tidesdb_bump_sstable_layout_version(cf); + if (tidesdb_manifest_commit(cf->manifest, cf->manifest->path) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to commit manifest after merge cleanup"); + cleanup_commit_ok = 0; + } + else + { + tdb_objstore_upload_manifest(cf->db, cf); + } + } + } + free(removed); + free(removed_level); + } + + /* if the cleanup commit failed the inputs are still in the persisted manifest, so keep + * their files on disk (clear the deletion mark before the final unref frees them) -- + * recovery loads them instead of finding the manifest reference an orphaned file. the + * merged output already covers the data; this only matters under sustained commit failure. */ + if (!cleanup_commit_ok) + for (int i = 0; i < n; i++) + atomic_store_explicit(&ssts[i]->marked_for_deletion, 0, memory_order_release); + + for (int i = 0; i < n; i++) tidesdb_sstable_unref(cf->db, ssts[i]); + free(ssts); +} + +/** + * tidesdb_subcompaction_t + * shared coordination state for running a single compaction round's independent partition + * sub-merges across multiple ephemeral helper threads. each partition is a disjoint key range + * with its own heap/output; workers steal partitions via next_partition and each calls + * run_partition, which performs that partition's commit under cf->compaction_commit_lock. 
+ * @param db database (for the helper-thread budget) + * @param merge_ctx opaque per-merge context passed to run_partition + * @param run_partition per-partition worker; returns TDB_SUCCESS or a hard error + * @param num_partitions number of partitions to process + * @param next_partition work-stealing cursor + * @param aborted set when a partition observes an external abort (e.g. CF drop) + * @param error first hard error code observed across partitions (TDB_SUCCESS = none) + */ +typedef struct +{ + tidesdb_t *db; + void *merge_ctx; + int (*run_partition)(void *merge_ctx, int partition); + int num_partitions; + _Atomic(int) next_partition; + _Atomic(int) aborted; + _Atomic(int) error; +} tidesdb_subcompaction_t; + +/** + * tidesdb_subcompaction_worker + * helper-thread body-- steal partition indices and run each until exhausted or aborted + */ +static void *tidesdb_subcompaction_worker(void *arg) +{ + tidesdb_subcompaction_t *sc = (tidesdb_subcompaction_t *)arg; + tdb_set_thread_name("tdb-subcompact"); + for (;;) + { + if (atomic_load_explicit(&sc->aborted, memory_order_acquire)) break; + const int p = atomic_fetch_add_explicit(&sc->next_partition, 1, memory_order_acq_rel); + if (p >= sc->num_partitions) break; + const int rc = sc->run_partition(sc->merge_ctx, p); + if (rc != TDB_SUCCESS) + { + int expected = TDB_SUCCESS; + atomic_compare_exchange_strong_explicit(&sc->error, &expected, rc, memory_order_acq_rel, + memory_order_relaxed); + } + } + return NULL; +} + +/** + * tidesdb_run_subcompactions + * run num_partitions independent partition merges concurrently. borrows up to + * (num_partitions - 1) helper threads from db->compaction_helper_budget (bounded so parallel + * rounds across CFs never oversubscribe the pool); the calling thread also works, so progress + * is guaranteed even when the budget is zero or pthread_create fails (work is stolen, never + * dropped). run_partition owns each partition's heap/output and commits under the CF lock. 
+ * @return the first hard error from any partition, or TDB_SUCCESS + */ +static int tidesdb_run_subcompactions(tidesdb_t *db, void *merge_ctx, + int (*run_partition)(void *, int), int num_partitions) +{ + if (num_partitions <= 0) return TDB_SUCCESS; + + tidesdb_subcompaction_t sc; + sc.db = db; + sc.merge_ctx = merge_ctx; + sc.run_partition = run_partition; + sc.num_partitions = num_partitions; + atomic_init(&sc.next_partition, 0); + atomic_init(&sc.aborted, 0); + atomic_init(&sc.error, TDB_SUCCESS); + + /* borrow helpers from the global budget; the calling thread is always an extra worker so we + * never need more than num_partitions - 1 helpers. a CAS loop claims whatever is available. */ + int want = num_partitions - 1; + int helpers = 0; + if (want > 0) + { + int avail = atomic_load_explicit(&db->compaction_helper_budget, memory_order_acquire); + while (avail > 0) + { + const int claim = (want < avail) ? want : avail; + if (atomic_compare_exchange_weak_explicit(&db->compaction_helper_budget, &avail, + avail - claim, memory_order_acq_rel, + memory_order_acquire)) + { + helpers = claim; + break; + } + } + } + + pthread_t *threads = (helpers > 0) ? 
malloc((size_t)helpers * sizeof(pthread_t)) : NULL; + int launched = 0; + for (int i = 0; threads && i < helpers; i++) + { + if (pthread_create(&threads[launched], NULL, tidesdb_subcompaction_worker, &sc) == 0) + launched++; + } + + /* the calling thread participates as a worker too -- guarantees forward progress */ + tidesdb_subcompaction_worker(&sc); + + for (int i = 0; i < launched; i++) pthread_join(threads[i], NULL); + free(threads); + + /* return exactly what we claimed (failed pthread_create leaves work to the stealers) */ + if (helpers > 0) + atomic_fetch_add_explicit(&db->compaction_helper_budget, helpers, memory_order_release); + + return atomic_load_explicit(&sc.error, memory_order_acquire); +} + +/** + * tidesdb_full_preemptive_ctx_t / _shard + * shared read-only context for a full preemptive merge's parallel shards (RocksDB-style + * subcompactions). the single-output merge is split into key-range shards whose boundaries are + * sampled from the input sstables' min keys; each shard builds its own heap from overlapping + * inputs, range-filters the merge, and writes its own output sstable at output_level. the commit + * is serialized on cf->compaction_commit_lock; per-merge teardown (input cleanup) runs once after + * the shards join. the btree branch writes the shard heap unfiltered, matching + * dividing/partitioned. 
+ */ +typedef struct +{ + tidesdb_column_family_t *cf; + int start_level; + int target_level; + int output_level; + int is_largest_level; + skip_list_comparator_fn comparator_fn; + void *comparator_ctx; + tidesdb_sstable_t **del_snap; + size_t del_snap_count; + uint8_t **boundaries; + size_t *boundary_sizes; + int num_boundaries; + uint64_t min_snapshot_seq; + queue_t *sstables_to_delete; + _Atomic(int) aborted; +} tidesdb_full_preemptive_ctx_t; + +static int tidesdb_full_preemptive_shard(void *vctx, int shard); + +/** + * tidesdb_full_preemptive_merge + * perform a full preemptive merge on the column family + * @param cf the column family + * @param start_level the shallowest input level (0-indexed) + * @param target_level the deepest input level (0-indexed) + * @param output_level the level the merged run is written to (0-indexed). + * normally equal to target_level; for a level-collapse + * merge it is one level shallower than target_level + * @return TDB_SUCCESS on success, TDB_ERR_INVALID_ARGS on failure + */ +static int tidesdb_full_preemptive_merge(tidesdb_column_family_t *cf, int start_level, + int target_level, int output_level) +{ + if (tidesdb_cf_abort_requested(cf)) return TDB_SUCCESS; + + int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + if (start_level < 0 || target_level >= num_levels || output_level < 0 || + output_level > target_level) + { + return TDB_ERR_INVALID_ARGS; + } + + /* we determine if we're merging into the largest (bottommost) level + * tombstones can only be dropped when merging into the largest level + * because there's no lower level that might contain the data being deleted */ + const int is_largest_level = (target_level == num_levels - 1); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Starting full preemptive merge on CF '%s', levels %d->%d", + cf->name, start_level + 1, target_level + 1); + + skip_list_comparator_fn comparator_fn = NULL; + void *comparator_ctx = NULL; + tidesdb_resolve_comparator(cf->db, 
&cf->config, &comparator_fn, &comparator_ctx); + + tidesdb_merge_heap_t *heap = tidesdb_merge_heap_create(comparator_fn, comparator_ctx); + if (!heap) return TDB_ERR_MEMORY; + + queue_t *sstables_to_delete = queue_new(); + if (!sstables_to_delete) + { + tidesdb_merge_heap_free(heap); + return TDB_ERR_MEMORY; + } + + queue_t *sstable_ids_snapshot = tidesdb_snapshot_sst_ids(cf, start_level, target_level); + if (!sstable_ids_snapshot) + { + tidesdb_merge_heap_free(heap); + queue_free(sstables_to_delete); + return TDB_ERR_MEMORY; + } + + if (queue_size(sstable_ids_snapshot) == 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "No SSTables to merge, skipping"); + tidesdb_merge_heap_free(heap); + queue_free(sstables_to_delete); + tidesdb_cleanup_snapshot_ids(sstable_ids_snapshot); + return TDB_SUCCESS; + } + + tidesdb_sstable_t **ssts_array = NULL; + int sst_count = 0; + int collect_result = tidesdb_collect_ssts_from_snapshot( + cf, start_level, target_level, sstable_ids_snapshot, &ssts_array, &sst_count); + if (collect_result != TDB_SUCCESS) + { + tidesdb_merge_heap_free(heap); + queue_free(sstables_to_delete); + tidesdb_cleanup_snapshot_ids(sstable_ids_snapshot); + return collect_result; + } + + /*** we prefetch input sstables in parallel when object store mode is active + ** and object_prefetch_compaction is enabled. this avoids serial on-demand + * downloads during merge source creation. */ + if (cf->db->object_store && cf->config.object_prefetch_compaction) + { + tdb_objstore_prefetch_sstables(cf->db, ssts_array, sst_count); + } + + /* sub-compaction sharding, the single output is split into key-range shards, + * each merged in parallel. boundaries come from the output level's existing sstable min keys + * (already sorted and non-overlapping -- the same source dividing_merge uses). an empty output + * level yields one shard, i.e. the original single-output behaviour with no regression. inputs + * are enqueued for cleanup and snapshotted into an array each shard reads from. 
*/ + for (int i = 0; i < sst_count; i++) queue_enqueue(sstables_to_delete, ssts_array[i]); + free(ssts_array); + tidesdb_merge_heap_free(heap); /* setup heap is unused -- each shard builds its own */ + + const size_t fp_del_count = queue_size(sstables_to_delete); + tidesdb_sstable_t **fp_del_snap = + malloc((fp_del_count ? fp_del_count : 1) * sizeof(tidesdb_sstable_t *)); + if (!fp_del_snap) + { + tidesdb_cleanup_merged_sstables(cf, sstables_to_delete, start_level, target_level); + queue_free(sstables_to_delete); + tidesdb_cleanup_snapshot_ids(sstable_ids_snapshot); + return TDB_ERR_MEMORY; + } + const size_t fp_del_n = queue_snapshot(sstables_to_delete, (void **)fp_del_snap, fp_del_count); + + int fp_num_boundaries = 0; + uint8_t **fp_boundaries = NULL; + size_t *fp_boundary_sizes = NULL; + { + tidesdb_level_t *out_lvl = cf->levels[output_level]; + atomic_fetch_add_explicit(&out_lvl->array_readers, 1, memory_order_acq_rel); + const int out_n = atomic_load_explicit(&out_lvl->num_sstables, memory_order_acquire); + tidesdb_sstable_t **out_ssts = + atomic_load_explicit(&out_lvl->sstables, memory_order_acquire); + if (out_n > 0) + { + fp_boundaries = malloc((size_t)out_n * sizeof(uint8_t *)); + fp_boundary_sizes = malloc((size_t)out_n * sizeof(size_t)); + } + if (fp_boundaries && fp_boundary_sizes) + { + /* boundaries are the output sstables' min keys, skipping the first so keys below it + * land in shard 0 (range_start = NULL). they MUST be strictly increasing to form a + * valid key-range partition -- the first disk level holds overlapping runs added in + * flush-completion order, not key order, so the array is not sorted. accept a min key + * only when it exceeds the last accepted boundary; the resulting monotonic subset is a + * coarser-but-always-correct tiling (shard 0's NULL start and the last shard's NULL end + * guarantee full coverage, so no key range is ever dropped). 
*/ + uint8_t *last_b = NULL; + size_t last_bsz = 0; + for (int i = 1; i < out_n; i++) + { + tidesdb_sstable_t *s = out_ssts[i]; + if (!s || !s->min_key || s->min_key_size == 0) continue; + if (last_b && comparator_fn(s->min_key, s->min_key_size, last_b, last_bsz, + comparator_ctx) <= 0) + continue; + fp_boundaries[fp_num_boundaries] = malloc(s->min_key_size); + if (fp_boundaries[fp_num_boundaries]) + { + memcpy(fp_boundaries[fp_num_boundaries], s->min_key, s->min_key_size); + fp_boundary_sizes[fp_num_boundaries] = s->min_key_size; + last_b = fp_boundaries[fp_num_boundaries]; + last_bsz = s->min_key_size; + fp_num_boundaries++; + } + } + } + atomic_fetch_sub_explicit(&out_lvl->array_readers, 1, memory_order_release); + } + + tidesdb_full_preemptive_ctx_t fctx; + fctx.cf = cf; + fctx.start_level = start_level; + fctx.target_level = target_level; + fctx.output_level = output_level; + fctx.is_largest_level = is_largest_level; + fctx.comparator_fn = comparator_fn; + fctx.comparator_ctx = comparator_ctx; + fctx.del_snap = fp_del_snap; + fctx.del_snap_count = fp_del_n; + fctx.boundaries = fp_boundaries; + fctx.boundary_sizes = fp_boundary_sizes; + fctx.num_boundaries = fp_num_boundaries; + fctx.min_snapshot_seq = 0; + fctx.sstables_to_delete = sstables_to_delete; + atomic_init(&fctx.aborted, 0); + + /* run the shards across the sub-compaction helper pool (calling thread works too); each shard + * commits its own output under cf->compaction_commit_lock */ + tidesdb_run_subcompactions(cf->db, &fctx, tidesdb_full_preemptive_shard, fp_num_boundaries + 1); + + const int fp_aborted = atomic_load_explicit(&fctx.aborted, memory_order_acquire); + + for (int i = 0; i < fp_num_boundaries; i++) free(fp_boundaries[i]); + free(fp_boundaries); + free(fp_boundary_sizes); + free(fp_del_snap); + + if (fp_aborted) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' aborting full preemptive merge", cf->name); + while (!queue_is_empty(sstables_to_delete)) + { + tidesdb_sstable_t *sst = 
queue_dequeue(sstables_to_delete); + if (sst) tidesdb_sstable_unref(cf->db, sst); + } + queue_free(sstables_to_delete); + tidesdb_cleanup_snapshot_ids(sstable_ids_snapshot); + return TDB_SUCCESS; + } + + tidesdb_cleanup_merged_sstables(cf, sstables_to_delete, start_level, target_level); + queue_free(sstables_to_delete); + tidesdb_cleanup_snapshot_ids(sstable_ids_snapshot); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Full preemptive merge complete for CF '%s'", cf->name); + return TDB_SUCCESS; +} + +/** + * tidesdb_full_preemptive_shard + * one key-range shard of a full preemptive merge (see ctx doc above). builds a heap from the + * inputs overlapping its range, then runs the original single-output merge body range-filtered, + * writing one output sstable at output_level. wrapped in do/while(0), a top-level break/continue + * skips this shard; the abort paths set the shared aborted flag. + */ +static int tidesdb_full_preemptive_shard(void *vctx, int shard) +{ + tidesdb_full_preemptive_ctx_t *c = (tidesdb_full_preemptive_ctx_t *)vctx; + tidesdb_column_family_t *cf = c->cf; + const int start_level = c->start_level; + const int target_level = c->target_level; + const int output_level = c->output_level; + const int is_largest_level = c->is_largest_level; + skip_list_comparator_fn comparator_fn = c->comparator_fn; + void *comparator_ctx = c->comparator_ctx; + tidesdb_sstable_t **del_snap = c->del_snap; + const size_t del_snap_count = c->del_snap_count; + uint8_t **boundaries = c->boundaries; + size_t *boundary_sizes = c->boundary_sizes; + const int num_boundaries = c->num_boundaries; + queue_t *sstables_to_delete = c->sstables_to_delete; + int aborted = 0; + (void)start_level; + (void)target_level; + + do + { + if (tidesdb_cf_abort_requested(cf)) + { + aborted = 1; + break; + } + + uint8_t *range_start = (shard > 0) ? boundaries[shard - 1] : NULL; + size_t range_start_size = (shard > 0) ? boundary_sizes[shard - 1] : 0; + uint8_t *range_end = (shard < num_boundaries) ? 
boundaries[shard] : NULL; + size_t range_end_size = (shard < num_boundaries) ? boundary_sizes[shard] : 0; + + tidesdb_merge_heap_t *heap = tidesdb_merge_heap_create(comparator_fn, comparator_ctx); + if (!heap) break; + + uint64_t estimated_entries = 0; + for (size_t i = 0; i < del_snap_count; i++) + { + tidesdb_sstable_t *sst = del_snap[i]; + if (!sst) continue; + int overlaps = 1; + if (range_start && comparator_fn(sst->max_key, sst->max_key_size, range_start, + range_start_size, comparator_ctx) < 0) + overlaps = 0; + if (overlaps && range_end && + comparator_fn(sst->min_key, sst->min_key_size, range_end, range_end_size, + comparator_ctx) >= 0) + overlaps = 0; + if (overlaps) + { + tidesdb_merge_source_t *source = tidesdb_merge_source_from_sstable(cf->db, sst); + if (source) + { + if (source->current_kv && + tidesdb_merge_heap_add_source(heap, source) == TDB_SUCCESS) + estimated_entries += sst->num_entries; + else + tidesdb_merge_source_free(source); + } + } + } + if (estimated_entries < TDB_MERGE_MIN_ESTIMATED_ENTRIES) + estimated_entries = TDB_MERGE_MIN_ESTIMATED_ENTRIES; + + if (tidesdb_merge_heap_empty(heap)) + { + tidesdb_merge_heap_free(heap); + break; + } + + uint64_t new_id = atomic_fetch_add(&cf->next_sstable_id, 1); + char path[MAX_FILE_PATH_LENGTH]; + snprintf(path, sizeof(path), + "%s" PATH_SEPARATOR TDB_LEVEL_PREFIX "%d" TDB_LEVEL_PARTITION_PREFIX "%d", + cf->directory, output_level + 1, shard); + + tidesdb_sstable_t *new_sst = tidesdb_sstable_create(cf->db, path, new_id, &cf->config); + if (!new_sst) + { + tidesdb_merge_heap_free(heap); + break; + } + + block_manager_t *klog_bm = NULL; + block_manager_t *vlog_bm = NULL; + if (tidesdb_bm_open(cf->db, &klog_bm, new_sst->klog_path, + convert_sync_mode(cf->config.sync_mode == TDB_SYNC_INTERVAL + ? TDB_SYNC_FULL + : cf->config.sync_mode)) != 0 || + tidesdb_bm_open(cf->db, &vlog_bm, new_sst->vlog_path, + convert_sync_mode(cf->config.sync_mode == TDB_SYNC_INTERVAL + ? 
TDB_SYNC_FULL + : cf->config.sync_mode)) != 0) + { + if (klog_bm) block_manager_close(klog_bm); + if (vlog_bm) block_manager_close(vlog_bm); + tidesdb_sstable_unref(cf->db, new_sst); + tidesdb_merge_heap_free(heap); + aborted = 1; + break; + } + + bloom_filter_t *bloom = NULL; + tidesdb_block_index_t *block_indexes = NULL; + + if (new_sst->config->enable_bloom_filter) + { + if (bloom_filter_new(&bloom, new_sst->config->bloom_fpr, (int)estimated_entries) == 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Bloom filter created (estimated entries: %" PRIu64 ")", + estimated_entries); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Bloom filter creation failed"); + bloom = NULL; + } + } + else + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Bloom filter disabled"); + } + + if (new_sst->config->enable_block_indexes && !cf->config.use_btree) + { + block_indexes = compact_block_index_create(estimated_entries, + new_sst->config->block_index_prefix_len, + comparator_fn, comparator_ctx); + if (block_indexes) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Block indexes created"); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Block indexes builder creation failed"); + } + } + else + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Block indexes disabled"); + } + + /* we branch to btree output if use_btree is enabled */ + if (cf->config.use_btree) + { + int btree_result = tidesdb_sstable_write_from_heap_btree( + cf, new_sst, heap, klog_bm, vlog_bm, bloom, sstables_to_delete, is_largest_level); + block_manager_close(klog_bm); + block_manager_close(vlog_bm); + tidesdb_merge_heap_free(heap); + + if (btree_result != TDB_SUCCESS) + { + /* mark so sstable_free unlinks the partial klog/vlog files */ + atomic_store_explicit(&new_sst->marked_for_deletion, 1, memory_order_release); + tidesdb_sstable_unref(cf->db, new_sst); + aborted = 1; + break; + } + + bloom = NULL; + goto merge_complete; + } + + tidesdb_klog_block_t *current_klog_block = tidesdb_klog_block_create(); + + uint64_t klog_block_num = 0; + uint64_t vlog_block_num = 0; + 
uint64_t max_seq = 0; + + /* we track first and last key of current block for block index */ + uint8_t *block_first_key = NULL; + size_t block_first_key_size = 0; + uint8_t *block_last_key = NULL; + size_t block_last_key_size = 0; + + /* snapshot floor -- see tidesdb_sstable_write_from_heap_btree for rationale */ + const uint64_t min_snapshot_seq = tidesdb_min_active_snapshot_seq(cf->db); + + /**** single-step lookahead in which we buffer the pending first-for-key entry so a + *** put+single-delete pair detected in the same merge input cancels together + ** at any level instead of carrying the tombstone forward. same-key dedup, + * largest-level tombstone drop, and ttl drop fire when pending resolves. */ + tidesdb_kv_pair_t *pending = NULL; + int pending_is_single_delete = 0; + int pending_sd_paired_with_put = 0; + + /* merge using heap */ + while (!tidesdb_merge_heap_empty(heap) || pending != NULL) + { + if (tidesdb_cf_abort_requested(cf)) + { + aborted = 1; + break; + } + + tidesdb_kv_pair_t *kv = NULL; + + if (!tidesdb_merge_heap_empty(heap)) + { + tidesdb_sstable_t *corrupted_sst = NULL; + kv = tidesdb_merge_heap_pop(heap, &corrupted_sst); + + /* if corruption detected, add to deletion queue */ + if (corrupted_sst) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Detected corrupted SSTable %" PRIu64 ", marking for deletion", + corrupted_sst->id); + /* shared cleanup queue -- guard against concurrent shards */ + pthread_mutex_lock(&cf->compaction_commit_lock); + queue_enqueue(sstables_to_delete, corrupted_sst); + pthread_mutex_unlock(&cf->compaction_commit_lock); + } + } + + /* range filter -- this shard only writes keys in [range_start, range_end). a filtered + * key cannot pair with pending (pending is in range), matching dividing_merge. 
*/ + if (kv) + { + if (range_start && comparator_fn(kv->key, kv->entry.key_size, range_start, + range_start_size, comparator_ctx) < 0) + { + tidesdb_kv_pair_free(kv); + continue; + } + if (range_end && comparator_fn(kv->key, kv->entry.key_size, range_end, + range_end_size, comparator_ctx) >= 0) + { + tidesdb_kv_pair_free(kv); + continue; + } + } + + if (kv && pending && pending->entry.key_size == kv->entry.key_size && + memcmp(pending->key, kv->key, pending->entry.key_size) == 0 && + pending->entry.seq <= min_snapshot_seq) + { + /* older same-key version -- drop silently. we record whether the + * trailing version is a live put so a pending single-delete can + * pair-cancel with it when we resolve pending. */ + if (pending_is_single_delete && !(kv->entry.flags & TDB_KV_FLAG_TOMBSTONE)) + { + pending_sd_paired_with_put = 1; + } + tidesdb_kv_pair_free(kv); + continue; + } + + /* new key arrived (or heap exhausted) -- decide the fate of pending */ + if (pending) + { + const int sd_pair_drop = pending_is_single_delete && pending_sd_paired_with_put; + const int tombstone_drop = (pending->entry.flags & TDB_KV_FLAG_TOMBSTONE) && + is_largest_level && + pending->entry.seq <= min_snapshot_seq; + const int ttl_drop = + pending->entry.ttl > 0 && + pending->entry.ttl < + atomic_load_explicit(&cf->db->cached_current_time, memory_order_relaxed); + + if (!sd_pair_drop && !tombstone_drop && !ttl_drop) + { + if (pending->entry.value_size >= cf->config.klog_value_threshold && + pending->value) + { + /* we write value directly to vlog */ + uint8_t *final_data = pending->value; + size_t final_size = pending->entry.value_size; + uint8_t *compressed = NULL; + + if (new_sst->config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + compressed = compress_data(pending->value, pending->entry.value_size, + &compressed_size, + new_sst->config->compression_algorithm); + if (compressed) + { + final_data = compressed; + final_size = compressed_size; + } + } + + 
block_manager_block_t *vlog_block = + block_manager_block_create(final_size, final_data); + if (vlog_block) + { + int64_t block_offset = block_manager_block_write(vlog_bm, vlog_block); + if (block_offset >= 0) + { + pending->entry.vlog_offset = (uint64_t)block_offset; + vlog_block_num++; + } + block_manager_block_release(vlog_block); + } + free(compressed); + } + + /* we check if this is the first entry in a new block */ + int is_first_entry_in_block = (current_klog_block->num_entries == 0); + + tidesdb_klog_block_add_entry(current_klog_block, pending, &cf->config, + comparator_fn, comparator_ctx); + + /* we track first key of block */ + if (is_first_entry_in_block) + { + free(block_first_key); + block_first_key = malloc(pending->entry.key_size); + if (block_first_key) + { + memcpy(block_first_key, pending->key, pending->entry.key_size); + block_first_key_size = pending->entry.key_size; + } + } + + /* we always update last key of block */ + free(block_last_key); + block_last_key = malloc(pending->entry.key_size); + if (block_last_key) + { + memcpy(block_last_key, pending->key, pending->entry.key_size); + block_last_key_size = pending->entry.key_size; + } + + if (tidesdb_klog_block_is_full(current_klog_block, TDB_KLOG_BLOCK_SIZE)) + { + uint8_t *klog_data; + size_t klog_size; + if (tidesdb_klog_block_serialize(current_klog_block, &klog_data, + &klog_size) == 0) + { + uint8_t *final_data = klog_data; + size_t final_size = klog_size; + + if (cf->config.compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + uint8_t *compressed = + compress_data(klog_data, klog_size, &compressed_size, + cf->config.compression_algorithm); + if (compressed) + { + free(klog_data); + final_data = compressed; + final_size = compressed_size; + } + } + + block_manager_block_t *klog_block = + block_manager_block_create(final_size, final_data); + if (klog_block) + { + uint64_t block_file_position = + atomic_load(&klog_bm->current_file_size); + 
block_manager_block_write(klog_bm, klog_block); + block_manager_block_release(klog_block); + + if (block_indexes && block_first_key && block_last_key) + { + if (klog_block_num % cf->config.index_sample_ratio == 0) + { + compact_block_index_add(block_indexes, block_first_key, + block_first_key_size, + block_last_key, block_last_key_size, + block_file_position); + } + } + + klog_block_num++; + } + free(final_data); + } + + tidesdb_klog_block_reset(current_klog_block); + + free(block_first_key); + free(block_last_key); + block_first_key = NULL; + block_last_key = NULL; + } + + if (pending->entry.seq > max_seq) + { + max_seq = pending->entry.seq; + } + + if (bloom) + { + bloom_filter_add(bloom, pending->key, pending->entry.key_size); + } + + if (!new_sst->min_key) + { + new_sst->min_key = malloc(pending->entry.key_size); + if (new_sst->min_key) + { + memcpy(new_sst->min_key, pending->key, pending->entry.key_size); + new_sst->min_key_size = pending->entry.key_size; + } + } + + free(new_sst->max_key); + new_sst->max_key = malloc(pending->entry.key_size); + if (new_sst->max_key) + { + memcpy(new_sst->max_key, pending->key, pending->entry.key_size); + new_sst->max_key_size = pending->entry.key_size; + } + + new_sst->num_entries++; + if (pending->entry.flags & TDB_KV_FLAG_TOMBSTONE) new_sst->tombstone_count++; + } + + tidesdb_kv_pair_free(pending); + pending = NULL; + } + + if (!kv) break; + + pending = kv; + pending_is_single_delete = (kv->entry.flags & TDB_KV_FLAG_SINGLE_DELETE) != 0; + pending_sd_paired_with_put = 0; + } + + if (aborted) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' aborting full preemptive merge for SSTable %" PRIu64, cf->name, + new_sst->id); + if (pending) tidesdb_kv_pair_free(pending); + tidesdb_klog_block_free(current_klog_block); + free(block_first_key); + free(block_last_key); + if (bloom) bloom_filter_free(bloom); + if (block_indexes) compact_block_index_free(block_indexes); + tidesdb_merge_heap_free(heap); + if (klog_bm) 
block_manager_close(klog_bm); + if (vlog_bm) block_manager_close(vlog_bm); + remove(new_sst->klog_path); + remove(new_sst->vlog_path); + tidesdb_sstable_unref(cf->db, new_sst); + break; /* per-merge teardown happens once after the shards join */ + } + + new_sst->max_seq = max_seq; + + if (current_klog_block->num_entries > 0) + { + uint8_t *klog_data; + size_t klog_size; + if (tidesdb_klog_block_serialize(current_klog_block, &klog_data, &klog_size) == 0) + { + uint8_t *final_data = klog_data; + size_t final_size = klog_size; + + if (cf->config.compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + uint8_t *compressed = compress_data(klog_data, klog_size, &compressed_size, + cf->config.compression_algorithm); + if (compressed) + { + free(klog_data); + final_data = compressed; + final_size = compressed_size; + } + } + + block_manager_block_t *klog_block = + block_manager_block_create(final_size, final_data); + if (klog_block) + { + uint64_t block_file_position = atomic_load(&klog_bm->current_file_size); + block_manager_block_write(klog_bm, klog_block); + block_manager_block_release(klog_block); + + if (block_indexes && block_first_key && block_last_key) + { + if (klog_block_num % cf->config.index_sample_ratio == 0) + { + compact_block_index_add(block_indexes, block_first_key, + block_first_key_size, block_last_key, + block_last_key_size, block_file_position); + } + } + + klog_block_num++; + } + free(final_data); + } + } + + free(block_first_key); + free(block_last_key); + + tidesdb_klog_block_free(current_klog_block); + + new_sst->num_klog_blocks = klog_block_num; + new_sst->num_vlog_blocks = vlog_block_num; + + block_manager_get_size(klog_bm, &new_sst->klog_data_end_offset); + + /* we write auxiliary structures (always write, even if empty, to maintain consistent file + * structure) */ + if (new_sst->num_entries > 0) + { + /* write index + bloom footer blobs (chunk-aware, shared helper) */ + tidesdb_sstable_write_footer_aux(new_sst, klog_bm, 
block_indexes, bloom, 1); + block_indexes = NULL; + bloom = NULL; + } + + /* we get file sizes before metadata write for serialization */ + uint64_t klog_size_before_metadata; + uint64_t vlog_size_before_metadata; + block_manager_get_size(klog_bm, &klog_size_before_metadata); + block_manager_get_size(vlog_bm, &vlog_size_before_metadata); + + new_sst->klog_size = klog_size_before_metadata; + new_sst->vlog_size = vlog_size_before_metadata; + + /* we write metadata block as the last block -- only if we have entries */ + uint8_t *metadata_data = NULL; + size_t metadata_size = 0; + if (new_sst->num_entries > 0 && + sstable_metadata_serialize(new_sst, &metadata_data, &metadata_size) == 0) + { + block_manager_block_t *metadata_block = + block_manager_block_create(metadata_size, metadata_data); + if (metadata_block) + { + block_manager_block_write(klog_bm, metadata_block); + block_manager_block_release(metadata_block); + } + free(metadata_data); + } + + block_manager_get_size(klog_bm, &new_sst->klog_size); + block_manager_get_size(vlog_bm, &new_sst->vlog_size); + + tidesdb_merge_heap_free(heap); + + block_manager_escalate_fsync(klog_bm); + block_manager_escalate_fsync(vlog_bm); + + new_sst->klog_bm = klog_bm; + new_sst->vlog_bm = vlog_bm; + atomic_store(&new_sst->last_access_time, + atomic_load_explicit(&cf->db->cached_current_time, memory_order_relaxed)); + + /* we ensure all writes are visible before making sstable discoverable */ + atomic_thread_fence(memory_order_seq_cst); + + /****** we close write handles before adding to level + ***** readers will reopen files on-demand through tidesdb_sstable_ensure_open + **** this prevents file locking issues where readers try to open files + *** that are still open for writing + ** note -- we do not increment num_open_sstables here because we close + * immediately -- ensure_open will increment when a reader reopens */ + if (klog_bm) + { + block_manager_close(klog_bm); + new_sst->klog_bm = NULL; + } + if (vlog_bm) + { + 
block_manager_close(vlog_bm); + new_sst->vlog_bm = NULL; + } + + merge_complete:; + /* we save metadata for logging before potentially freeing sstable */ + const uint64_t sst_id = new_sst->id; + const uint64_t num_entries = new_sst->num_entries; + + /* drop_column_family marked us after the inner loop finished -- skip publishing the + * merged sstable; remove() drops the half-written files we already created on disk and the + * post-join teardown unrefs inputs without touching the manifest */ + if (tidesdb_cf_abort_requested(cf)) + { + if (bloom) bloom_filter_free(bloom); + if (block_indexes) compact_block_index_free(block_indexes); + remove(new_sst->klog_path); + remove(new_sst->vlog_path); + tidesdb_sstable_unref(cf->db, new_sst); + aborted = 1; + break; + } + + /* we only add sstable if it has entries -- empty sstables cause corruption */ + if (num_entries > 0) + { + /* we reload num_levels as DCA may have changed it */ + int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + /* we find the output level by level_num, not by stale array index */ + int target_level_num = output_level + 1; + int target_idx = -1; + for (int i = 0; i < num_levels; i++) + { + if (cf->levels[i]->level_num == target_level_num) + { + target_idx = i; + break; + } + } + + if (target_idx < 0 || target_idx >= num_levels) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Target level %d not found (current_num_levels=%d)", + target_level_num, num_levels); + /* the merge output cannot be published -- mark it so sstable_free + * unlinks the klog/vlog files instead of orphaning them on disk for + * recovery to find as an sstable that is not in the manifest */ + atomic_store_explicit(&new_sst->marked_for_deletion, 1, memory_order_release); + tidesdb_sstable_unref(cf->db, new_sst); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Adding merged SSTable %" PRIu64 " to level %d (array index %d)", + new_sst->id, cf->levels[target_idx]->level_num, target_idx); + /* commit serialized 
across shards (shared level array + manifest) */ + pthread_mutex_lock(&cf->compaction_commit_lock); + tidesdb_level_add_sstable(cf->levels[target_idx], new_sst); + tidesdb_bump_sstable_layout_version(cf); + + tidesdb_manifest_add_sstable(cf->manifest, cf->levels[target_idx]->level_num, + new_sst->id, new_sst->num_entries, + new_sst->klog_size + new_sst->vlog_size); + atomic_store(&cf->manifest->sequence, atomic_load(&cf->next_sstable_id)); + int manifest_result = tidesdb_manifest_commit(cf->manifest, cf->manifest->path); + if (manifest_result != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Failed to commit manifest for new SSTable %" PRIu64 + " (error: %d)", + new_sst->id, manifest_result); + } + + /** we upload manifest to object store so replicas and cold-start nodes + * can see the new sstable before old inputs are cleaned up */ + tdb_objstore_upload_manifest(cf->db, cf); + pthread_mutex_unlock(&cf->compaction_commit_lock); + + tidesdb_sstable_unref(cf->db, new_sst); + } + } + else + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Skipping empty SSTable %" PRIu64 " (0 entries)", sst_id); + if (bloom) bloom_filter_free(bloom); + if (block_indexes) compact_block_index_free(block_indexes); + remove(new_sst->klog_path); + remove(new_sst->vlog_path); + tidesdb_sstable_unref(cf->db, new_sst); + } + } while (0); + + if (aborted) atomic_store_explicit(&c->aborted, 1, memory_order_release); + return TDB_SUCCESS; +} + +/** + * tidesdb_targeted_merge + * merge a caller supplied set of sstables into a single output at target_level. + * inputs come pre-refed by the caller; ownership transfers to the merge so the + * cleanup queue releases the refs after the new sstable is published. the merge + * loop body is the same single-step lookahead used by full preemptive merge, with + * same-key dedup, single-delete pair-cancel, largest-level tombstone drop, and + * ttl drop preserved unchanged. 
+ * + * @param cf the column family + * @param inputs array of sstables to merge (caller transfers ownership of refs) + * @param input_count number of input sstables + * @param min_input_level smallest 0-indexed level any input lives in + * @param max_input_level largest 0-indexed level any input lives in + * @param target_level 0-indexed level to write output to + * @return TDB_SUCCESS on success, error code on failure + */ +static int tidesdb_targeted_merge(tidesdb_column_family_t *cf, tidesdb_sstable_t **inputs, + int input_count, int min_input_level, int max_input_level, + int target_level) +{ + if (!cf || !inputs || input_count <= 0) return TDB_ERR_INVALID_ARGS; + if (min_input_level < 0 || max_input_level < min_input_level) return TDB_ERR_INVALID_ARGS; + if (target_level < min_input_level) return TDB_ERR_INVALID_ARGS; + if (tidesdb_cf_abort_requested(cf)) return TDB_SUCCESS; + + int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + if (target_level >= num_levels) return TDB_ERR_INVALID_ARGS; + + const int is_largest_level = (target_level == num_levels - 1); + + /* snapshot floor -- see tidesdb_sstable_write_from_heap_btree for rationale */ + const uint64_t min_snapshot_seq = tidesdb_min_active_snapshot_seq(cf->db); + + TDB_DEBUG_LOG( + TDB_LOG_INFO, "Starting targeted merge on CF '%s', %d inputs across levels %d..%d into %d", + cf->name, input_count, min_input_level + 1, max_input_level + 1, target_level + 1); + + skip_list_comparator_fn comparator_fn = NULL; + void *comparator_ctx = NULL; + tidesdb_resolve_comparator(cf->db, &cf->config, &comparator_fn, &comparator_ctx); + + tidesdb_merge_heap_t *heap = tidesdb_merge_heap_create(comparator_fn, comparator_ctx); + if (!heap) return TDB_ERR_MEMORY; + + queue_t *sstables_to_delete = queue_new(); + if (!sstables_to_delete) + { + tidesdb_merge_heap_free(heap); + return TDB_ERR_MEMORY; + } + + if (cf->db->object_store && cf->config.object_prefetch_compaction) + { + 
tdb_objstore_prefetch_sstables(cf->db, inputs, input_count); + } + + tidesdb_add_ssts_to_merge_heap(cf->db, cf, inputs, input_count, heap, sstables_to_delete); + + uint64_t new_id = atomic_fetch_add(&cf->next_sstable_id, 1); + char path[MAX_FILE_PATH_LENGTH]; + snprintf(path, sizeof(path), "%s" PATH_SEPARATOR TDB_LEVEL_PREFIX "%d", cf->directory, + target_level + 1); + + tidesdb_sstable_t *new_sst = tidesdb_sstable_create(cf->db, path, new_id, &cf->config); + if (!new_sst) + { + tidesdb_merge_heap_free(heap); + tidesdb_cleanup_merged_sstables(cf, sstables_to_delete, min_input_level, max_input_level); + queue_free(sstables_to_delete); + return TDB_ERR_MEMORY; + } + + block_manager_t *klog_bm = NULL; + block_manager_t *vlog_bm = NULL; + + if (block_manager_open(&klog_bm, new_sst->klog_path, + convert_sync_mode(cf->config.sync_mode == TDB_SYNC_INTERVAL + ? TDB_SYNC_FULL + : cf->config.sync_mode)) != 0) + { + /* mark so sstable_free unlinks any klog file the failed open created */ + atomic_store_explicit(&new_sst->marked_for_deletion, 1, memory_order_release); + tidesdb_sstable_unref(cf->db, new_sst); + tidesdb_merge_heap_free(heap); + tidesdb_cleanup_merged_sstables(cf, sstables_to_delete, min_input_level, max_input_level); + queue_free(sstables_to_delete); + return TDB_ERR_IO; + } + + if (block_manager_open(&vlog_bm, new_sst->vlog_path, + convert_sync_mode(cf->config.sync_mode == TDB_SYNC_INTERVAL + ? 
TDB_SYNC_FULL + : cf->config.sync_mode)) != 0) + { + block_manager_close(klog_bm); + /* mark so sstable_free unlinks the klog file the successful open created */ + atomic_store_explicit(&new_sst->marked_for_deletion, 1, memory_order_release); + tidesdb_sstable_unref(cf->db, new_sst); + tidesdb_merge_heap_free(heap); + tidesdb_cleanup_merged_sstables(cf, sstables_to_delete, min_input_level, max_input_level); + queue_free(sstables_to_delete); + return TDB_ERR_IO; + } + + /* sum from the input list directly rather than rescanning levels */ + uint64_t estimated_entries = 0; + for (int i = 0; i < input_count; i++) + { + if (inputs[i]) estimated_entries += inputs[i]->num_entries; + } + if (estimated_entries < TDB_MERGE_MIN_ESTIMATED_ENTRIES) + estimated_entries = TDB_MERGE_MIN_ESTIMATED_ENTRIES; + + bloom_filter_t *bloom = NULL; + tidesdb_block_index_t *block_indexes = NULL; + + if (new_sst->config->enable_bloom_filter) + { + if (bloom_filter_new(&bloom, new_sst->config->bloom_fpr, (int)estimated_entries) != 0) + { + bloom = NULL; + } + } + + if (new_sst->config->enable_block_indexes && !cf->config.use_btree) + { + block_indexes = + compact_block_index_create(estimated_entries, new_sst->config->block_index_prefix_len, + comparator_fn, comparator_ctx); + } + + if (cf->config.use_btree) + { + int btree_result = tidesdb_sstable_write_from_heap_btree( + cf, new_sst, heap, klog_bm, vlog_bm, bloom, sstables_to_delete, is_largest_level); + block_manager_close(klog_bm); + block_manager_close(vlog_bm); + tidesdb_merge_heap_free(heap); + + if (btree_result != TDB_SUCCESS) + { + /* mark so sstable_free unlinks the partial klog/vlog files */ + atomic_store_explicit(&new_sst->marked_for_deletion, 1, memory_order_release); + tidesdb_sstable_unref(cf->db, new_sst); + tidesdb_cleanup_merged_sstables(cf, sstables_to_delete, min_input_level, + max_input_level); + queue_free(sstables_to_delete); + return btree_result; + } + + bloom = NULL; + goto merge_complete; + } + + 
tidesdb_klog_block_t *current_klog_block = tidesdb_klog_block_create(); + + uint64_t klog_block_num = 0; + uint64_t vlog_block_num = 0; + uint64_t max_seq = 0; + + uint8_t *block_first_key = NULL; + size_t block_first_key_size = 0; + uint8_t *block_last_key = NULL; + size_t block_last_key_size = 0; + + tidesdb_kv_pair_t *pending = NULL; + int pending_is_single_delete = 0; + int pending_sd_paired_with_put = 0; + int aborted = 0; + + while (!tidesdb_merge_heap_empty(heap) || pending != NULL) + { + if (tidesdb_cf_abort_requested(cf)) + { + aborted = 1; + break; + } + + tidesdb_kv_pair_t *kv = NULL; + + if (!tidesdb_merge_heap_empty(heap)) + { + tidesdb_sstable_t *corrupted_sst = NULL; + kv = tidesdb_merge_heap_pop(heap, &corrupted_sst); + + if (corrupted_sst) + { + queue_enqueue(sstables_to_delete, corrupted_sst); + } + } + + if (kv && pending && pending->entry.key_size == kv->entry.key_size && + memcmp(pending->key, kv->key, pending->entry.key_size) == 0 && + pending->entry.seq <= min_snapshot_seq) + { + if (pending_is_single_delete && !(kv->entry.flags & TDB_KV_FLAG_TOMBSTONE)) + { + pending_sd_paired_with_put = 1; + } + tidesdb_kv_pair_free(kv); + continue; + } + + if (pending) + { + const int sd_pair_drop = pending_is_single_delete && pending_sd_paired_with_put; + const int tombstone_drop = (pending->entry.flags & TDB_KV_FLAG_TOMBSTONE) && + is_largest_level && pending->entry.seq <= min_snapshot_seq; + const int ttl_drop = + pending->entry.ttl > 0 && + pending->entry.ttl < + atomic_load_explicit(&cf->db->cached_current_time, memory_order_relaxed); + + if (!sd_pair_drop && !tombstone_drop && !ttl_drop) + { + if (pending->entry.value_size >= cf->config.klog_value_threshold && pending->value) + { + uint8_t *final_data = pending->value; + size_t final_size = pending->entry.value_size; + uint8_t *compressed = NULL; + + if (new_sst->config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + compressed = + compress_data(pending->value, 
pending->entry.value_size, + &compressed_size, new_sst->config->compression_algorithm); + if (compressed) + { + final_data = compressed; + final_size = compressed_size; + } + } + + block_manager_block_t *vlog_block = + block_manager_block_create(final_size, final_data); + if (vlog_block) + { + int64_t block_offset = block_manager_block_write(vlog_bm, vlog_block); + if (block_offset >= 0) + { + pending->entry.vlog_offset = (uint64_t)block_offset; + vlog_block_num++; + } + block_manager_block_release(vlog_block); + } + free(compressed); + } + + int is_first_entry_in_block = (current_klog_block->num_entries == 0); + + tidesdb_klog_block_add_entry(current_klog_block, pending, &cf->config, + comparator_fn, comparator_ctx); + + if (is_first_entry_in_block) + { + free(block_first_key); + block_first_key = malloc(pending->entry.key_size); + if (block_first_key) + { + memcpy(block_first_key, pending->key, pending->entry.key_size); + block_first_key_size = pending->entry.key_size; + } + } + + free(block_last_key); + block_last_key = malloc(pending->entry.key_size); + if (block_last_key) + { + memcpy(block_last_key, pending->key, pending->entry.key_size); + block_last_key_size = pending->entry.key_size; + } + + if (tidesdb_klog_block_is_full(current_klog_block, TDB_KLOG_BLOCK_SIZE)) + { + uint8_t *klog_data; + size_t klog_size; + if (tidesdb_klog_block_serialize(current_klog_block, &klog_data, &klog_size) == + 0) + { + uint8_t *final_data = klog_data; + size_t final_size = klog_size; + + if (cf->config.compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + uint8_t *compressed = + compress_data(klog_data, klog_size, &compressed_size, + cf->config.compression_algorithm); + if (compressed) + { + free(klog_data); + final_data = compressed; + final_size = compressed_size; + } + } + + block_manager_block_t *klog_block = + block_manager_block_create(final_size, final_data); + if (klog_block) + { + uint64_t block_file_position = 
atomic_load(&klog_bm->current_file_size); + block_manager_block_write(klog_bm, klog_block); + block_manager_block_release(klog_block); + + if (block_indexes && block_first_key && block_last_key) + { + if (klog_block_num % cf->config.index_sample_ratio == 0) + { + compact_block_index_add( + block_indexes, block_first_key, block_first_key_size, + block_last_key, block_last_key_size, block_file_position); + } + } + + klog_block_num++; + } + free(final_data); + } + + tidesdb_klog_block_reset(current_klog_block); + + free(block_first_key); + free(block_last_key); + block_first_key = NULL; + block_last_key = NULL; + } + + if (pending->entry.seq > max_seq) max_seq = pending->entry.seq; + + if (bloom) bloom_filter_add(bloom, pending->key, pending->entry.key_size); + + if (!new_sst->min_key) + { + new_sst->min_key = malloc(pending->entry.key_size); + if (new_sst->min_key) + { + memcpy(new_sst->min_key, pending->key, pending->entry.key_size); + new_sst->min_key_size = pending->entry.key_size; + } + } + + free(new_sst->max_key); + new_sst->max_key = malloc(pending->entry.key_size); + if (new_sst->max_key) + { + memcpy(new_sst->max_key, pending->key, pending->entry.key_size); + new_sst->max_key_size = pending->entry.key_size; + } + + new_sst->num_entries++; + if (pending->entry.flags & TDB_KV_FLAG_TOMBSTONE) new_sst->tombstone_count++; + } + + tidesdb_kv_pair_free(pending); + pending = NULL; + } + + if (!kv) break; + + pending = kv; + pending_is_single_delete = (kv->entry.flags & TDB_KV_FLAG_SINGLE_DELETE) != 0; + pending_sd_paired_with_put = 0; + } + + if (aborted) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' aborting targeted merge for SSTable %" PRIu64, + cf->name, new_sst->id); + if (pending) tidesdb_kv_pair_free(pending); + tidesdb_klog_block_free(current_klog_block); + free(block_first_key); + free(block_last_key); + if (bloom) bloom_filter_free(bloom); + if (block_indexes) compact_block_index_free(block_indexes); + tidesdb_merge_heap_free(heap); + if (klog_bm) 
block_manager_close(klog_bm); + if (vlog_bm) block_manager_close(vlog_bm); + remove(new_sst->klog_path); + remove(new_sst->vlog_path); + tidesdb_sstable_unref(cf->db, new_sst); + while (!queue_is_empty(sstables_to_delete)) + { + tidesdb_sstable_t *sst = queue_dequeue(sstables_to_delete); + if (sst) tidesdb_sstable_unref(cf->db, sst); + } + queue_free(sstables_to_delete); + return TDB_SUCCESS; + } + + new_sst->max_seq = max_seq; + + if (current_klog_block->num_entries > 0) + { + uint8_t *klog_data; + size_t klog_size; + if (tidesdb_klog_block_serialize(current_klog_block, &klog_data, &klog_size) == 0) + { + uint8_t *final_data = klog_data; + size_t final_size = klog_size; + + if (cf->config.compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + uint8_t *compressed = compress_data(klog_data, klog_size, &compressed_size, + cf->config.compression_algorithm); + if (compressed) + { + free(klog_data); + final_data = compressed; + final_size = compressed_size; + } + } + + block_manager_block_t *klog_block = block_manager_block_create(final_size, final_data); + if (klog_block) + { + uint64_t block_file_position = atomic_load(&klog_bm->current_file_size); + block_manager_block_write(klog_bm, klog_block); + block_manager_block_release(klog_block); + + if (block_indexes && block_first_key && block_last_key) + { + if (klog_block_num % cf->config.index_sample_ratio == 0) + { + compact_block_index_add(block_indexes, block_first_key, + block_first_key_size, block_last_key, + block_last_key_size, block_file_position); + } + } + + klog_block_num++; + } + free(final_data); + } + } + + free(block_first_key); + free(block_last_key); + + tidesdb_klog_block_free(current_klog_block); + + new_sst->num_klog_blocks = klog_block_num; + new_sst->num_vlog_blocks = vlog_block_num; + + block_manager_get_size(klog_bm, &new_sst->klog_data_end_offset); + + if (new_sst->num_entries > 0) + { + /* write index + bloom footer blobs (chunk-aware, shared helper) */ + 
tidesdb_sstable_write_footer_aux(new_sst, klog_bm, block_indexes, bloom, 1); + block_indexes = NULL; /* ownership transferred; local must not double-free on abort */ + bloom = NULL; /* same as block_indexes */ + } + + uint64_t klog_size_before_metadata; + uint64_t vlog_size_before_metadata; + block_manager_get_size(klog_bm, &klog_size_before_metadata); + block_manager_get_size(vlog_bm, &vlog_size_before_metadata); + + new_sst->klog_size = klog_size_before_metadata; + new_sst->vlog_size = vlog_size_before_metadata; + + uint8_t *metadata_data = NULL; + size_t metadata_size = 0; + if (new_sst->num_entries > 0 && + sstable_metadata_serialize(new_sst, &metadata_data, &metadata_size) == 0) + { + block_manager_block_t *metadata_block = + block_manager_block_create(metadata_size, metadata_data); + if (metadata_block) + { + block_manager_block_write(klog_bm, metadata_block); + block_manager_block_release(metadata_block); + } + free(metadata_data); + } + + block_manager_get_size(klog_bm, &new_sst->klog_size); + block_manager_get_size(vlog_bm, &new_sst->vlog_size); + + tidesdb_merge_heap_free(heap); + + block_manager_escalate_fsync(klog_bm); + block_manager_escalate_fsync(vlog_bm); + + new_sst->klog_bm = klog_bm; + new_sst->vlog_bm = vlog_bm; + atomic_store(&new_sst->last_access_time, + atomic_load_explicit(&cf->db->cached_current_time, memory_order_relaxed)); + + atomic_thread_fence(memory_order_seq_cst); + + if (klog_bm) + { + block_manager_close(klog_bm); + new_sst->klog_bm = NULL; + } + if (vlog_bm) + { + block_manager_close(vlog_bm); + new_sst->vlog_bm = NULL; + } + +merge_complete:; + const uint64_t sst_id = new_sst->id; + const uint64_t num_entries = new_sst->num_entries; + + if (tidesdb_cf_abort_requested(cf)) + { + if (bloom) bloom_filter_free(bloom); + if (block_indexes) compact_block_index_free(block_indexes); + remove(new_sst->klog_path); + remove(new_sst->vlog_path); + tidesdb_sstable_unref(cf->db, new_sst); + while (!queue_is_empty(sstables_to_delete)) + { + 
tidesdb_sstable_t *sst = queue_dequeue(sstables_to_delete); + if (sst) tidesdb_sstable_unref(cf->db, sst); + } + queue_free(sstables_to_delete); + return TDB_SUCCESS; + } + + if (num_entries > 0) + { + num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + int target_level_num = target_level + 1; + int target_idx = -1; + for (int i = 0; i < num_levels; i++) + { + if (cf->levels[i]->level_num == target_level_num) + { + target_idx = i; + break; + } + } + + if (target_idx < 0 || target_idx >= num_levels) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Targeted merge target level %d not found", + target_level_num); + /* the merge output cannot be published -- mark it so sstable_free + * unlinks the klog/vlog files instead of orphaning them on disk for + * recovery to find as an sstable that is not in the manifest */ + atomic_store_explicit(&new_sst->marked_for_deletion, 1, memory_order_release); + tidesdb_sstable_unref(cf->db, new_sst); + } + else + { + tidesdb_level_add_sstable(cf->levels[target_idx], new_sst); + tidesdb_bump_sstable_layout_version(cf); + + tidesdb_manifest_add_sstable(cf->manifest, cf->levels[target_idx]->level_num, + new_sst->id, new_sst->num_entries, + new_sst->klog_size + new_sst->vlog_size); + atomic_store(&cf->manifest->sequence, atomic_load(&cf->next_sstable_id)); + tidesdb_manifest_commit(cf->manifest, cf->manifest->path); + tdb_objstore_upload_manifest(cf->db, cf); + + tidesdb_sstable_unref(cf->db, new_sst); + } + } + else + { + if (bloom) bloom_filter_free(bloom); + if (block_indexes) compact_block_index_free(block_indexes); + remove(new_sst->klog_path); + remove(new_sst->vlog_path); + tidesdb_sstable_unref(cf->db, new_sst); + } + + tidesdb_cleanup_merged_sstables(cf, sstables_to_delete, min_input_level, max_input_level); + queue_free(sstables_to_delete); + + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Targeted merge completed for CF '%s', wrote SSTable %" PRIu64 " (%" PRIu64 + " entries) to level %d", + cf->name, sst_id, num_entries, 
target_level + 1); + + return TDB_SUCCESS; +} + +/** + * tidesdb_dividing_merge_ctx_t + * shared read-only context for a dividing merge's parallel partition sub-merges. each partition + * is a disjoint key range with its own heap and output sstable, so the only shared mutation is the + * commit (level add + manifest), which the worker serializes on cf->compaction_commit_lock. + */ +typedef struct +{ + tidesdb_column_family_t *cf; + int target_level; + int is_largest_level; + int num_boundaries; + uint8_t **file_boundaries; + size_t *boundary_sizes; + tidesdb_sstable_t **del_snap; + size_t del_snap_count; + skip_list_comparator_fn comparator_fn; + void *comparator_ctx; + uint64_t partition_estimated_entries; + uint64_t min_snapshot_seq; + _Atomic(int) aborted; +} tidesdb_dividing_merge_ctx_t; + +static int tidesdb_dividing_merge_partition(void *vctx, int partition); + +/** + * tidesdb_dividing_merge + * dividing merge into level X and partition based on largest level boundaries + * @param cf column family + * @param target_level target level + * @return 0 on success, negative on failure + */ +static int tidesdb_dividing_merge(tidesdb_column_family_t *cf, int target_level) +{ + if (tidesdb_cf_abort_requested(cf)) return TDB_SUCCESS; + + int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + if (target_level >= num_levels || target_level < 0) + { + return TDB_ERR_INVALID_ARGS; + } + + /* snapshot floor -- see tidesdb_sstable_write_from_heap_btree for rationale */ + const uint64_t min_snapshot_seq = tidesdb_min_active_snapshot_seq(cf->db); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Starting dividing merge for CF '%s', target_level=%d", cf->name, + target_level + 1); + + if (target_level >= num_levels - 1) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Target level %d is the largest level, need to add new level before merge", + target_level + 1); + + /*** we ensure there's a level to merge into */ + if (target_level + 1 >= num_levels) + { + const int 
add_result = tidesdb_add_level(cf); + if (add_result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to add level before merge, error: %d", + add_result); + return add_result; + } + + num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Added level, now have %d levels", num_levels); + } + + return tidesdb_full_preemptive_merge(cf, 0, target_level, target_level); + } + + tidesdb_level_t *target = cf->levels[target_level]; + /** dividing merge + * we use boundaries from target_level+1 (the level we're merging into) */ + tidesdb_level_t *next_level = cf->levels[target_level + 1]; + + tidesdb_level_update_boundaries(target, next_level); + + int next_level_num_ssts = atomic_load_explicit(&next_level->num_sstables, memory_order_acquire); + TDB_DEBUG_LOG(TDB_LOG_INFO, "Next level (L%d) has %d SSTables", next_level->level_num, + next_level_num_ssts); + tidesdb_sstable_t **next_level_ssts = + atomic_load_explicit(&next_level->sstables, memory_order_acquire); + for (int i = 0; i < next_level_num_ssts; i++) + { + const tidesdb_sstable_t *sst = next_level_ssts[i]; + if (sst) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Next level SSTable %" PRIu64 " (min_key_size=%zu, max_key_size=%zu)", + sst->id, sst->min_key_size, sst->max_key_size); + } + } + + skip_list_comparator_fn comparator_fn = NULL; + void *comparator_ctx = NULL; + tidesdb_resolve_comparator(cf->db, &cf->config, &comparator_fn, &comparator_ctx); + + queue_t *sstables_to_delete = queue_new(); + if (!sstables_to_delete) return TDB_ERR_MEMORY; + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Snapshotting SSTable IDs from levels 1-%d", target_level + 1); + queue_t *sstable_ids_snapshot = tidesdb_snapshot_sst_ids(cf, 0, target_level); + if (!sstable_ids_snapshot) + { + queue_free(sstables_to_delete); + return TDB_ERR_MEMORY; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Collecting SSTables from levels 1-%d", target_level + 1); + tidesdb_sstable_t **ssts_array = NULL; + int sst_count 
= 0; + const int collect_result = tidesdb_collect_ssts_from_snapshot( + cf, 0, target_level, sstable_ids_snapshot, &ssts_array, &sst_count); + if (collect_result != TDB_SUCCESS) + { + queue_free(sstables_to_delete); + tidesdb_cleanup_snapshot_ids(sstable_ids_snapshot); + return collect_result; + } + + /* we prefetch input sstables before partition loop */ + if (cf->db->object_store && cf->config.object_prefetch_compaction) + { + tdb_objstore_prefetch_sstables(cf->db, ssts_array, sst_count); + } + + for (int i = 0; i < sst_count; i++) + { + tidesdb_sstable_t *sst = ssts_array[i]; + TDB_DEBUG_LOG(TDB_LOG_INFO, + "collecting SSTable %" PRIu64 " (min_key_size=%zu, max_key_size=%zu)", + sst->id, sst->min_key_size, sst->max_key_size); + queue_enqueue(sstables_to_delete, sst); + } + free(ssts_array); + + /* we get partition boundaries from target level */ + target = cf->levels[target_level]; + int num_boundaries = atomic_load_explicit(&target->num_boundaries, memory_order_acquire); + uint8_t **file_boundaries = + atomic_load_explicit(&target->file_boundaries, memory_order_acquire); + size_t *boundary_sizes = atomic_load_explicit(&target->boundary_sizes, memory_order_acquire); + + /* we get number of sstables being merged */ + size_t num_sstables_to_merge = queue_size(sstables_to_delete); + + /* if no boundaries, do a simple full merge */ + if (num_boundaries == 0) + { + int result = tidesdb_full_preemptive_merge(cf, 0, target_level, target_level); + + while (!queue_is_empty(sstables_to_delete)) + { + tidesdb_sstable_t *sst = queue_dequeue(sstables_to_delete); + if (sst) tidesdb_sstable_unref(cf->db, sst); + } + queue_free(sstables_to_delete); + tidesdb_cleanup_snapshot_ids(sstable_ids_snapshot); + + return result; + } + + /* snapshot sstables_to_delete into an array once for O(1) indexed access */ + tidesdb_sstable_t **del_snap = malloc(num_sstables_to_merge * sizeof(tidesdb_sstable_t *)); + if (!del_snap) + { + queue_free(sstables_to_delete); + 
tidesdb_cleanup_snapshot_ids(sstable_ids_snapshot); + return TDB_ERR_MEMORY; + } + const size_t del_snap_count = + queue_snapshot(sstables_to_delete, (void **)del_snap, num_sstables_to_merge); + + /* we calculate total estimated entries from all ssts being merged */ + uint64_t total_estimated_entries = 0; + for (size_t i = 0; i < del_snap_count; i++) + { + if (del_snap[i]) + { + total_estimated_entries += del_snap[i]->num_entries; + } + } + + /* partitioned merge creates one sstable per partition */ + int num_partitions = num_boundaries + 1; + + /* a tombstone can be reaped only when no older data exists below the merge + * output -- i.e. every level deeper than this merge's deepest input is + * empty. normally a dividing merge targets level X < L and this is false, + * but in a small tree the dividing merge is effectively the largest-level + * merge and the tombstones must drop or they accumulate forever. */ + int dm_num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + int is_largest_level = 1; + for (int dl = target_level + 1; dl < dm_num_levels; dl++) + { + if (cf->levels[dl] && + atomic_load_explicit(&cf->levels[dl]->num_sstables, memory_order_acquire) > 0) + { + is_largest_level = 0; + break; + } + } + + /* we estimate entries per partition (divide total by number of partitions) */ + uint64_t partition_estimated_entries = total_estimated_entries / num_partitions; + if (partition_estimated_entries < TDB_MERGE_MIN_ESTIMATED_ENTRIES) + partition_estimated_entries = TDB_MERGE_MIN_ESTIMATED_ENTRIES; + + int aborted = 0; + + tidesdb_dividing_merge_ctx_t dctx; + dctx.cf = cf; + dctx.target_level = target_level; + dctx.is_largest_level = is_largest_level; + dctx.num_boundaries = num_boundaries; + dctx.file_boundaries = file_boundaries; + dctx.boundary_sizes = boundary_sizes; + dctx.del_snap = del_snap; + dctx.del_snap_count = del_snap_count; + dctx.comparator_fn = comparator_fn; + dctx.comparator_ctx = comparator_ctx; + 
dctx.partition_estimated_entries = partition_estimated_entries; + dctx.min_snapshot_seq = min_snapshot_seq; + atomic_init(&dctx.aborted, 0); + + /* run the partition sub-merges across the sub-compaction helper pool (the calling thread + * participates too); each partition commits its own output under cf->compaction_commit_lock */ + tidesdb_run_subcompactions(cf->db, &dctx, tidesdb_dividing_merge_partition, num_partitions); + + if (atomic_load_explicit(&dctx.aborted, memory_order_acquire)) aborted = 1; + + free(del_snap); + + if (aborted) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' aborting dividing merge", cf->name); + while (!queue_is_empty(sstables_to_delete)) + { + tidesdb_sstable_t *sst = queue_dequeue(sstables_to_delete); + if (sst) tidesdb_sstable_unref(cf->db, sst); + } + queue_free(sstables_to_delete); + tidesdb_cleanup_snapshot_ids(sstable_ids_snapshot); + return TDB_SUCCESS; + } + + tidesdb_cleanup_merged_sstables(cf, sstables_to_delete, 0, target_level); + queue_free(sstables_to_delete); + tidesdb_cleanup_snapshot_ids(sstable_ids_snapshot); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Completed dividing merge for CF '%s'", cf->name); + return TDB_SUCCESS; +} + +/** + * tidesdb_dividing_merge_partition + * one partition's sub-merge for tidesdb_dividing_merge. body is the original serial partition + * loop iteration, wrapped in do/while(0)-- a top-level continue still skips this partition and the + * abort break still bails. shared context arrives via vctx; the commit section is serialized on + * cf->compaction_commit_lock. 
+ */ +static int tidesdb_dividing_merge_partition(void *vctx, int partition) +{ + tidesdb_dividing_merge_ctx_t *c = (tidesdb_dividing_merge_ctx_t *)vctx; + tidesdb_column_family_t *cf = c->cf; + const int target_level = c->target_level; + const int is_largest_level = c->is_largest_level; + const int num_boundaries = c->num_boundaries; + uint8_t **file_boundaries = c->file_boundaries; + size_t *boundary_sizes = c->boundary_sizes; + tidesdb_sstable_t **del_snap = c->del_snap; + const size_t del_snap_count = c->del_snap_count; + skip_list_comparator_fn comparator_fn = c->comparator_fn; + void *comparator_ctx = c->comparator_ctx; + uint64_t partition_estimated_entries = c->partition_estimated_entries; + const uint64_t min_snapshot_seq = c->min_snapshot_seq; + int aborted = 0; + + do + { + if (tidesdb_cf_abort_requested(cf)) + { + aborted = 1; + break; + } + + /* we create separate heap for this partition to avoid data loss */ + tidesdb_merge_heap_t *partition_heap = + tidesdb_merge_heap_create(comparator_fn, comparator_ctx); + if (!partition_heap) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to create heap for partition %d", partition); + continue; + } + + /* we determine key range for this partition */ + uint8_t *range_start = (partition > 0) ? file_boundaries[partition - 1] : NULL; + size_t range_start_size = (partition > 0) ? boundary_sizes[partition - 1] : 0; + uint8_t *range_end = (partition < num_boundaries) ? file_boundaries[partition] : NULL; + size_t range_end_size = (partition < num_boundaries) ? 
boundary_sizes[partition] : 0; + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Partition %d range [start_size=%zu, end_size=%zu)", partition, + range_start_size, range_end_size); + + /* we add only overlapping sstables to this partitions heap */ + uint64_t partition_entries = 0; + for (size_t i = 0; i < del_snap_count; i++) + { + tidesdb_sstable_t *sst = del_snap[i]; + if (!sst) continue; + + /* we check if this sstable overlaps with partition range */ + int overlaps = 1; + + if (range_start && comparator_fn(sst->max_key, sst->max_key_size, range_start, + range_start_size, comparator_ctx) < 0) + { + overlaps = 0; /* sst is entirely before partition */ + } + + if (overlaps && range_end && + comparator_fn(sst->min_key, sst->min_key_size, range_end, range_end_size, + comparator_ctx) >= 0) + { + overlaps = 0; /* sst is entirely after partition */ + } + + if (overlaps) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Partition %d SSTable %" PRIu64 + " overlaps (min_key_size=%zu, max_key_size=%zu)", + partition, sst->id, sst->min_key_size, sst->max_key_size); + tidesdb_merge_source_t *source = tidesdb_merge_source_from_sstable(cf->db, sst); + if (source) + { + if (source->current_kv) + { + if (tidesdb_merge_heap_add_source(partition_heap, source) == TDB_SUCCESS) + { + partition_entries += sst->num_entries; + } + else + { + tidesdb_merge_source_free(source); + } + } + else + { + tidesdb_merge_source_free(source); + } + } + } + } + + if (partition_estimated_entries < TDB_MERGE_MIN_ESTIMATED_ENTRIES) + partition_estimated_entries = TDB_MERGE_MIN_ESTIMATED_ENTRIES; + + if (tidesdb_merge_heap_empty(partition_heap)) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Partition %d skipping empty partition (no overlapping SSTables)", + partition); + tidesdb_merge_heap_free(partition_heap); + continue; + } + + /* we create new sst for this partition with partition naming */ + uint64_t sst_id = atomic_fetch_add(&cf->next_sstable_id, 1); + char sst_path[MAX_FILE_PATH_LENGTH]; + snprintf(sst_path, sizeof(sst_path), + "%s" 
PATH_SEPARATOR TDB_LEVEL_PREFIX "%d" TDB_LEVEL_PARTITION_PREFIX "%d", + cf->directory, target_level + 1, partition); + + tidesdb_sstable_t *new_sst = tidesdb_sstable_create(cf->db, sst_path, sst_id, &cf->config); + if (!new_sst) + { + tidesdb_merge_heap_free(partition_heap); + continue; + } + + block_manager_t *klog_bm = NULL; + block_manager_t *vlog_bm = NULL; + + if (block_manager_open(&klog_bm, new_sst->klog_path, + convert_sync_mode(cf->config.sync_mode == TDB_SYNC_INTERVAL + ? TDB_SYNC_FULL + : cf->config.sync_mode)) != 0) + { + tidesdb_merge_heap_free(partition_heap); + tidesdb_sstable_unref(cf->db, new_sst); + continue; + } + + if (block_manager_open(&vlog_bm, new_sst->vlog_path, + convert_sync_mode(cf->config.sync_mode == TDB_SYNC_INTERVAL + ? TDB_SYNC_FULL + : cf->config.sync_mode)) != 0) + { + block_manager_close(klog_bm); + tidesdb_merge_heap_free(partition_heap); + tidesdb_sstable_unref(cf->db, new_sst); + continue; + } + + /* we merge keys in this partition's range */ + tidesdb_klog_block_t *klog_block = tidesdb_klog_block_create(); + + uint64_t entry_count = 0; + uint64_t tombstone_count = 0; + uint64_t klog_block_num = 0; + uint64_t vlog_block_num = 0; + uint64_t max_seq = 0; + uint8_t *first_key = NULL; + size_t first_key_size = 0; + uint8_t *last_key = NULL; + size_t last_key_size = 0; + + bloom_filter_t *bloom = NULL; + tidesdb_block_index_t *block_indexes = NULL; + + /* we track first and last key of current block for block index */ + uint8_t *block_first_key = NULL; + size_t block_first_key_size = 0; + uint8_t *block_last_key = NULL; + size_t block_last_key_size = 0; + + if (cf->config.enable_bloom_filter) + { + if (bloom_filter_new(&bloom, cf->config.bloom_fpr, (int)partition_entries) == 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Partition %d bloom filter created (estimated entries: %" PRIu64 ")", + partition, partition_entries); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Partition %d bloom filter creation failed", + partition); + bloom = 
NULL; + } + } + + if (cf->config.enable_block_indexes && !cf->config.use_btree) + { + block_indexes = + compact_block_index_create(partition_entries, cf->config.block_index_prefix_len, + comparator_fn, comparator_ctx); + } + + /* we branch to btree output if use_btree is enabled. + * is_largest_level mirrors the non-btree branch below, a small-tree + * dividing merge whose deeper levels are all empty is the effective + * bottom, so regular tombstones must drop here or they accumulate + * forever (the same reclamation bug fixed for partitioned merge). */ + if (cf->config.use_btree) + { + tidesdb_klog_block_free(klog_block); + klog_block = NULL; + + int btree_result = tidesdb_sstable_write_from_heap_btree( + cf, new_sst, partition_heap, klog_bm, vlog_bm, bloom, NULL, is_largest_level); + block_manager_close(klog_bm); + block_manager_close(vlog_bm); + tidesdb_merge_heap_free(partition_heap); + + bloom = NULL; + + if (btree_result != TDB_SUCCESS || new_sst->num_entries == 0) + { + if (new_sst->num_entries == 0) + { + remove(new_sst->klog_path); + remove(new_sst->vlog_path); + } + tidesdb_sstable_unref(cf->db, new_sst); + continue; + } + + /* we add the btree sstable to target level (commit serialized across partitions) */ + pthread_mutex_lock(&cf->compaction_commit_lock); + tidesdb_level_add_sstable(cf->levels[target_level], new_sst); + tidesdb_bump_sstable_layout_version(cf); + tidesdb_manifest_add_sstable(cf->manifest, cf->levels[target_level]->level_num, + new_sst->id, new_sst->num_entries, + new_sst->klog_size + new_sst->vlog_size); + pthread_mutex_unlock(&cf->compaction_commit_lock); + tidesdb_sstable_unref(cf->db, new_sst); + continue; + } + + /* single-step lookahead pretty much same pair-cancel pattern as full-preemptive merge. + * dividing merge never goes to the largest level so there's no + * tombstone-at-largest-level drop here, only ttl drop and single- + * delete pair-cancel. 
*/ + tidesdb_kv_pair_t *pending = NULL; + int pending_is_single_delete = 0; + int pending_sd_paired_with_put = 0; + + /* we process entries from partition-specific heap -- filter keys by partition range */ + while (!tidesdb_merge_heap_empty(partition_heap) || pending != NULL) + { + tidesdb_kv_pair_t *kv = NULL; + + if (!tidesdb_merge_heap_empty(partition_heap)) + { + kv = tidesdb_merge_heap_pop(partition_heap, NULL); + + if (kv) + { + /* we filter keys by partition range -- merge source reads + * all keys from sst but we only want keys within this + * partition's boundaries. range-filtered keys cannot pair + * with pending because pending's key is in range. */ + if (range_start && comparator_fn(kv->key, kv->entry.key_size, range_start, + range_start_size, comparator_ctx) < 0) + { + tidesdb_kv_pair_free(kv); + kv = NULL; + continue; + } + if (range_end && comparator_fn(kv->key, kv->entry.key_size, range_end, + range_end_size, comparator_ctx) >= 0) + { + tidesdb_kv_pair_free(kv); + kv = NULL; + continue; + } + } + } + + if (kv && pending && pending->entry.key_size == kv->entry.key_size && + memcmp(pending->key, kv->key, pending->entry.key_size) == 0 && + pending->entry.seq <= min_snapshot_seq) + { + /* older same-key version -- drop silently. a pending single- + * delete pairs with a live put here and cancels on resolve. 
*/ + if (pending_is_single_delete && !(kv->entry.flags & TDB_KV_FLAG_TOMBSTONE)) + { + pending_sd_paired_with_put = 1; + } + tidesdb_kv_pair_free(kv); + continue; + } + + /* new key arrived (or heap exhausted) -- decide the fate of pending */ + if (pending) + { + const int sd_pair_drop = pending_is_single_delete && pending_sd_paired_with_put; + /* reap a plain tombstone only when this merge reaches the + * effective bottom of the tree (no deeper level holds data) */ + const int tombstone_drop = (pending->entry.flags & TDB_KV_FLAG_TOMBSTONE) && + is_largest_level && + pending->entry.seq <= min_snapshot_seq; + const int ttl_drop = + pending->entry.ttl > 0 && + pending->entry.ttl < + atomic_load_explicit(&cf->db->cached_current_time, memory_order_relaxed); + + if (!sd_pair_drop && !tombstone_drop && !ttl_drop) + { + /* we add to sst */ + if (!first_key) + { + first_key = malloc(pending->entry.key_size); + if (first_key) + { + memcpy(first_key, pending->key, pending->entry.key_size); + first_key_size = pending->entry.key_size; + } + } + + free(last_key); + last_key = malloc(pending->entry.key_size); + if (last_key) + { + memcpy(last_key, pending->key, pending->entry.key_size); + last_key_size = pending->entry.key_size; + } + + if (bloom) + { + bloom_filter_add(bloom, pending->key, pending->entry.key_size); + } + + /* large values go to the output vlog -- without recording a + * fresh offset here the entry is neither inline nor in vlog + * and the klog block serializes inconsistently */ + if (pending->entry.value_size >= cf->config.klog_value_threshold && + pending->value) + { + uint8_t *final_data = pending->value; + size_t final_size = pending->entry.value_size; + uint8_t *compressed = NULL; + + if (cf->config.compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + compressed = + compress_data(pending->value, pending->entry.value_size, + &compressed_size, cf->config.compression_algorithm); + if (compressed) + { + final_data = compressed; + 
final_size = compressed_size; + } + } + + block_manager_block_t *vlog_block = + block_manager_block_create(final_size, final_data); + if (vlog_block) + { + int64_t block_offset = block_manager_block_write(vlog_bm, vlog_block); + if (block_offset >= 0) + { + pending->entry.vlog_offset = (uint64_t)block_offset; + vlog_block_num++; + } + block_manager_block_release(vlog_block); + } + free(compressed); + } + + /* we check if this is the first entry in a new block */ + int is_first_entry_in_block = (klog_block->num_entries == 0); + + tidesdb_klog_block_add_entry(klog_block, pending, &cf->config, comparator_fn, + comparator_ctx); + + /* we track first key of block */ + if (is_first_entry_in_block) + { + free(block_first_key); + block_first_key = malloc(pending->entry.key_size); + if (block_first_key) + { + memcpy(block_first_key, pending->key, pending->entry.key_size); + block_first_key_size = pending->entry.key_size; + } + } + + /* we always update last key of block */ + free(block_last_key); + block_last_key = malloc(pending->entry.key_size); + if (block_last_key) + { + memcpy(block_last_key, pending->key, pending->entry.key_size); + block_last_key_size = pending->entry.key_size; + } + + if (tidesdb_klog_block_is_full(klog_block, TDB_KLOG_BLOCK_SIZE)) + { + uint8_t *klog_data; + size_t klog_size; + if (tidesdb_klog_block_serialize(klog_block, &klog_data, &klog_size) == 0) + { + uint8_t *final_klog_data = klog_data; + size_t final_klog_size = klog_size; + + if (cf->config.compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + uint8_t *compressed = + compress_data(klog_data, klog_size, &compressed_size, + cf->config.compression_algorithm); + if (compressed) + { + free(klog_data); + final_klog_data = compressed; + final_klog_size = compressed_size; + } + } + + block_manager_block_t *klog_bm_block = + block_manager_block_create(final_klog_size, final_klog_data); + if (klog_bm_block) + { + uint64_t block_file_position = + 
atomic_load(&klog_bm->current_file_size); + block_manager_block_write(klog_bm, klog_bm_block); + block_manager_block_release(klog_bm_block); + + if (block_indexes && block_first_key && block_last_key) + { + if (klog_block_num % cf->config.index_sample_ratio == 0) + { + compact_block_index_add(block_indexes, block_first_key, + block_first_key_size, + block_last_key, block_last_key_size, + block_file_position); + } + } + + klog_block_num++; + } + free(final_klog_data); + } + + tidesdb_klog_block_free(klog_block); + klog_block = tidesdb_klog_block_create(); + + /* we reset block tracking for new block */ + free(block_first_key); + free(block_last_key); + block_first_key = NULL; + block_last_key = NULL; + } + + /* we track maximum sequence number */ + if (pending->entry.seq > max_seq) + { + max_seq = pending->entry.seq; + } + + entry_count++; + if (pending->entry.flags & TDB_KV_FLAG_TOMBSTONE) tombstone_count++; + } + + tidesdb_kv_pair_free(pending); + pending = NULL; + } + + if (!kv) break; + + pending = kv; + pending_is_single_delete = (kv->entry.flags & TDB_KV_FLAG_SINGLE_DELETE) != 0; + pending_sd_paired_with_put = 0; + } + + tidesdb_merge_heap_free(partition_heap); + + /* we must write remaining klog block if it has data */ + if (klog_block->num_entries > 0) + { + uint8_t *klog_data; + size_t klog_size; + if (tidesdb_klog_block_serialize(klog_block, &klog_data, &klog_size) == 0) + { + uint8_t *final_klog_data = klog_data; + size_t final_klog_size = klog_size; + + if (cf->config.compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + uint8_t *compressed = compress_data(klog_data, klog_size, &compressed_size, + cf->config.compression_algorithm); + if (compressed) + { + free(klog_data); + final_klog_data = compressed; + final_klog_size = compressed_size; + } + } + + block_manager_block_t *block = + block_manager_block_create(final_klog_size, final_klog_data); + if (block) + { + /* we capture file position before writing the block */ + uint64_t 
block_file_position = atomic_load(&klog_bm->current_file_size); + block_manager_block_write(klog_bm, block); + block_manager_block_release(block); + + /* we add final block to index after writing with correct file position */ + if (block_indexes && block_first_key && block_last_key) + { + /* we sample every Nth block (ratio validated to be >= 1) */ + if (klog_block_num % cf->config.index_sample_ratio == 0) + { + compact_block_index_add(block_indexes, block_first_key, + block_first_key_size, block_last_key, + block_last_key_size, block_file_position); + } + } + + klog_block_num++; + } + free(final_klog_data); + } + } + + free(block_first_key); + free(block_last_key); + + tidesdb_klog_block_free(klog_block); + + new_sst->num_klog_blocks = klog_block_num; + new_sst->num_vlog_blocks = vlog_block_num; + + new_sst->num_entries = entry_count; + new_sst->tombstone_count = tombstone_count; + new_sst->max_seq = max_seq; + new_sst->min_key = first_key; + new_sst->min_key_size = first_key_size; + new_sst->max_key = last_key; + new_sst->max_key_size = last_key_size; + + /* we capture klog file offset where data blocks end (before writing index/bloom/metadata) + */ + block_manager_get_size(klog_bm, &new_sst->klog_data_end_offset); + + /* we write auxiliary structures (always write, even if empty, to maintain consistent file + * structure) */ + if (entry_count > 0) + { + /* write index + bloom footer blobs (chunk-aware, shared helper) */ + tidesdb_sstable_write_footer_aux(new_sst, klog_bm, block_indexes, bloom, 1); + block_indexes = NULL; /* ownership transferred; local must not double-free on abort */ + bloom = NULL; /* ownership transferred; local must not double-free on abort */ + } + + /* we get file sizes before metadata write for serialization */ + uint64_t klog_size_before_metadata; + uint64_t vlog_size_before_metadata; + block_manager_get_size(klog_bm, &klog_size_before_metadata); + block_manager_get_size(vlog_bm, &vlog_size_before_metadata); + + /* we temporarily set 
sizes for metadata serialization */ + new_sst->klog_size = klog_size_before_metadata; + new_sst->vlog_size = vlog_size_before_metadata; + + /* we write metadata block as the last block -- only if we have entries */ + uint8_t *metadata_data = NULL; + size_t metadata_size = 0; + if (entry_count > 0 && + sstable_metadata_serialize(new_sst, &metadata_data, &metadata_size) == 0) + { + block_manager_block_t *metadata_block = + block_manager_block_create(metadata_size, metadata_data); + if (metadata_block) + { + block_manager_block_write(klog_bm, metadata_block); + block_manager_block_release(metadata_block); + } + free(metadata_data); + } + + /* we get final file sizes after metadata write */ + block_manager_get_size(klog_bm, &new_sst->klog_size); + block_manager_get_size(vlog_bm, &new_sst->vlog_size); + + /* we keep block managers open for immediate reads, reaper will close if needed once it's + * evicted */ + new_sst->klog_bm = klog_bm; + new_sst->vlog_bm = vlog_bm; + atomic_store(&new_sst->last_access_time, + atomic_load_explicit(&cf->db->cached_current_time, memory_order_relaxed)); + atomic_fetch_add(&cf->db->num_open_sstables, 1); + + /* we ensure all writes are visible before making sstable discoverable */ + atomic_thread_fence(memory_order_seq_cst); + + /* we add to target level */ + TDB_DEBUG_LOG(TDB_LOG_INFO, "Partition %d merged %" PRIu64 " entries", partition, + entry_count); + + if (entry_count > 0 && tidesdb_cf_abort_requested(cf)) + { + /* drop fired during this partition's merge; do not publish the partition output */ + if (bloom) bloom_filter_free(bloom); + if (block_indexes) compact_block_index_free(block_indexes); + remove(new_sst->klog_path); + remove(new_sst->vlog_path); + tidesdb_sstable_unref(cf->db, new_sst); + aborted = 1; + break; + } + + if (entry_count > 0) + { + /* we reload num_levels as DCA may have changed it */ + int current_num_levels = + atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + /* we find the target level 
by level_num, not by stale array index */ + int target_level_num = target_level + 1; + int target_idx = -1; + for (int i = 0; i < current_num_levels; i++) + { + if (cf->levels[i]->level_num == target_level_num) + { + target_idx = i; + break; + } + } + + if (target_idx < 0 || target_idx >= current_num_levels) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Partition %d target level %d not found " + "(current_num_levels=%d)", + partition, target_level_num, current_num_levels); + tidesdb_sstable_unref(cf->db, new_sst); + } + else + { + TDB_DEBUG_LOG( + TDB_LOG_INFO, + "Partition %d adding merged SSTable %" PRIu64 " to level %d (array index %d)", + partition, new_sst->id, cf->levels[target_idx]->level_num, target_idx); + /* commit serialized across partitions (shared level array + manifest) */ + pthread_mutex_lock(&cf->compaction_commit_lock); + tidesdb_level_add_sstable(cf->levels[target_idx], new_sst); + tidesdb_bump_sstable_layout_version(cf); + + tidesdb_manifest_add_sstable(cf->manifest, cf->levels[target_idx]->level_num, + new_sst->id, new_sst->num_entries, + new_sst->klog_size + new_sst->vlog_size); + atomic_store(&cf->manifest->sequence, atomic_load(&cf->next_sstable_id)); + int manifest_result = tidesdb_manifest_commit(cf->manifest, cf->manifest->path); + if (manifest_result != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Partition %d failed to commit manifest for SSTable %" PRIu64 + " (error: %d)", + partition, new_sst->id, manifest_result); + } + + tdb_objstore_upload_manifest(cf->db, cf); + pthread_mutex_unlock(&cf->compaction_commit_lock); + + tidesdb_sstable_unref(cf->db, new_sst); + } + } + else + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Partition %d skipping empty SSTable %" PRIu64 " (0 entries)", partition, + new_sst->id); + + if (bloom) bloom_filter_free(bloom); + if (block_indexes) compact_block_index_free(block_indexes); + + remove(new_sst->klog_path); + remove(new_sst->vlog_path); + tidesdb_sstable_unref(cf->db, new_sst); + } + } while (0); + + if (aborted) 
atomic_store_explicit(&c->aborted, 1, memory_order_release); + return TDB_SUCCESS; +} + +/** + * tdb_partitioned_merge_finalize_sst + * finalize an output sstable during partitioned merge. + * writes aux blocks (index, bloom, metadata), closes block managers, + * adds to target level, and commits manifest. + * used both for normal partition completion and mid-partition file_max splits. + * + * @param cf column family + * @param sst sstable to finalize (takes ownership, caller must not use after) + * @param klog_bm klog block manager (closed on return) + * @param vlog_bm vlog block manager (closed on return) + * @param bloom bloom filter (ownership transferred to sst) + * @param block_indexes block index (ownership transferred to sst) + * @param entry_count number of entries written + * @param tombstone_count number of tombstones + * @param klog_block_num number of klog blocks written + * @param vlog_block_num number of vlog blocks written + * @param max_seq maximum sequence number seen + * @param end_level 1-indexed target level number + * @param partition partition index (for logging) + * @return 0 on success, -1 on failure + */ +static int tdb_partitioned_merge_finalize_sst( + tidesdb_column_family_t *cf, tidesdb_sstable_t *sst, block_manager_t *klog_bm, + block_manager_t *vlog_bm, bloom_filter_t *bloom, tidesdb_block_index_t *block_indexes, + const uint64_t entry_count, const uint64_t tombstone_count, const uint64_t klog_block_num, + const uint64_t vlog_block_num, const uint64_t max_seq, const int end_level, const int partition) +{ + sst->num_klog_blocks = klog_block_num; + sst->num_vlog_blocks = vlog_block_num; + sst->num_entries = entry_count; + sst->tombstone_count = tombstone_count; + sst->max_seq = max_seq; + + block_manager_get_size(klog_bm, &sst->klog_data_end_offset); + + if (entry_count > 0) + { + /* write index + bloom footer blobs (chunk-aware, shared helper). ownership + * of block_indexes/bloom transfers to sst inside the helper. 
*/ + tidesdb_sstable_write_footer_aux(sst, klog_bm, block_indexes, bloom, 1); + } + + uint64_t klog_size_before_metadata; + uint64_t vlog_size_before_metadata; + block_manager_get_size(klog_bm, &klog_size_before_metadata); + block_manager_get_size(vlog_bm, &vlog_size_before_metadata); + sst->klog_size = klog_size_before_metadata; + sst->vlog_size = vlog_size_before_metadata; + + uint8_t *metadata_data = NULL; + size_t metadata_size = 0; + if (entry_count > 0 && sstable_metadata_serialize(sst, &metadata_data, &metadata_size) == 0) + { + block_manager_block_t *metadata_block = + block_manager_block_create(metadata_size, metadata_data); + if (metadata_block) + { + block_manager_block_write(klog_bm, metadata_block); + block_manager_block_release(metadata_block); + } + free(metadata_data); + } + + block_manager_get_size(klog_bm, &sst->klog_size); + block_manager_get_size(vlog_bm, &sst->vlog_size); + + block_manager_close(klog_bm); + block_manager_close(vlog_bm); + + atomic_thread_fence(memory_order_seq_cst); + + /* drop fired during this partition's finalize; do not publish the partial sstable */ + if (entry_count > 0 && tidesdb_cf_abort_requested(cf)) + { + remove(sst->klog_path); + remove(sst->vlog_path); + tidesdb_sstable_unref(cf->db, sst); + return 0; + } + + if (entry_count > 0) + { + int current_num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + const int target_level_num = end_level; + int target_idx = -1; + for (int i = 0; i < current_num_levels; i++) + { + if (cf->levels[i]->level_num == target_level_num) + { + target_idx = i; + break; + } + } + + if (target_idx < 0 || target_idx >= current_num_levels) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Partitioned merge partition %d, target level %d not found " + "(current_num_levels=%d), data would be lost!", + partition, target_level_num, current_num_levels); + tidesdb_sstable_unref(cf->db, sst); + return -1; + } + + /* commit serialized across partitions (shared level array + manifest); 
finalize is + * called from each partition sub-merge, possibly concurrently, and also mid-partition + * on a file_max split, so the lock guards every output's publish */ + pthread_mutex_lock(&cf->compaction_commit_lock); + tidesdb_level_add_sstable(cf->levels[target_idx], sst); + tidesdb_bump_sstable_layout_version(cf); + + tidesdb_manifest_add_sstable(cf->manifest, cf->levels[target_idx]->level_num, sst->id, + sst->num_entries, sst->klog_size + sst->vlog_size); + atomic_store(&cf->manifest->sequence, atomic_load(&cf->next_sstable_id)); + const int manifest_result = tidesdb_manifest_commit(cf->manifest, cf->manifest->path); + if (manifest_result != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Partitioned merge partition %d failed to commit manifest for " + "SSTable %" PRIu64 " (error: %d)", + partition, sst->id, manifest_result); + } + + tdb_objstore_upload_manifest(cf->db, cf); + pthread_mutex_unlock(&cf->compaction_commit_lock); + + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Partitioned merge partition %d finalized SSTable %" PRIu64 " with %" PRIu64 + " entries, %" PRIu64 " klog blocks", + partition, sst->id, sst->num_entries, sst->num_klog_blocks); + tidesdb_sstable_unref(cf->db, sst); + } + else + { + if (bloom) bloom_filter_free(bloom); + if (block_indexes) compact_block_index_free(block_indexes); + remove(sst->klog_path); + remove(sst->vlog_path); + tidesdb_sstable_unref(cf->db, sst); + } + + return 0; +} + +/** + * tidesdb_partitioned_merge_ctx_t / _partition + * shared read-only context for a partitioned merge's parallel partition sub-merges. each partition + * is a disjoint key range with its own heap and output sstable(s); commits go through + * tdb_partitioned_merge_finalize_sst (or the inline btree path), both serialized on + * cf->compaction_commit_lock. the per-partition body is the original serial iteration wrapped in + * do/while(0) so top-level continue/break keep their meaning. 
+ */ +typedef struct +{ + tidesdb_column_family_t *cf; + int start_idx; + int end_idx; + int end_level; + int num_partitions; + uint8_t **boundaries; + size_t *boundary_sizes; + int *partition_skipped; + size_t file_max; + int targeting_largest; + _Atomic(int) aborted; +} tidesdb_partitioned_merge_ctx_t; + +static int tidesdb_partitioned_merge_partition(void *vctx, int partition); + +static int tidesdb_partitioned_merge(tidesdb_column_family_t *cf, const int start_level, + const int end_level) +{ + if (tidesdb_cf_abort_requested(cf)) return TDB_SUCCESS; + + int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + /* we convert 1-indexed level numbers to 0-indexed array indices */ + int start_idx = start_level - 1; + int end_idx = end_level - 1; + + if (start_idx < 0 || end_idx >= num_levels) + { + return TDB_ERR_INVALID_ARGS; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Starting partitioned merge CF '%s', levels %d->%d (array indices %d->%d)", + cf->name, start_level, end_level, start_idx, end_idx); + + tidesdb_level_t *largest = cf->levels[num_levels - 1]; + + /* we get file boundaries from largest level */ + tidesdb_sstable_t **largest_sstables = + atomic_load_explicit(&largest->sstables, memory_order_acquire); + int num_partitions = atomic_load_explicit(&largest->num_sstables, memory_order_acquire); + + /* we check if largest level is empty before collecting sstables */ + if (num_partitions == 0) + { + /* the largest level is empty, thus we fall back to full preemptive merge. + * we dont collect sstables since we're not doing partitioned merge. 
+ * tidesdb_full_preemptive_merge expects 0-indexed array indices, not 1-indexed level + * numbers */ + + return tidesdb_full_preemptive_merge(cf, start_idx, end_idx, end_idx); + } + + queue_t *sstables_to_delete = queue_new(); + if (!sstables_to_delete) return TDB_ERR_MEMORY; + + queue_t *sstable_ids_snapshot = tidesdb_snapshot_sst_ids(cf, start_idx, end_idx); + if (!sstable_ids_snapshot) + { + queue_free(sstables_to_delete); + return TDB_ERR_MEMORY; + } + + tidesdb_sstable_t **ssts_array = NULL; + int sst_count = 0; + int collect_result = tidesdb_collect_ssts_from_snapshot( + cf, start_idx, end_idx, sstable_ids_snapshot, &ssts_array, &sst_count); + if (collect_result != TDB_SUCCESS) + { + queue_free(sstables_to_delete); + tidesdb_cleanup_snapshot_ids(sstable_ids_snapshot); + return collect_result; + } + + /* we prefetch input sstables before partition loop */ + if (cf->db->object_store && cf->config.object_prefetch_compaction) + { + tdb_objstore_prefetch_sstables(cf->db, ssts_array, sst_count); + } + + uint8_t **boundaries = malloc(num_partitions * sizeof(uint8_t *)); + size_t *boundary_sizes = malloc(num_partitions * sizeof(size_t)); + + for (int i = 0; i < num_partitions; i++) + { + /* we check for null as concurrent compactions may have removed sstables */ + if (!largest_sstables[i]) + { + boundaries[i] = NULL; + boundary_sizes[i] = 0; + continue; + } + + boundaries[i] = malloc(largest_sstables[i]->min_key_size); + boundary_sizes[i] = largest_sstables[i]->min_key_size; + if (largest_sstables[i]->min_key && boundary_sizes[i] > 0) + { + memcpy(boundaries[i], largest_sstables[i]->min_key, boundary_sizes[i]); + } + } + + /**** spooky paper algorithm 2 -- when merging into the largest level, + *** cap output sstable size at file_max = C_X (capacity of the dividing level). + ** this bounds transient space-amp to 1/T. when not targeting the largest level, + * file_max is 0 which disables splitting. 
*/ + const int targeting_largest = (end_idx == num_levels - 1); + size_t file_max = 0; + if (targeting_largest && start_idx >= 0 && start_idx < num_levels) + { + file_max = atomic_load_explicit(&cf->levels[start_idx]->capacity, memory_order_acquire); + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Partitioned merge targeting largest level, file_max=%zu (C_X at level %d)", + file_max, start_idx + 1); + } + + /* spooky paper 4.3 -- skew optimization. a partition whose largest-level + * file has no overlapping data at the upper merge levels would just be + * rewritten identically. we mark such partitions so the merge leaves their + * largest-level file untouched, avoiding write-amp on cold key ranges. the + * id snapshot above was taken first, so any sstable added to an upper level + * after this scan is absent from sstables_to_delete and cannot be lost. a + * NULL array (alloc failure) just disables the optimization. */ + int *partition_skipped = calloc(num_partitions, sizeof(int)); + int skipped_any = 0; + if (partition_skipped && targeting_largest && start_idx < end_idx) + { + skip_list_comparator_fn skew_cmp = NULL; + void *skew_cmp_ctx = NULL; + tidesdb_resolve_comparator(cf->db, &cf->config, &skew_cmp, &skew_cmp_ctx); + + for (int p = 0; p < num_partitions; p++) + { + if (!boundaries[p]) continue; + partition_skipped[p] = 1; /* skippable until an overlapping upper file is found */ + + /* partition 0 covers everything below boundaries[1] */ + uint8_t *r_start = (p > 0) ? boundaries[p] : NULL; + size_t r_start_sz = (p > 0) ? boundary_sizes[p] : 0; + uint8_t *r_end = (p + 1 < num_partitions) ? boundaries[p + 1] : NULL; + size_t r_end_sz = (p + 1 < num_partitions) ? 
boundary_sizes[p + 1] : 0; + + for (int lv = start_idx; lv < end_idx && partition_skipped[p]; lv++) + { + tidesdb_level_t *lvl = cf->levels[lv]; + atomic_fetch_add_explicit(&lvl->array_readers, 1, memory_order_acq_rel); + int n = atomic_load_explicit(&lvl->num_sstables, memory_order_acquire); + tidesdb_sstable_t **ssts = + atomic_load_explicit(&lvl->sstables, memory_order_acquire); + for (int i = 0; i < n; i++) + { + tidesdb_sstable_t *s = ssts[i]; + if (!s) continue; + if (r_start && skew_cmp(s->max_key, s->max_key_size, r_start, r_start_sz, + skew_cmp_ctx) < 0) + continue; /* s entirely before partition */ + if (r_end && + skew_cmp(s->min_key, s->min_key_size, r_end, r_end_sz, skew_cmp_ctx) >= 0) + continue; /* s entirely after partition */ + partition_skipped[p] = 0; /* overlapping newer data -- must merge */ + break; + } + atomic_fetch_sub_explicit(&lvl->array_readers, 1, memory_order_release); + } + if (partition_skipped[p]) skipped_any = 1; + } + } + + /* a skipped partition's largest-level file is left untouched, so it must not + * flow through sstables_to_delete. release the collect reference for those; + * every other input sstable is queued for removal after the merge. 
*/ + for (int i = 0; i < sst_count; i++) + { + tidesdb_sstable_t *s = ssts_array[i]; + int skewed_skip = 0; + if (partition_skipped) + { + for (int p = 0; p < num_partitions; p++) + { + if (partition_skipped[p] && largest_sstables[p] == s) + { + skewed_skip = 1; + break; + } + } + } + if (skewed_skip) + tidesdb_sstable_unref(cf->db, s); + else + queue_enqueue(sstables_to_delete, s); + } + free(ssts_array); + + int aborted = 0; + + tidesdb_partitioned_merge_ctx_t pctx; + pctx.cf = cf; + pctx.start_idx = start_idx; + pctx.end_idx = end_idx; + pctx.end_level = end_level; + pctx.num_partitions = num_partitions; + pctx.boundaries = boundaries; + pctx.boundary_sizes = boundary_sizes; + pctx.partition_skipped = partition_skipped; + pctx.file_max = file_max; + pctx.targeting_largest = targeting_largest; + atomic_init(&pctx.aborted, 0); + + /* run the partition sub-merges across the sub-compaction helper pool (calling thread works + * too); each partition commits its output(s) under cf->compaction_commit_lock */ + tidesdb_run_subcompactions(cf->db, &pctx, tidesdb_partitioned_merge_partition, num_partitions); + + if (atomic_load_explicit(&pctx.aborted, memory_order_acquire)) aborted = 1; + + if (aborted) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' aborting partitioned merge", cf->name); + while (!queue_is_empty(sstables_to_delete)) + { + tidesdb_sstable_t *sst = queue_dequeue(sstables_to_delete); + if (sst) tidesdb_sstable_unref(cf->db, sst); + } + queue_free(sstables_to_delete); + tidesdb_cleanup_snapshot_ids(sstable_ids_snapshot); + for (int i = 0; i < num_partitions; i++) + { + free(boundaries[i]); + } + free(boundaries); + free(boundary_sizes); + free(partition_skipped); + return TDB_SUCCESS; + } + + tidesdb_cleanup_merged_sstables(cf, sstables_to_delete, start_idx, end_idx); + queue_free(sstables_to_delete); + tidesdb_cleanup_snapshot_ids(sstable_ids_snapshot); + + /* the skew optimization can leave the largest level out of key order + * (skipped files keep their old 
slots while merged partitions append) -- + * restore the ascending-min_key order the next partitioned merge relies on + * when it derives partition boundaries from this level */ + if (skipped_any) + { + skip_list_comparator_fn sort_cmp = NULL; + void *sort_cmp_ctx = NULL; + tidesdb_resolve_comparator(cf->db, &cf->config, &sort_cmp, &sort_cmp_ctx); + if (sort_cmp && tidesdb_level_sort_by_min_key(cf->db, cf->levels[end_idx], sort_cmp, + sort_cmp_ctx) != TDB_SUCCESS) + { + /* the largest level is left unsorted -- the next partitioned merge will derive + * boundaries from an out-of-order array. not fatal to this merge (sstables are + * already committed), but surface it. */ + TDB_DEBUG_LOG(TDB_LOG_WARN, + "CF '%s' failed to re-sort level %d by min_key after partitioned merge " + "(out of memory); next merge's partition boundaries may be skewed", + cf->name, cf->levels[end_idx]->level_num); + } + } + + for (int i = 0; i < num_partitions; i++) + { + free(boundaries[i]); + } + free(boundaries); + free(boundary_sizes); + free(partition_skipped); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Partitioned merge complete for CF '%s', processed %d partitions", + cf->name, num_partitions); + + return TDB_SUCCESS; +} + +/** + * tidesdb_partitioned_merge_partition + * one partition's sub-merge for tidesdb_partitioned_merge (see ctx doc above). body is the + * original serial iteration wrapped in do/while(0). 
+ */ +static int tidesdb_partitioned_merge_partition(void *vctx, int partition) +{ + tidesdb_partitioned_merge_ctx_t *c = (tidesdb_partitioned_merge_ctx_t *)vctx; + tidesdb_column_family_t *cf = c->cf; + const int start_idx = c->start_idx; + const int end_idx = c->end_idx; + const int end_level = c->end_level; + const int num_partitions = c->num_partitions; + uint8_t **boundaries = c->boundaries; + size_t *boundary_sizes = c->boundary_sizes; + int *partition_skipped = c->partition_skipped; + const size_t file_max = c->file_max; + const int targeting_largest = c->targeting_largest; + int aborted = 0; + + do + { + if (tidesdb_cf_abort_requested(cf)) + { + aborted = 1; + break; + } + + /* spooky 4.3 skew -- this partition's largest-level file has no + * overlapping newer data, so merging it would just rewrite it + * identically. leave it in place. */ + if (partition_skipped && partition_skipped[partition]) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Partition %d/%d skipped (skew optimization -- no overlapping newer " + "data)", + partition + 1, num_partitions); + continue; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Processing partition %d/%d", partition + 1, num_partitions); + + skip_list_comparator_fn comparator_fn = NULL; + void *comparator_ctx = NULL; + tidesdb_resolve_comparator(cf->db, &cf->config, &comparator_fn, &comparator_ctx); + + tidesdb_merge_heap_t *heap = tidesdb_merge_heap_create(comparator_fn, comparator_ctx); + if (!heap) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to create merge heap for partition %d", partition); + continue; + } + + /* partition 0 extends down to -infinity so merge-input keys below the + * largest level's minimum are not dropped -- matches dividing_merge */ + uint8_t *range_start = (partition > 0) ? boundaries[partition] : NULL; + size_t range_start_size = (partition > 0) ? boundary_sizes[partition] : 0; + uint8_t *range_end = (partition + 1 < num_partitions) ? 
boundaries[partition + 1] : NULL; + size_t range_end_size = + (partition + 1 < num_partitions) ? boundary_sizes[partition + 1] : 0; + + /* we add overlapping ssts as sources and calculate estimated entries */ + uint64_t estimated_entries = 0; + + /* we reload levels for each partition */ + for (int level_idx = start_idx; level_idx <= end_idx; level_idx++) + { + tidesdb_level_t *lvl = cf->levels[level_idx]; + + /* we hold array_readers to prevent retire_array from freeing the array + * while we iterate -- a concurrent flush on L1 can swap the array */ + atomic_fetch_add_explicit(&lvl->array_readers, 1, memory_order_acq_rel); + + int num_ssts = atomic_load_explicit(&lvl->num_sstables, memory_order_acquire); + tidesdb_sstable_t **sstables = + atomic_load_explicit(&lvl->sstables, memory_order_acquire); + + for (int i = 0; i < num_ssts; i++) + { + tidesdb_sstable_t *sst = sstables[i]; + /* we check for null as concurrent compactions may have removed sstables */ + if (!sst) continue; + + int overlaps = 1; + + if (range_start && comparator_fn(sst->max_key, sst->max_key_size, range_start, + range_start_size, comparator_ctx) < 0) + { + overlaps = 0; + } + + if (range_end && comparator_fn(sst->min_key, sst->min_key_size, range_end, + range_end_size, comparator_ctx) >= 0) + { + overlaps = 0; + } + + if (overlaps) + { + /* tidesdb_merge_source_from_sstable takes its own reference */ + tidesdb_merge_source_t *source = tidesdb_merge_source_from_sstable(cf->db, sst); + if (source) + { + if (tidesdb_merge_heap_add_source(heap, source) == TDB_SUCCESS) + { + estimated_entries += sst->num_entries; + } + else + { + /* failed to add source to heap, free it to prevent leak */ + tidesdb_merge_source_free(source); + } + } + /* if merge source creation failed, no reference was taken, nothing to clean up + */ + } + /* if sstable doesnt overlap, we dont need to do anything */ + } + + atomic_fetch_sub_explicit(&lvl->array_readers, 1, memory_order_release); + } + + if (estimated_entries < 
TDB_MERGE_MIN_ESTIMATED_ENTRIES) + estimated_entries = TDB_MERGE_MIN_ESTIMATED_ENTRIES; + + /* we create output sst for this partition. end_level is already a + * 1-indexed level number, so the filename uses it directly -- it must + * match the level the finalizer records in the manifest, or recovery + * will see a file at a level the manifest does not know and delete it */ + uint64_t new_id = atomic_fetch_add(&cf->next_sstable_id, 1); + char path[MAX_FILE_PATH_LENGTH]; + snprintf(path, sizeof(path), + "%s" PATH_SEPARATOR TDB_LEVEL_PREFIX "%d" TDB_LEVEL_PARTITION_PREFIX "%d", + cf->directory, end_level, partition); + + tidesdb_sstable_t *new_sst = tidesdb_sstable_create(cf->db, path, new_id, &cf->config); + if (new_sst) + { + block_manager_t *klog_bm = NULL; + block_manager_t *vlog_bm = NULL; + + /* open the partition's output sstable. on failure (e.g. EMFILE under fd pressure) we + * MUST NOT proceed -- the merge loop below writes through klog_bm/vlog_bm and would + * dereference a NULL block manager. abort the merge instead; the aborted path preserves + * the source sstables, so no data is lost and compaction retries later. routed through + * tidesdb_bm_open so a transient fd spike gets a reaper-assisted retry first. */ + if (tidesdb_bm_open(cf->db, &klog_bm, new_sst->klog_path, + convert_sync_mode(cf->config.sync_mode == TDB_SYNC_INTERVAL + ? TDB_SYNC_FULL + : cf->config.sync_mode)) != 0 || + tidesdb_bm_open(cf->db, &vlog_bm, new_sst->vlog_path, + convert_sync_mode(cf->config.sync_mode == TDB_SYNC_INTERVAL + ? 
TDB_SYNC_FULL + : cf->config.sync_mode)) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "CF '%s' partitioned merge failed to open output sstable for " + "partition %d: %s -- aborting (sources preserved)", + cf->name, partition, strerror(errno)); + if (klog_bm) block_manager_close(klog_bm); + if (vlog_bm) block_manager_close(vlog_bm); + tidesdb_sstable_unref(cf->db, new_sst); + tidesdb_merge_heap_free(heap); + aborted = 1; + break; + } + + bloom_filter_t *bloom = NULL; + tidesdb_block_index_t *block_indexes = NULL; + + if (cf->config.enable_bloom_filter) + { + if (bloom_filter_new(&bloom, cf->config.bloom_fpr, (int)estimated_entries) == 0) + { + TDB_DEBUG_LOG( + TDB_LOG_INFO, + "Partitioned merge partition %d bloom filter created (estimated entries: " + "%" PRIu64 ")", + partition, estimated_entries); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Partitioned merge partition %d bloom filter creation failed", + partition); + bloom = NULL; + } + } + + if (cf->config.enable_block_indexes && !cf->config.use_btree) + { + /* we reuse comparator_fn and comparator_ctx from outer scope */ + block_indexes = + compact_block_index_create(estimated_entries, cf->config.block_index_prefix_len, + comparator_fn, comparator_ctx); + } + + /* btree output. 
is_largest_level mirrors the non-btree branch + * below so a partition that targets L can still reap tombstones */ + if (cf->config.use_btree) + { + int btree_result = tidesdb_sstable_write_from_heap_btree( + cf, new_sst, heap, klog_bm, vlog_bm, bloom, NULL, targeting_largest); + block_manager_close(klog_bm); + block_manager_close(vlog_bm); + tidesdb_merge_heap_free(heap); + + bloom = NULL; + + if (btree_result != TDB_SUCCESS || new_sst->num_entries == 0) + { + if (new_sst->num_entries == 0) + { + remove(new_sst->klog_path); + remove(new_sst->vlog_path); + } + tidesdb_sstable_unref(cf->db, new_sst); + continue; + } + + /* we add the btree sstable to target level (commit serialized across partitions) */ + pthread_mutex_lock(&cf->compaction_commit_lock); + tidesdb_level_add_sstable(cf->levels[end_idx], new_sst); + tidesdb_bump_sstable_layout_version(cf); + tidesdb_manifest_add_sstable(cf->manifest, cf->levels[end_idx]->level_num, + new_sst->id, new_sst->num_entries, + new_sst->klog_size + new_sst->vlog_size); + pthread_mutex_unlock(&cf->compaction_commit_lock); + tidesdb_sstable_unref(cf->db, new_sst); + continue; + } + + /* we merge and write entries in partition range */ + tidesdb_klog_block_t *klog_block = tidesdb_klog_block_create(); + uint64_t entry_count = 0; + uint64_t tombstone_count = 0; + uint64_t klog_block_num = 0; + uint64_t vlog_block_num = 0; + uint64_t max_seq = 0; + uint8_t *first_key = NULL; + size_t first_key_size = 0; + uint8_t *last_key = NULL; + size_t last_key_size = 0; + + /* we track first and last key of current block for block index */ + uint8_t *block_first_key = NULL; + size_t block_first_key_size = 0; + uint8_t *block_last_key = NULL; + size_t block_last_key_size = 0; + + /* we track last key for duplicate detection */ + uint8_t *last_seen_key = NULL; + size_t last_seen_key_size = 0; + + while (!tidesdb_merge_heap_empty(heap)) + { + tidesdb_kv_pair_t *kv = tidesdb_merge_heap_pop(heap, NULL); + if (!kv) break; + + skip_list_comparator_fn 
cmp_fn = NULL; + void *cmp_ctx = NULL; + tidesdb_resolve_comparator(cf->db, &cf->config, &cmp_fn, &cmp_ctx); + + /* we check if key is in partition range */ + if (range_start && + cmp_fn(kv->key, kv->entry.key_size, range_start, range_start_size, cmp_ctx) < 0) + { + tidesdb_kv_pair_free(kv); + continue; + } + + if (range_end && + cmp_fn(kv->key, kv->entry.key_size, range_end, range_end_size, cmp_ctx) >= 0) + { + tidesdb_kv_pair_free(kv); + break; + } + + /* we skip duplicate keys (keep newest based on seq) */ + if (last_seen_key && last_seen_key_size == kv->entry.key_size && + memcmp(last_seen_key, kv->key, last_seen_key_size) == 0) + { + tidesdb_kv_pair_free(kv); + continue; + } + + /* we update last seen key for duplicate detection */ + free(last_seen_key); + last_seen_key = malloc(kv->entry.key_size); + if (last_seen_key) + { + memcpy(last_seen_key, kv->key, kv->entry.key_size); + last_seen_key_size = kv->entry.key_size; + } + + /* single-delete pair-cancel if kv is a single-delete and the + * next entry still on the heap is a live put for the same key, + * both can be dropped together. we peek the heap's top source + * instead of restructuring this loop with a one-step buffer + * because this path has a mid-loop sstable-split on file_max + * that is awkward to reorder. the same-key dedup below then + * sweeps the paired put on the next iteration. */ + if ((kv->entry.flags & TDB_KV_FLAG_SINGLE_DELETE) && + !tidesdb_merge_heap_empty(heap)) + { + const tidesdb_kv_pair_t *peek = heap->sources[0]->current_kv; + if (peek && peek->entry.key_size == kv->entry.key_size && + memcmp(peek->key, kv->key, kv->entry.key_size) == 0 && + !(peek->entry.flags & TDB_KV_FLAG_TOMBSTONE)) + { + tidesdb_kv_pair_free(kv); + continue; + } + } + + /* reap a plain tombstone only when this partition merges into + * the largest level -- nothing older exists below it then. + * when targeting a shallower level tombstones must survive. 
*/ + if (targeting_largest && (kv->entry.flags & TDB_KV_FLAG_TOMBSTONE)) + { + tidesdb_kv_pair_free(kv); + continue; + } + + if (kv->entry.ttl > 0 && + kv->entry.ttl < + atomic_load_explicit(&cf->db->cached_current_time, memory_order_relaxed)) + { + tidesdb_kv_pair_free(kv); + continue; + } + + if (!first_key) + { + first_key = malloc(kv->entry.key_size); + if (first_key) + { + memcpy(first_key, kv->key, kv->entry.key_size); + first_key_size = kv->entry.key_size; + } + } + + if (last_key) free(last_key); + last_key = malloc(kv->entry.key_size); + if (last_key) + { + memcpy(last_key, kv->key, kv->entry.key_size); + last_key_size = kv->entry.key_size; + } + + if (kv->entry.value_size >= cf->config.klog_value_threshold && kv->value) + { + uint8_t *final_data = kv->value; + size_t final_size = kv->entry.value_size; + uint8_t *compressed = NULL; + + if (cf->config.compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + compressed = + compress_data(kv->value, kv->entry.value_size, &compressed_size, + cf->config.compression_algorithm); + if (compressed) + { + final_data = compressed; + final_size = compressed_size; + } + } + + block_manager_block_t *vblock = + block_manager_block_create(final_size, final_data); + if (vblock) + { + int64_t block_offset = block_manager_block_write(vlog_bm, vblock); + if (block_offset >= 0) + { + kv->entry.vlog_offset = (uint64_t)block_offset; + vlog_block_num++; + } + block_manager_block_release(vblock); + } + free(compressed); + } + + if (bloom) + { + bloom_filter_add(bloom, kv->key, kv->entry.key_size); + } + + /* we check if this is first entry in a new block (before adding) */ + int is_first_entry_in_block = (klog_block->num_entries == 0); + + tidesdb_klog_block_add_entry(klog_block, kv, &cf->config, comparator_fn, + comparator_ctx); + + /* we track first key of block */ + if (is_first_entry_in_block) + { + free(block_first_key); + block_first_key = malloc(kv->entry.key_size); + if (block_first_key) + { + 
memcpy(block_first_key, kv->key, kv->entry.key_size); + block_first_key_size = kv->entry.key_size; + } + } + + /* we always update last key of block */ + free(block_last_key); + block_last_key = malloc(kv->entry.key_size); + if (block_last_key) + { + memcpy(block_last_key, kv->key, kv->entry.key_size); + block_last_key_size = kv->entry.key_size; + } + + /** we track maximum sequence number */ + if (kv->entry.seq > max_seq) + { + max_seq = kv->entry.seq; + } + + entry_count++; + if (kv->entry.flags & TDB_KV_FLAG_TOMBSTONE) tombstone_count++; + + if (tidesdb_klog_block_is_full(klog_block, TDB_KLOG_BLOCK_SIZE)) + { + uint8_t *klog_data; + size_t klog_size; + if (tidesdb_klog_block_serialize(klog_block, &klog_data, &klog_size) == 0) + { + uint8_t *final_data = klog_data; + size_t final_size = klog_size; + + if (cf->config.compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + uint8_t *compressed = + compress_data(klog_data, klog_size, &compressed_size, + cf->config.compression_algorithm); + if (compressed) + { + free(klog_data); + final_data = compressed; + final_size = compressed_size; + } + } + + block_manager_block_t *block = + block_manager_block_create(final_size, final_data); + if (block) + { + /* we capture file position before writing the block */ + uint64_t block_file_position = atomic_load(&klog_bm->current_file_size); + + block_manager_block_write(klog_bm, block); + block_manager_block_release(block); + + /* we add completed block to index after writing with file position */ + if (block_indexes && block_first_key && block_last_key) + { + /* we sample every Nth block (ratio validated to be >= 1) */ + if (klog_block_num % cf->config.index_sample_ratio == 0) + { + compact_block_index_add( + block_indexes, block_first_key, block_first_key_size, + block_last_key, block_last_key_size, block_file_position); + } + } + + klog_block_num++; + } + free(final_data); + } + tidesdb_klog_block_free(klog_block); + klog_block = 
tidesdb_klog_block_create(); + + /* we reset block tracking for new block */ + free(block_first_key); + free(block_last_key); + block_first_key = NULL; + block_last_key = NULL; + + /*** spooky file_max splits if output exceeds C_X, finalize this + ** sstable and start a new one within the same partition. + * per algorithm 2 of the spooky paper. */ + if (file_max > 0 && entry_count > 0) + { + uint64_t current_klog_size = atomic_load(&klog_bm->current_file_size); + if (current_klog_size >= file_max) + { + /* we assign min/max keys to current sst before finalizing */ + new_sst->min_key = first_key; + new_sst->min_key_size = first_key_size; + new_sst->max_key = last_key; + new_sst->max_key_size = last_key_size; + first_key = NULL; + last_key = NULL; + + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Partition %d SSTable %" PRIu64 + " reached file_max (%zu >= %zu), splitting", + partition, new_sst->id, (size_t)current_klog_size, + file_max); + + tdb_partitioned_merge_finalize_sst( + cf, new_sst, klog_bm, vlog_bm, bloom, block_indexes, entry_count, + tombstone_count, klog_block_num, vlog_block_num, max_seq, end_level, + partition); + + /* we create replacement sst for remaining entries in this partition */ + uint64_t split_id = atomic_fetch_add(&cf->next_sstable_id, 1); + char split_path[MAX_FILE_PATH_LENGTH]; + /* end_level is 1-indexed -- filename uses it directly + * so it matches the manifest level (see above) */ + snprintf(split_path, sizeof(split_path), + "%s" PATH_SEPARATOR TDB_LEVEL_PREFIX + "%d" TDB_LEVEL_PARTITION_PREFIX "%d", + cf->directory, end_level, partition); + + new_sst = + tidesdb_sstable_create(cf->db, split_path, split_id, &cf->config); + if (!new_sst) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Partition %d failed to create split SSTable", + partition); + /* we drain remaining heap entries for this partition */ + while (!tidesdb_merge_heap_empty(heap)) + { + tidesdb_kv_pair_t *drain = tidesdb_merge_heap_pop(heap, NULL); + if (drain) + tidesdb_kv_pair_free(drain); + else 
+ break; + } + /* the prior split was already finalized (it consumed klog_bm, + * vlog_bm, bloom, block_indexes), so NULL them before the post-loop + * finalize guard runs -- otherwise it would double-free/close them. + * abort so the sources are preserved (no data loss; retried). */ + klog_bm = NULL; + vlog_bm = NULL; + bloom = NULL; + block_indexes = NULL; + aborted = 1; + break; + } + + klog_bm = NULL; + vlog_bm = NULL; + /* open the split (continuation) output. same hazard as the partition's + * first output, on failure we must not write through a NULL block + * manager. abort cleanly -- the previous split was already finalized, + * and the aborted path preserves the sources, so reads still find every + * key (dedup by seq) and compaction retries. the post-loop finalize is + * guarded on new_sst/klog_bm so the NULL'd state below is never used. + */ + if (tidesdb_bm_open( + cf->db, &klog_bm, new_sst->klog_path, + convert_sync_mode(cf->config.sync_mode == TDB_SYNC_INTERVAL + ? TDB_SYNC_FULL + : cf->config.sync_mode)) != 0 || + tidesdb_bm_open( + cf->db, &vlog_bm, new_sst->vlog_path, + convert_sync_mode(cf->config.sync_mode == TDB_SYNC_INTERVAL + ? 
TDB_SYNC_FULL + : cf->config.sync_mode)) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "CF '%s' partitioned merge failed to open split " + "output for partition %d: %s -- aborting", + cf->name, partition, strerror(errno)); + if (klog_bm) block_manager_close(klog_bm); + if (vlog_bm) block_manager_close(vlog_bm); + tidesdb_sstable_unref(cf->db, new_sst); + new_sst = NULL; + klog_bm = NULL; + vlog_bm = NULL; + bloom = + NULL; /* consumed by the prior finalize -- don't reuse/free */ + block_indexes = NULL; + aborted = 1; + break; + } + + bloom = NULL; + block_indexes = NULL; + if (cf->config.enable_bloom_filter) + { + /* bloom_filter_new nulls bloom on failure + * (see contract in src/bloom_filter.c), so a + * miss here leaves bloom NULL and the merge + * loop skips bloom_filter_add */ + if (bloom_filter_new(&bloom, cf->config.bloom_fpr, + (int)estimated_entries) != 0) + { + TDB_DEBUG_LOG( + TDB_LOG_WARN, + "Partitioned merge partition %d bloom_filter_new " + "failed on file_max split (estimated_entries=%" PRIu64 + "), continuing without bloom for this split sstable", + partition, estimated_entries); + } + } + if (cf->config.enable_block_indexes && !cf->config.use_btree) + { + block_indexes = compact_block_index_create( + estimated_entries, cf->config.block_index_prefix_len, + comparator_fn, comparator_ctx); + } + + /* we reset per-sst counters */ + entry_count = 0; + tombstone_count = 0; + klog_block_num = 0; + vlog_block_num = 0; + max_seq = 0; + first_key = NULL; + first_key_size = 0; + last_key = NULL; + last_key_size = 0; + } + } + } + + tidesdb_kv_pair_free(kv); + } + + /* we clean up duplicate detection tracking */ + free(last_seen_key); + + /* we write remaining block -- skipped when an output open aborted the merge (new_sst or + * klog_bm NULL), since writing through a NULL block manager would crash and the sources + * are being preserved anyway */ + if (klog_block->num_entries > 0 && new_sst && klog_bm) + { + uint8_t *klog_data; + size_t klog_size; + if 
(tidesdb_klog_block_serialize(klog_block, &klog_data, &klog_size) == 0) + { + uint8_t *final_data = klog_data; + size_t final_size = klog_size; + + if (new_sst->config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t compressed_size; + uint8_t *compressed = compress_data(klog_data, klog_size, &compressed_size, + new_sst->config->compression_algorithm); + if (compressed) + { + free(klog_data); + final_data = compressed; + final_size = compressed_size; + } + } + + block_manager_block_t *block = + block_manager_block_create(final_size, final_data); + if (block) + { + uint64_t block_file_position = atomic_load(&klog_bm->current_file_size); + block_manager_block_write(klog_bm, block); + block_manager_block_release(block); + + if (block_indexes && block_first_key && block_last_key) + { + if (klog_block_num % cf->config.index_sample_ratio == 0) + { + compact_block_index_add(block_indexes, block_first_key, + block_first_key_size, block_last_key, + block_last_key_size, block_file_position); + } + } + + klog_block_num++; + } + free(final_data); + } + } + + tidesdb_klog_block_free(klog_block); + free(block_first_key); + free(block_last_key); + + /* we assign min/max keys and finalize via helper -- unless an output open aborted the + * merge, in which case we must not finalize through a NULL block manager. release this + * partition's still-owned resources and leave the sources intact (aborted path below). 
+ */ + if (new_sst && klog_bm && vlog_bm) + { + new_sst->min_key = first_key; + new_sst->min_key_size = first_key_size; + new_sst->max_key = last_key; + new_sst->max_key_size = last_key_size; + + tdb_partitioned_merge_finalize_sst( + cf, new_sst, klog_bm, vlog_bm, bloom, block_indexes, entry_count, + tombstone_count, klog_block_num, vlog_block_num, max_seq, end_level, partition); + } + else + { + free(first_key); + free(last_key); + if (bloom) bloom_filter_free(bloom); + if (block_indexes) compact_block_index_free(block_indexes); + if (klog_bm) block_manager_close(klog_bm); + if (vlog_bm) block_manager_close(vlog_bm); + if (new_sst) tidesdb_sstable_unref(cf->db, new_sst); + aborted = 1; + } + } + + tidesdb_merge_heap_free(heap); + } while (0); + + if (aborted) atomic_store_explicit(&c->aborted, 1, memory_order_release); + return TDB_SUCCESS; +} + +/** + * tidesdb_cf_dense_tombstone_witness + * walks every level looking for an sstable whose tombstone density is at or above + * the configured trigger ratio. on a hit we record the offending sstable's level + * and density via out-parameters so the caller can log specific context, and -- + * when out_min_key/out_max_key are supplied -- a malloc'd copy of the witness + * sstable's key range so the caller can steer a targeted merge at it. we return + * early on the first hit. sstables with TDB_TOMBSTONE_COUNT_UNKNOWN (legacy + * footers without SSTABLE_FLAG_TOMBSTONE_COUNT) or fewer than min_entries are + * skipped -- we don't escalate on guesses or on sstables too small for the + * ratio to be meaningful. 
+ * + * @param cf the column family + * @param threshold density ratio in (0.0, 1.0] + * @param min_entries minimum sstable entry count for density to count + * @param out_level optional, set to the 1-based level number of the witness on hit + * @param out_density optional, set to the witness sstable's density on hit + * @param out_min_key optional, set to a malloc'd copy of the witness min key on hit + * (caller frees); paired with out_min_key_size + * @param out_min_key_size optional, set to the witness min key size on hit + * @param out_max_key optional, set to a malloc'd copy of the witness max key on hit + * (caller frees); paired with out_max_key_size + * @param out_max_key_size optional, set to the witness max key size on hit + * @return 1 if any sstable meets or exceeds the threshold, 0 otherwise + */ +static int tidesdb_cf_dense_tombstone_witness(tidesdb_column_family_t *cf, double threshold, + uint64_t min_entries, int *out_level, + double *out_density, uint8_t **out_min_key, + size_t *out_min_key_size, uint8_t **out_max_key, + size_t *out_max_key_size) +{ + if (threshold <= 0.0) return 0; + + const int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + for (int lv = 0; lv < num_levels; lv++) + { + tidesdb_level_t *lvl = cf->levels[lv]; + if (!lvl) continue; + + atomic_fetch_add_explicit(&lvl->array_readers, 1, memory_order_acq_rel); + + const int num_ssts = atomic_load_explicit(&lvl->num_sstables, memory_order_acquire); + tidesdb_sstable_t **ssts = atomic_load_explicit(&lvl->sstables, memory_order_acquire); + + int hit = 0; + double witness_density = 0.0; + uint8_t *witness_min = NULL, *witness_max = NULL; + size_t witness_min_size = 0, witness_max_size = 0; + for (int i = 0; ssts && i < num_ssts; i++) + { + tidesdb_sstable_t *sst = ssts[i]; + if (!sst) continue; + if (sst->tombstone_count == TDB_TOMBSTONE_COUNT_UNKNOWN) continue; + if (sst->num_entries < min_entries) continue; + + /* fp multiply rather than divide -- one 
mul per sstable, identical + * semantics, no zero-divide branch */ + const double bound = (double)sst->num_entries * threshold; + if ((double)sst->tombstone_count >= bound) + { + hit = 1; + witness_density = (double)sst->tombstone_count / (double)sst->num_entries; + /* copy the key range while we still hold array_readers on the + * level so the sstable cannot be retired from under us */ + if (out_min_key && sst->min_key && sst->min_key_size > 0) + { + witness_min = malloc(sst->min_key_size); + if (witness_min) + { + memcpy(witness_min, sst->min_key, sst->min_key_size); + witness_min_size = sst->min_key_size; + } + } + if (out_max_key && sst->max_key && sst->max_key_size > 0) + { + witness_max = malloc(sst->max_key_size); + if (witness_max) + { + memcpy(witness_max, sst->max_key, sst->max_key_size); + witness_max_size = sst->max_key_size; + } + } + break; + } + } + + atomic_fetch_sub_explicit(&lvl->array_readers, 1, memory_order_release); + if (hit) + { + if (out_level) *out_level = lv + 1; + if (out_density) *out_density = witness_density; + if (out_min_key) *out_min_key = witness_min; + if (out_min_key_size) *out_min_key_size = witness_min_size; + if (out_max_key) *out_max_key = witness_max; + if (out_max_key_size) *out_max_key_size = witness_max_size; + return 1; + } + } + return 0; +} + +/** + * tidesdb_trigger_compaction + * trigger compaction for a column family using the spooky algorithm + * + * spooky implementation notes + * -- we implement the generalized spooky algorithm (section 4.2 of the paper) + * -- parameter X (dividing level) is configurable via dividing_level_offset + * -- we perform full preemptive merge at levels 1 to X-1 (array indices 0 to X-2) + * -- we perform dividing merge into level X (partitioned by largest level boundaries) + * -- we perform partitioned preemptive merge at levels X to L when level X is full + * -- we use spooky algo 2 to find target levels (smallest level that cannot accommodate) + * + * key differences from paper: + * 
-- we use 0-based array indexing (paper uses 1-based level numbering)
+ * -- level 0 is memtable in paper, but we treat level 1 (array index 0) as first disk level
+ *
+ * @param cf the column family
+ * @return TDB_SUCCESS on success, error code on failure
+ */
+int tidesdb_trigger_compaction(tidesdb_column_family_t *cf, int full_compaction)
+{
+    /* we check if CF is marked for deletion before doing any work */
+    if (atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire))
+    {
+        return TDB_SUCCESS;
+    }
+
+    int expected = 0; /* CAS 0 -> 1 below claims the per-CF is_compacting flag */
+    if (!atomic_compare_exchange_strong_explicit(&cf->is_compacting, &expected, 1,
+                                                 memory_order_acquire, memory_order_relaxed))
+    {
+        /* another compaction is already running. callers that care (the
+         * compaction worker on a blocking work item) requeue; callers that do
+         * not (the legacy direct paths) treat this as a coalesced skip */
+        return TDB_ERR_LOCKED;
+    }
+
+    /* we check again after acquiring is_compacting in case drop happened between checks.
+     * from here on every return path must reset is_compacting to 0 */
+    if (atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire))
+    {
+        atomic_store_explicit(&cf->is_compacting, 0, memory_order_release);
+        return TDB_SUCCESS;
+    }
+
+    /* we update cached_current_time to ensure TTL checks during compaction use fresh time
+     * this prevents race conditions where stale cached time causes expired keys to not be filtered
+     */
+    atomic_store(&cf->db->cached_current_time, tdb_get_current_time());
+
+    /* we force flush memtable before compaction to ensure all data is in ssts
+     * this prevents data loss where keys in memtable are not included in compaction */
+    tidesdb_flush_memtable_internal(cf, 0, 1);
+
+    /* wait for the forced flush to fully complete before compaction reads the
+     * levels. flush_pending_count is decremented only after the worker finishes
+     * writing the sstable, whereas the flush queue empties as soon as a work
+     * item is dequeued -- and it is db-global, so it also reflects unrelated
+     * CFs' flushes.
+     * the wait is bounded: after TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS polls we
+     * proceed even if tidesdb_is_flushing still reports a flush in progress */
+    for (int i = 0; i < TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS; i++)
+    {
+        if (!tidesdb_is_flushing(cf)) break;
+        if (tidesdb_cf_abort_requested(cf)) break;
+        usleep(TDB_COMPACTION_FLUSH_WAIT_SLEEP_US);
+    }
+
+    if (tidesdb_cf_abort_requested(cf))
+    {
+        atomic_store_explicit(&cf->is_compacting, 0, memory_order_release);
+        return TDB_SUCCESS;
+    }
+
+    int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire);
+
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "Triggering compaction for column family %s (levels: %d)", cf->name,
+                  num_levels);
+
+    /* a manual tidesdb_compact() runs a full compaction -- merge every level
+     * into the largest so all garbage is reclaimed. the geometry-driven spooky
+     * path below only fires when a level is over capacity, so on its own it
+     * cannot reclaim single-delete pairs or tombstones split across two
+     * under-capacity levels.
+     * this path returns the merge result directly and skips everything below */
+    if (full_compaction)
+    {
+        int result = TDB_SUCCESS;
+        if (num_levels >= 1)
+        {
+            TDB_DEBUG_LOG(TDB_LOG_INFO,
+                          "Full manual compaction for CF '%s' -- merging all %d level(s) into "
+                          "the largest level",
+                          cf->name, num_levels);
+            result = tidesdb_full_preemptive_merge(cf, 0, num_levels - 1, num_levels - 1);
+        }
+        atomic_store_explicit(&cf->is_compacting, 0, memory_order_release);
+        return result;
+    }
+
+    /* we calculate X (dividing level), clamped so it is never below 1 */
+    int X = num_levels - 1 - cf->config.dividing_level_offset;
+    if (X < 1) X = 1;
+
+    int target_lvl = X; /* default to X if no suitable level found */
+
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "Calculating target compaction level (X=%d)", X);
+
+    /* spooky algo 2 -- target_lvl is the smallest level q that would not reach
+     * capacity if all data at levels 0..q were merged into it, i.e. the
+     * smallest q where C_q >= Σ(N_i) for i=0..q. the merge then deposits the
+     * run at a level that has room, which is what lets data flow downward.
+     * (the spooky paper states this as "wouldn't reach capacity"; selecting the
+     * first level that CANNOT accommodate instead pins target_lvl at 1 and
+     * self-merges level 1 forever.)
+     * q is a 1-indexed level number -- array index is q-1. this matches the
+     * dividing/partitioned merge calls below and the z-loop, which all convert
+     * with -1.
+     * NOTE(review): the tidesdb_partitioned_merge call further down passes X and z
+     * without the -1 conversion -- confirm that function's level-numbering convention */
+    for (int q = 1; q <= X && q <= num_levels; q++)
+    {
+        size_t cumulative_size = 0;
+
+        /* cumulative data at levels 1..q -- array indices 0..q-1 */
+        for (int i = 0; i < q && i < num_levels; i++)
+        {
+            cumulative_size +=
+                atomic_load_explicit(&cf->levels[i]->current_size, memory_order_relaxed);
+        }
+
+        /* we check if C_q >= cumulative_size (level q can accommodate the merge) */
+        size_t level_q_capacity =
+            atomic_load_explicit(&cf->levels[q - 1]->capacity, memory_order_relaxed);
+        if (level_q_capacity >= cumulative_size)
+        {
+            /* we found smallest level that can accommodate -- this is our target */
+            target_lvl = q;
+            TDB_DEBUG_LOG(TDB_LOG_INFO, "Target level %d capacity=%zu >= cumulative_size=%zu", q,
+                          level_q_capacity, cumulative_size);
+            break;
+        }
+    }
+
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "Final target compaction level: %d", target_lvl);
+
+    int result = TDB_SUCCESS;
+    if (target_lvl < X)
+    {
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Full preemptive merge levels 1 to %d", target_lvl);
+        result = tidesdb_full_preemptive_merge(cf, 0, target_lvl - 1,
+                                               target_lvl - 1); /* convert to 0-indexed */
+    }
+    else if (target_lvl == X)
+    {
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Dividing merge at level %d", X);
+        result = tidesdb_dividing_merge(cf, X - 1); /* convert to 0-indexed */
+    }
+    else /* NOTE(review): unreachable as written -- target_lvl starts at X and the
+          * loop above only ever assigns q <= X; kept as a defensive fallback */
+    {
+        TDB_DEBUG_LOG(TDB_LOG_WARN, "Target_lvl > X, defaulting to dividing merge");
+        result = tidesdb_dividing_merge(cf, X - 1); /* convert to 0-indexed */
+    }
+
+    if (tidesdb_cf_abort_requested(cf))
+    {
+        atomic_store_explicit(&cf->is_compacting, 0, memory_order_release);
+        return TDB_SUCCESS;
+    }
+
+    /* we reload num_levels atomically after compaction */
+    num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire);
+
+    /* we recalculate X with potentially new num_levels */
+    X = num_levels - 1 - cf->config.dividing_level_offset;
+    if (X < 1) X = 1;
+
+    int z = -1; /* -1 = no partitioned-merge target chosen yet */
+    int need_partitioned_merge = 0;
+
+    if (X > 0 && X < num_levels) /* X is clamped to >= 1 above, so X < num_levels decides */
+    {
+        tidesdb_level_t *level_x = cf->levels[X - 1];
+
+        size_t level_x_size = atomic_load_explicit(&level_x->current_size, memory_order_relaxed);
+        size_t level_x_capacity = atomic_load_explicit(&level_x->capacity, memory_order_relaxed);
+
+        if (level_x_size >= level_x_capacity)
+        {
+            need_partitioned_merge = 1;
+
+            /* spooky algo 2 -- z is the smallest level X+1..L that would not
+             * reach capacity if all data at levels X..z were merged into it,
+             * i.e. the smallest z where C_z >= Σ(N_i) for i=X to z */
+            for (int candidate_z = X + 1; candidate_z <= num_levels; candidate_z++)
+            {
+                size_t cumulative = 0;
+                for (int i = X; i <= candidate_z && (i - 1) < num_levels; i++)
+                {
+                    cumulative += atomic_load_explicit(&cf->levels[i - 1]->current_size,
+                                                       memory_order_relaxed);
+                }
+
+                size_t candidate_capacity = atomic_load_explicit(
+                    &cf->levels[candidate_z - 1]->capacity, memory_order_relaxed);
+                if (candidate_capacity >= cumulative)
+                {
+                    z = candidate_z;
+                    TDB_DEBUG_LOG(TDB_LOG_INFO,
+                                  "Partitioned merge target z=%d capacity=%zu >= cumulative=%zu",
+                                  candidate_z, candidate_capacity, cumulative);
+                    break;
+                }
+            }
+
+            /* no level below X has room -- fall back to the largest level */
+            if (z == -1 || z <= X)
+            {
+                z = num_levels;
+            }
+        }
+    }
+
+    /* we get largest level info for later checks */
+    if (num_levels == 0)
+    {
+        atomic_store_explicit(&cf->is_compacting, 0, memory_order_release);
+        return TDB_SUCCESS;
+    }
+
+    tidesdb_level_t *largest = cf->levels[num_levels - 1];
+    size_t largest_size = atomic_load_explicit(&largest->current_size, memory_order_relaxed);
+    size_t largest_capacity = atomic_load_explicit(&largest->capacity, memory_order_relaxed);
+
+    /* we perform partitioned merge if needed */
+    if (need_partitioned_merge)
+    {
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Level %d is full, triggering partitioned preemptive merge", X);
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Partitioned preemptive merge levels %d to %d", X, z);
+        result = tidesdb_partitioned_merge(cf, X, z); /* replaces the first merge's result */
+
+        if (tidesdb_cf_abort_requested(cf))
+        {
+            atomic_store_explicit(&cf->is_compacting, 0, memory_order_release);
+            return TDB_SUCCESS;
+        }
+
+        /* we reload num_levels after merge */
+        num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire);
+        if (num_levels > 0)
+        {
+            largest = cf->levels[num_levels - 1];
+            largest_size = atomic_load_explicit(&largest->current_size, memory_order_relaxed);
+            largest_capacity = atomic_load_explicit(&largest->capacity, memory_order_relaxed);
+        }
+    }
+
+    int just_added_level = 0;
+    int just_collapsed = 0;
+    if (largest_size >= largest_capacity)
+    {
+        TDB_DEBUG_LOG(TDB_LOG_INFO,
+                      "Largest size is %zu, Largest capacity %zu, Number of levels %d",
+                      largest_size, largest_capacity, num_levels);
+        tidesdb_add_level(cf);
+        just_added_level = 1; /* track that we just added a level */
+        /* we re-fetch num_levels after add_level */
+        num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire);
+        if (num_levels > 0)
+        {
+            largest = cf->levels[num_levels - 1];
+            largest_size = atomic_load_explicit(&largest->current_size, memory_order_relaxed);
+            largest_capacity = atomic_load_explicit(&largest->capacity, memory_order_relaxed);
+        }
+    }
+    else if (largest_size > 0 && num_levels >= 2 && num_levels > cf->config.min_levels &&
+             cf->config.level_size_ratio > 0 &&
+             largest_size < largest_capacity / (size_t)cf->config.level_size_ratio)
+    {
+        /* spooky algo 2 --- the largest level has shrunk below C_L/T.
+         * we collapse it into level L-1 -- a full preemptive merge whose output
+         * is written one level shallower -- then remove the now-empty largest
+         * level. tidesdb_remove_level sets the new largest's capacity to C_L/T.
+         * the collapse merges the deepest two levels, so its output is the new
+         * bottom -- is_largest_level stays true and tombstones drop correctly.
+         * the level is only removed when the collapse merge succeeded and no
+         * abort was requested; collapse_rc is not propagated into result */
+        TDB_DEBUG_LOG(TDB_LOG_INFO,
+                      "CF '%s' largest level underfull (size=%zu < capacity/T) - collapsing "
+                      "level %d into level %d",
+                      cf->name, largest_size, num_levels, num_levels - 1);
+        int collapse_rc =
+            tidesdb_full_preemptive_merge(cf, num_levels - 2, num_levels - 1, num_levels - 2);
+        if (collapse_rc == TDB_SUCCESS && !tidesdb_cf_abort_requested(cf))
+        {
+            tidesdb_remove_level(cf);
+            just_collapsed = 1;
+        }
+        num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire);
+        if (num_levels > 0)
+        {
+            largest = cf->levels[num_levels - 1];
+            largest_size = atomic_load_explicit(&largest->current_size, memory_order_relaxed);
+            largest_capacity = atomic_load_explicit(&largest->capacity, memory_order_relaxed);
+        }
+    }
+
+    /* we check if largest level is truly empty by checking num_sstables, not current_size
+     * current_size uses relaxed memory ordering and can be stale
+     * we re-fetch levels and largest pointer as they may have changed due to compactions
+     *
+     * we don't remove a level we just added in this same compaction cycle!
+     * the new level is intentionally empty and will be filled by future compactions. */
+
+    num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire);
+    int largest_num_sstables =
+        (num_levels > 1)
+            ? atomic_load_explicit(&cf->levels[num_levels - 1]->num_sstables, memory_order_acquire)
+            : -1; /* -1 when there is at most one level, so the == 0 test below cannot match */
+
+    if (!just_added_level && !just_collapsed && num_levels > 1 && largest_num_sstables == 0)
+    {
+        size_t pending_flushes = queue_size(cf->immutable_memtables);
+
+        int level1_sstables =
+            (cf->levels[0] != NULL)
+                ? atomic_load_explicit(&cf->levels[0]->num_sstables, memory_order_acquire)
+                : 0;
+
+        /* only shrink when nothing is queued to flush and level 1 holds no sstables */
+        if (pending_flushes == 0 && level1_sstables == 0)
+        {
+            TDB_DEBUG_LOG(TDB_LOG_INFO, "Largest level is empty, removing level for CF '%s'",
+                          cf->name);
+            tidesdb_remove_level(cf);
+            num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire);
+        }
+        else
+        {
+            TDB_DEBUG_LOG(
+                TDB_LOG_INFO,
+                "Largest level is empty but work pending (flushes: %zu, L1 sstables: %d), keeping "
+                "level for CF '%s'",
+                pending_flushes, level1_sstables, cf->name);
+        }
+    }
+
+    tidesdb_apply_dca(cf);
+
+    atomic_store_explicit(&cf->is_compacting, 0, memory_order_release);
+    return result;
+}
+
+/**
+ * tidesdb_wal_recover
+ * recover the WAL: opens and validates the WAL file at wal_path, allocates a new
+ * skip list and replays the WAL entries into it via tidesdb_wal_replay_into. the
+ * WAL block manager opened here is closed again on every return path.
+ * @param cf the column family
+ * @param wal_path the path to the WAL
+ * @param memtable out-parameter, receives the newly allocated skip list holding the
+ *                 replayed entries; set to NULL when the replay step fails
+ * @return TDB_SUCCESS on success, TDB_ERR_IO if the WAL cannot be opened or fails
+ *         last-block validation, TDB_ERR_MEMORY if the skip list cannot be created,
+ *         otherwise the error code returned by tidesdb_wal_replay_into
+ */
+static int tidesdb_wal_recover(tidesdb_column_family_t *cf, const char *wal_path,
+                               skip_list_t **memtable)
+{
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' starting WAL recovery from: %s", cf->name, wal_path);
+    block_manager_t *wal;
+    if (block_manager_open(&wal, wal_path, TDB_SYNC_FULL) != 0)
+    {
+        TDB_DEBUG_LOG(TDB_LOG_ERROR, "CF '%s' failed to open WAL: %s", cf->name, wal_path);
+        return TDB_ERR_IO;
+    }
+
+    /** we hint to OS that we'll read the entire WAL sequentially and only once
+     * this optimizes read-ahead and allows kernel to deprioritize these pages */
+    set_file_sequential_hint(wal->fd);
+    set_file_noreuse_hint(wal->fd, 0, 0);
+
+    /* we prefetch WAL file into page cache for faster recovery */
+    const uint64_t wal_size = atomic_load(&wal->current_file_size);
+    if (wal_size > 0)
+    {
+        prefetch_file_region(wal->fd, 0, (off_t)wal_size);
+    }
+
+    if (block_manager_validate_last_block(wal, BLOCK_MANAGER_PERMISSIVE_BLOCK_VALIDATION) != 0)
+    {
+        TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' WAL validation failed: %s", cf->name, wal_path);
+        block_manager_close(wal);
+        return TDB_ERR_IO;
+    }
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' WAL validation passed: %s", cf->name, wal_path);
+
+    /* we resolve comparator for recovered memtable */
+    skip_list_comparator_fn comparator_fn = NULL;
+    void *comparator_ctx = NULL;
+    if (tidesdb_resolve_comparator(cf->db, &cf->config, &comparator_fn, &comparator_ctx) != 0)
+    {
+        /* comparator not found, use default memcmp */
+        comparator_fn = skip_list_comparator_memcmp;
+        comparator_ctx = NULL;
+    }
+
+    if (skip_list_new_with_arena(memtable, cf->config.skip_list_max_level,
+                                 cf->config.skip_list_probability, comparator_fn, comparator_ctx,
+                                 &cf->db->cached_current_time,
+                                 cf->config.write_buffer_size * 2) != 0)
+    {
+        block_manager_close(wal);
+        return TDB_ERR_MEMORY;
+    }
+
+    /* replay every entry from the wal into the freshly allocated memtable */
+    const int replay_rc = tidesdb_wal_replay_into(cf, wal, *memtable);
+    if (replay_rc != TDB_SUCCESS)
+    {
+        skip_list_free(*memtable);
+        *memtable = NULL;
+        block_manager_close(wal);
+        return replay_rc;
+    }
+
+    /* we evict WAL data from page cache after recovery, data is now in memtable
+     * this frees cache space for more useful data during normal operation */
+    evict_file_region(wal->fd, 0, 0);
+    block_manager_close(wal);
+    return TDB_SUCCESS;
+}
+
+/**
+ * tidesdb_wal_replay_into
+ * replays every entry from an already-open, already-validated wal block manager
+ * into target. shared by tidesdb_wal_recover (fresh skip list for an immutable
+ * wal) and by the adopt-active-wal recovery path (replay in place into the live
+ * active memtable skip list). the caller owns the wal block manager lifecycle.
+ *
+ * each block is parsed as a sequence of entries laid out as: flags byte, varint
+ * key size, varint value size, varint sequence number, an 8-byte ttl when
+ * TDB_KV_FLAG_HAS_TTL is set, then the key bytes and the value bytes.
+ * replay is permissive: a malformed or truncated entry is logged, the rest of
+ * that block is skipped and replay resumes with the next block. an unreadable
+ * block is skipped when block_manager_cursor_skip_corrupt succeeds, otherwise
+ * replay stops there. none of these conditions is reported as an error.
+ *
+ * @param cf the column family (for logging)
+ * @param wal an open, validated wal block manager
+ * @param target the skip list to replay entries into
+ * @return TDB_SUCCESS once replay has run (even if entries were skipped),
+ *         TDB_ERR_IO if the block cursor cannot be initialized
+ */
+static int tidesdb_wal_replay_into(tidesdb_column_family_t *cf, block_manager_t *wal,
+                                   skip_list_t *target)
+{
+    block_manager_cursor_t *cursor;
+    if (block_manager_cursor_init(&cursor, wal) != 0) return TDB_ERR_IO;
+
+    int block_count = 0;
+    int entry_count = 0; /* counts entries started, including ones later rejected */
+    if (block_manager_cursor_goto_first(cursor) == 0)
+    {
+        while (1)
+        {
+            block_manager_block_t *block = block_manager_cursor_read(cursor);
+            if (!block)
+            {
+                /* partial write, header valid but footer absent -- skip slot and resume */
+                if (block_manager_cursor_skip_corrupt(cursor) == 0)
+                {
+                    TDB_DEBUG_LOG(TDB_LOG_WARN,
+                                  "CF '%s' WAL recovery: skipped partial write, resuming replay",
+                                  cf->name);
+                    continue;
+                }
+                break; /* genuine corruption or zero-filled hole; stop replay */
+            }
+            block_count++;
+
+            const uint8_t *ptr = block->data;
+            size_t remaining = block->size;
+
+            /* the loop condition guarantees at least the flags byte is present,
+             * so no separate "remaining < 1" check is needed */
+            while (remaining > 0)
+            {
+                tidesdb_klog_entry_t entry;
+                entry.flags = *ptr++;
+                remaining--;
+                entry_count++;
+
+                uint64_t key_size_u64;
+                int bytes_read = decode_varint(ptr, &key_size_u64, (int)remaining);
+                if (bytes_read < 0 || key_size_u64 > UINT32_MAX)
+                {
+                    TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' WAL entry %d invalid key_size", cf->name,
+                                  entry_count);
+                    break;
+                }
+                ptr += bytes_read;
+                remaining -= bytes_read;
+                entry.key_size = (uint32_t)key_size_u64;
+
+                uint64_t value_size_u64;
+                bytes_read = decode_varint(ptr, &value_size_u64, (int)remaining);
+                if (bytes_read < 0 || value_size_u64 > UINT32_MAX)
+                {
+                    TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' WAL entry %d invalid value_size", cf->name,
+                                  entry_count);
+                    break;
+                }
+                ptr += bytes_read;
+                remaining -= bytes_read;
+                entry.value_size = (uint32_t)value_size_u64;
+
+                uint64_t seq_value;
+                bytes_read = decode_varint(ptr, &seq_value, (int)remaining);
+                if (bytes_read < 0)
+                {
+                    TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' WAL entry %d invalid seq", cf->name,
+                                  entry_count);
+                    break;
+                }
+                ptr += bytes_read;
+                remaining -= bytes_read;
+                entry.seq = seq_value;
+
+                if (entry.flags & TDB_KV_FLAG_HAS_TTL)
+                {
+                    if (remaining < sizeof(int64_t))
+                    {
+                        TDB_DEBUG_LOG(TDB_LOG_WARN,
+                                      "CF '%s' WAL entry %d insufficient data for TTL", cf->name,
+                                      entry_count);
+                        break;
+                    }
+                    entry.ttl = decode_int64_le_compat(ptr);
+                    ptr += sizeof(int64_t);
+                    remaining -= sizeof(int64_t);
+                }
+                else
+                {
+                    entry.ttl = 0;
+                }
+
+                entry.vlog_offset = 0;
+
+                if (remaining < entry.key_size)
+                {
+                    TDB_DEBUG_LOG(
+                        TDB_LOG_WARN,
+                        "CF '%s' WAL entry %d insufficient data for key (need %u, have %zu)",
+                        cf->name, entry_count, entry.key_size, remaining);
+                    break;
+                }
+
+                /* key and value point into the block buffer; they are handed to the
+                 * skip list before the block is released below */
+                uint8_t *key = (uint8_t *)ptr;
+                ptr += entry.key_size;
+                remaining -= entry.key_size;
+
+                uint8_t *value = NULL;
+                if (entry.value_size > 0)
+                {
+                    if (remaining < entry.value_size)
+                    {
+                        TDB_DEBUG_LOG(
+                            TDB_LOG_WARN,
+                            "CF '%s' WAL entry %d insufficient data for value (need %u, have %zu)",
+                            cf->name, entry_count, entry.value_size, remaining);
+                        break;
+                    }
+                    value = (uint8_t *)ptr;
+                    ptr += entry.value_size;
+                    remaining -= entry.value_size;
+                }
+
+                if (entry.flags & TDB_KV_FLAG_TOMBSTONE)
+                {
+                    /* we preserve the single-delete subtype across crash so compaction
+                     * can still pair-cancel put+single-delete for entries that were
+                     * only in the wal at the time of the crash. */
+                    uint8_t sl_flags = SKIP_LIST_FLAG_DELETED;
+                    if (entry.flags & TDB_KV_FLAG_SINGLE_DELETE)
+                        sl_flags |= SKIP_LIST_FLAG_SINGLE_DELETE;
+                    skip_list_put_with_seq(target, key, entry.key_size, NULL, 0, 0, entry.seq,
+                                           sl_flags);
+                }
+                else
+                {
+                    skip_list_put_with_seq(target, key, entry.key_size, value, entry.value_size,
+                                           entry.ttl, entry.seq, 0);
+                }
+            }
+
+            block_manager_block_release(block);
+
+            if (block_manager_cursor_next(cursor) != 0) break;
+        }
+    }
+
+    TDB_DEBUG_LOG(TDB_LOG_INFO,
+                  "CF '%s' WAL replay completed %d blocks, %d entries, target has %d entries",
+                  cf->name, block_count, entry_count, skip_list_count_entries(target));
+
+    block_manager_cursor_free(cursor);
+    return TDB_SUCCESS;
+}
+
+/**
+ * tidesdb_column_family_free
+ * free column family. releases the active memtable (skip list and wal handle),
+ * drops the queue's reference on every queued immutable memtable, frees every
+ * non-NULL level and closes the manifest before the structure itself is released.
+ * does nothing when cf is NULL.
+ * @param cf the column family
+ */
+static void tidesdb_column_family_free(tidesdb_column_family_t *cf)
+{
+    if (!cf) return;
+
+    tidesdb_memtable_t *mt = atomic_load_explicit(&cf->active_memtable, memory_order_acquire);
+    if (mt)
+    {
+        if (mt->skip_list) skip_list_free(mt->skip_list);
+        if (mt->wal) block_manager_close(mt->wal);
+        free(mt);
+    }
+
+    int immutable_count = 0;
+    while (!queue_is_empty(cf->immutable_memtables))
+    {
+        tidesdb_immutable_memtable_t *immutable =
+            (tidesdb_immutable_memtable_t *)queue_dequeue(cf->immutable_memtables);
+        if (immutable)
+        {
+            int refcount = atomic_load_explicit(&immutable->refcount, memory_order_acquire);
+            TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' is cleaning immutable with refcount=%d", cf->name,
+                          refcount);
+            tidesdb_immutable_memtable_unref(immutable);
+            immutable_count++;
+        }
+    }
+    if (immutable_count > 0)
+    {
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' freed %d immutable memtables in CF cleanup", cf->name,
+                      immutable_count);
+    }
+    queue_free(cf->immutable_memtables);
+
+    for (int i = 0; i < TDB_MAX_LEVELS; i++)
+    {
+        if (cf->levels[i])
+        {
+            tidesdb_level_free(cf->db, cf->levels[i]);
+        }
+    }
+
+    if (cf->manifest)
+    {
+        tidesdb_manifest_close(cf->manifest);
+    }
+
pthread_mutex_destroy(&cf->imm_snap_publish_lock);
+    pthread_mutex_destroy(&cf->compaction_commit_lock);
+    for (int s = 0; s < TDB_IMM_SNAP_SLOTS; s++) free(cf->imm_snaps[s].items);
+    free(cf->name);
+    free(cf->directory);
+    free(cf);
+}
+
+/**
+ * tidesdb_unified_immutable_is_flushed
+ * queue_remove_if predicate -- selects unified immutables whose flush to per-CF
+ * sstables has completed and are therefore safe to evict from the read path
+ * @param data the queue item, read as a tidesdb_memtable_t pointer (may be NULL)
+ * @param context unused
+ * @return non-zero when data is non-NULL and its flushed flag is set, 0 otherwise
+ */
+static int tidesdb_unified_immutable_is_flushed(void *data, void *context)
+{
+    (void)context;
+    tidesdb_memtable_t *imm = (tidesdb_memtable_t *)data;
+    return imm && atomic_load_explicit(&imm->flushed, memory_order_acquire);
+}
+
+/**
+ * tidesdb_unified_immutable_drop_queue_ref
+ * queue_remove_if callback -- drops the reference the immutable queue held.
+ * the structure and its skip list are freed once the last reader also unrefs
+ * @param data the queue item, passed to tidesdb_immutable_memtable_unref
+ * @param context unused
+ */
+static void tidesdb_unified_immutable_drop_queue_ref(void *data, void *context)
+{
+    (void)context;
+    tidesdb_immutable_memtable_unref((tidesdb_immutable_memtable_t *)data);
+}
+
+/**
+ * tidesdb_flush_worker_thread
+ * worker thread that processes flush work items from the queue.
+ * takes ownership of arg (a heap-allocated tidesdb_worker_thread_arg_t) and frees
+ * it after reading the db pointer and worker index. on non-Windows builds the
+ * thread blocks SIGALRM, SIGVTALRM and SIGPROF for itself before entering the
+ * work loop.
+ * @param arg the tidesdb_worker_thread_arg_t for this worker
+ */
+static void *tidesdb_flush_worker_thread(void *arg)
+{
+    tidesdb_worker_thread_arg_t *targ = (tidesdb_worker_thread_arg_t *)arg;
+    tidesdb_t *db = targ->db;
+    char tname[TDB_THREAD_NAME_LEN];
+    snprintf(tname, sizeof(tname), TDB_THREAD_PREFIX "flush.%d", targ->index);
+    tdb_set_thread_name(tname);
+    free(targ); /* targ must not be used past this point; db was copied out above */
+#ifndef _WIN32
+    {
+        sigset_t timer_signals;
+        sigemptyset(&timer_signals);
+        sigaddset(&timer_signals, SIGALRM);
+        sigaddset(&timer_signals, SIGVTALRM);
+        sigaddset(&timer_signals, SIGPROF);
+        pthread_sigmask(SIG_BLOCK, &timer_signals, NULL);
+    }
+#endif
+
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "Flush worker thread started");
+
+    while (1)
+    {
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Flush worker is waiting for work (queue size: %zu)",
+                      queue_size(db->flush_queue));
+        /* we wait for work (blocking dequeue) */
+
tidesdb_flush_work_t *work = (tidesdb_flush_work_t *)queue_dequeue_wait(db->flush_queue); + + if (!work) + { + /* NULL sentinel signals shutdown */ + TDB_DEBUG_LOG(TDB_LOG_INFO, "Flush worker has received NULL work, exiting"); + break; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Flush worker has received work for SSTable %" PRIu64, + work->sst_id); + + /* flush progress heartbeat -- a picked-up work item is forward progress */ + atomic_fetch_add_explicit(&db->flush_heartbeat, 1, memory_order_relaxed); + + tidesdb_column_family_t *cf = work->cf; + tidesdb_immutable_memtable_t *imm = work->imm; + + /*** unified per-cf split task. write this cf's prefix segment of the shared unified skip + ** list to cf as an l1 sstable, then drop our share of the barrier. last finisher closes + * the unified wal and marks the unified memtable flushed. */ + if (work->unified_barrier && work->unified_sl) + { + /* skip the write when the target CF is dropping -- the sstable would + * be unlinked seconds later by remove_directory anyway */ + int wr = TDB_SUCCESS; + if (cf && atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire)) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' is marked for deletion, skipping unified split flush", + cf->name); + } + else + { + wr = tidesdb_unified_write_cf_sstable( + db, cf, work->unified_sl, work->unified_cf_index, work->unified_entry_count); + } + if (wr != TDB_SUCCESS) + { + int expected = TDB_SUCCESS; + atomic_compare_exchange_strong_explicit(&work->unified_barrier->overall_result, + &expected, wr, memory_order_acq_rel, + memory_order_relaxed); + } + /* unified_sl is borrowed (the immutable owns it) -- do not free it here */ + tidesdb_unified_flush_barrier_finish(work->unified_barrier); + free(work); + atomic_fetch_sub_explicit(&db->flush_pending_count, 1, memory_order_release); + if (cf) atomic_fetch_sub_explicit(&cf->flush_pending_count, 1, memory_order_release); + continue; + } + + /* unified flush dispatch -- cf==NULL means this is a unified 
memtable flush */ + if (!cf && imm) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Flush worker processing unified memtable flush"); + int uflush_rc = tidesdb_unified_flush_immutable(db, imm); + if (uflush_rc != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Unified flush failed (error %d)", uflush_rc); + } + + /* we evict every flushed immutable from the read path -- their data + * now lives in per-CF sstables. queue_remove_if takes the queue write + * lock so it cannot race a snapshot reader; it drops the queue's ref + * per item and the last concurrent reader frees the structure. + * we drain unified_mt.active_mt_readers first so a reader who + * loaded the about-to-be-removed pointer from unified_mt.active + * has completed its try_ref before queue_remove_if drops the + * queue's ref -- otherwise the queue's drop could win, free the + * struct, and the reader's try_ref would UAF on refcount. seq_cst + * fence pairs with the matching fence in + * tidesdb_active_memtable_try_ref */ + if (db->unified_mt.immutables) + { + atomic_thread_fence(memory_order_seq_cst); + int uamr_spins = 0; + while (atomic_load_explicit(&db->unified_mt.active_mt_readers, + memory_order_acquire) > 0) + { + if (uamr_spins < TDB_IMM_SNAP_ACQUIRE_SPIN_LIMIT) + cpu_pause(); + else + cpu_yield(); + uamr_spins++; + } + queue_remove_if(db->unified_mt.immutables, tidesdb_unified_immutable_is_flushed, + NULL, tidesdb_unified_immutable_drop_queue_ref); + } + + free(work); + atomic_fetch_sub_explicit(&db->flush_pending_count, 1, memory_order_release); + continue; + } + + /* we check if CF is marked for deletion -- if so, skip processing and cleanup */ + if (cf && atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire)) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' is marked for deletion, skipping flush for SSTable %" PRIu64, + cf->name, work->sst_id); + tidesdb_immutable_memtable_unref(imm); + atomic_fetch_sub_explicit(&db->active_flushes, 1, memory_order_release); + free(work); + 
atomic_fetch_sub_explicit(&db->flush_pending_count, 1, memory_order_release); + atomic_fetch_sub_explicit(&cf->flush_pending_count, 1, memory_order_release); + continue; + } + + skip_list_t *memtable = imm->skip_list; + block_manager_t *wal = imm->wal; + + /* we wait for all in-flight commit-path writers to finish before reading + * the memtable. writers bump imm->writers while they mutate the WAL and + * skip list, so once this drains to zero every committed entry is visible. + * we drain writers and not refcount -- concurrent readers and iterators + * pin the immutable through refcount, and waiting on refcount would let + * sustained read load stall the flush indefinitely. readers only read the + * skip list, which is safe to do alongside the flush. + * this wait happens in the background flush thread, not the hot path */ + int drain_iterations = 0; + while (atomic_load_explicit(&imm->writers, memory_order_acquire) > 0) + { + drain_iterations++; + if (drain_iterations < TDB_REFCOUNT_DRAIN_SPIN_THRESHOLD) + { + cpu_pause(); + } + else if (drain_iterations < TDB_REFCOUNT_DRAIN_YIELD_THRESHOLD) + { + cpu_yield(); + } + else + { + usleep(TDB_REFCOUNT_DRAIN_SLEEP_US); + } + if ((drain_iterations & TDB_REFCOUNT_DRAIN_LOG_INTERVAL) == 0) + { + TDB_DEBUG_LOG( + TDB_LOG_WARN, + "CF '%s' flush worker waiting for in-flight writers to drain (current=%d)", + cf->name, atomic_load_explicit(&imm->writers, memory_order_acquire)); + } + } + atomic_thread_fence(memory_order_acquire); + + int space_check = tidesdb_check_disk_space(db, cf->directory, cf->config.min_disk_space); + if (space_check <= 0) + { + TDB_DEBUG_LOG( + TDB_LOG_INFO, + "CF '%s' encountered insufficient disk space for flush (required: %" PRIu64 + " bytes)", + cf->name, cf->config.min_disk_space); + + /* we release work and skip flush -- the memtable stays in memory */ + tidesdb_immutable_memtable_unref(imm); + atomic_fetch_sub_explicit(&db->active_flushes, 1, memory_order_release); + free(work); + 
atomic_fetch_sub_explicit(&db->flush_pending_count, 1, memory_order_release); + atomic_fetch_sub_explicit(&cf->flush_pending_count, 1, memory_order_release); + continue; + } + + char sst_path[MAX_FILE_PATH_LENGTH]; + snprintf(sst_path, sizeof(sst_path), "%s" PATH_SEPARATOR TDB_LEVEL_PREFIX "1", + cf->directory); + + /* once we create the sstable, we must complete the flush to avoid leaking it */ + tidesdb_sstable_t *sst = tidesdb_sstable_create(db, sst_path, work->sst_id, &cf->config); + if (!sst) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "CF '%s' SSTable %" PRIu64 " creation failed", cf->name, + work->sst_id); + + tidesdb_immutable_memtable_unref(imm); + atomic_fetch_sub_explicit(&db->active_flushes, 1, memory_order_release); + free(work); + atomic_fetch_sub_explicit(&db->flush_pending_count, 1, memory_order_release); + atomic_fetch_sub_explicit(&cf->flush_pending_count, 1, memory_order_release); + continue; + } + + /* we branch based on use_btree config */ + int write_result; + if (cf->config.use_btree) + { + write_result = tidesdb_sstable_write_from_memtable_btree(db, cf, sst, memtable); + } + else + { + write_result = tidesdb_sstable_write_from_memtable(db, cf, sst, memtable); + } + if (write_result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' SSTable %" PRIu64 " write failed (error: %d), will retry", + cf->name, work->sst_id, write_result); + + tidesdb_sstable_unref(cf->db, sst); + + usleep(TDB_FLUSH_RETRY_DELAY_US); + + /* we re-enqueue for retry (work still has valid imm reference) */ + if (queue_enqueue(cf->db->flush_queue, work) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' failed to re-enqueue flush work for retry. 
" + "WAL will be recovered on next open.", + cf->name); + + tidesdb_immutable_memtable_unref(imm); + atomic_fetch_sub_explicit(&db->active_flushes, 1, memory_order_release); + free(work); + atomic_fetch_sub_explicit(&db->flush_pending_count, 1, memory_order_release); + atomic_fetch_sub_explicit(&cf->flush_pending_count, 1, memory_order_release); + } + /* work re-enqueued so we keep the active_flushes slot held and the + * flush_pending counter in place, the retry will release them */ + continue; + } + + /* we must always sync sstable files regardless of sync_mode + * sstable durability is required before we can delete WAL */ + tidesdb_block_managers_t bms; + if (tidesdb_sstable_get_block_managers(db, sst, &bms) == TDB_SUCCESS) + { + if (bms.klog_bm) block_manager_escalate_fsync(bms.klog_bm); + if (bms.vlog_bm) block_manager_escalate_fsync(bms.vlog_bm); + } + + /* we ensure all writes are visible before making sstable discoverable */ + atomic_thread_fence(memory_order_seq_cst); + + /* we close write handles before adding to level + * readers will reopen files on-demand through tidesdb_sstable_ensure_open + * this prevents file locking issues where readers cannot open files + * that are still held open by the flush worker */ + { + /* num_open_sstables is keyed on the klog (the vlog is opened lazily and not + * separately counted), so the decrement fires iff the klog was open */ + const int had_open_bms = (sst->klog_bm != NULL); + if (sst->klog_bm) + { + block_manager_close(sst->klog_bm); + sst->klog_bm = NULL; + } + if (sst->vlog_bm) + { + block_manager_close(sst->vlog_bm); + sst->vlog_bm = NULL; + } + if (had_open_bms) + { + atomic_fetch_sub(&db->num_open_sstables, 1); + } + } + + /* we re-check marked_for_deletion after I/O -- if the CF is being dropped, + * skip level-add, manifest commit, and compaction trigger. the CF directory + * will be deleted by drop_column_family, so the sstable files are ephemeral. 
+ * this lets drop_column_family proceed faster by clearing is_flushing sooner */ + if (atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire)) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' marked for deletion after flush I/O, skipping level-add " + "for SSTable %" PRIu64, + cf->name, work->sst_id); + tidesdb_sstable_unref(cf->db, sst); + if (wal) + { + block_manager_close(wal); + imm->wal = NULL; + } + atomic_store_explicit(&imm->flushed, 1, memory_order_release); + tidesdb_immutable_memtable_unref(imm); + atomic_fetch_sub_explicit(&db->active_flushes, 1, memory_order_release); + free(work); + atomic_fetch_sub_explicit(&db->flush_pending_count, 1, memory_order_release); + atomic_fetch_sub_explicit(&cf->flush_pending_count, 1, memory_order_release); + continue; + } + + /* out-of-order L0 insertion check. concurrent flush threads finish out of id order, so a + * lower-max_seq sstable can land after a higher one. this is benign, both point reads and + * the merge-heap iterators resolve versions by per-entry seq, never by L0 array position + * (the array is append-only and unsorted). logged at DEBUG as a flush-concurrency signal + * only -- it is not a correctness violation. one line per out-of-order add (not per pair) + * to avoid an O(n) burst when an old sstable lands behind many newer ones. 
*/ + int num_existing = atomic_load_explicit(&cf->levels[0]->num_sstables, memory_order_acquire); + if (num_existing > 0) + { + tidesdb_sstable_t **existing_ssts = + atomic_load_explicit(&cf->levels[0]->sstables, memory_order_acquire); + for (int i = 0; i < num_existing; i++) + { + if (existing_ssts[i] && existing_ssts[i]->max_seq >= sst->max_seq) + { + TDB_DEBUG_LOG(TDB_LOG_DEBUG, + "CF '%s' SSTable %" PRIu64 " (max_seq=%" PRIu64 + ") added to L0 out of seq order behind SSTable %" PRIu64 + " (max_seq=%" PRIu64 ") -- benign, reads resolve by seq", + cf->name, work->sst_id, sst->max_seq, existing_ssts[i]->id, + existing_ssts[i]->max_seq); + break; + } + } + } + + /* we add sstable to level 1 (array index 0) -- load levels atomically */ + + /* levels array is fixed, access directly */ + tidesdb_level_add_sstable(cf->levels[0], sst); + tidesdb_bump_sstable_layout_version(cf); + + atomic_thread_fence(memory_order_release); + + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' flushed SSTable %" PRIu64 " (max_seq=%" PRIu64 + ") to level %d (array index 0)", + cf->name, work->sst_id, sst->max_seq, cf->levels[0]->level_num); + + /* we commit sstable to manifest before deleting WAL and before triggering compaction + * this ensures crash recovery knows which sstables are complete + * we must commit manifest before triggering compaction to avoid deadlock + * where flush worker holds manifest lock while compaction worker waits for it */ + tidesdb_manifest_add_sstable(cf->manifest, 1, work->sst_id, sst->num_entries, + sst->klog_size + sst->vlog_size); + atomic_store(&cf->manifest->sequence, atomic_load(&cf->next_sstable_id)); + int manifest_result = tidesdb_manifest_commit(cf->manifest, cf->manifest->path); + if (manifest_result != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "CF '%s' failed to commit manifest for SSTable %" PRIu64 " (error: %d)", + cf->name, work->sst_id, manifest_result); + } + else + { + /* only mirror to the object store when the local commit succeeded -- uploading + * 
after a failed commit could push a manifest inconsistent with local on-disk + * state that recovery would then have to reconcile */ + tdb_objstore_upload_manifest(db, cf); + } + + /* we check file count in addition to size + * cf->levels[0] (level_num=1) is TidesDB's first disk level, equivalent to + * RocksDB's rLevel 0 in the spooky paper. this is where memtable flushes land. + * files at this level have overlapping key ranges, so reads must check all files. + * trigger compaction at α=4 files to prevent read amplification. */ + int num_l1_sstables = + atomic_load_explicit(&cf->levels[0]->num_sstables, memory_order_acquire); + size_t level1_size = + atomic_load_explicit(&cf->levels[0]->current_size, memory_order_acquire); + size_t level1_capacity = + atomic_load_explicit(&cf->levels[0]->capacity, memory_order_acquire); + + int should_compact = 0; + const char *trigger_reason = NULL; + + const int effective_file_trigger = tdb_cf_effective_l1_trigger(cf); + + /* file count trigger at level 1 */ + if (num_l1_sstables >= effective_file_trigger) + { + should_compact = 1; + trigger_reason = "file count"; + } + + else if (level1_size >= level1_capacity) + { + should_compact = 1; + trigger_reason = "size"; + } + + /*** tombstone density trigger fires when any sstable in the cf carries enough + ** tombstones that compaction should run early to push them toward the largest + * level (where regular tombstones finally drop) and shrink read-amp from + * skipping them. consulted even when a structural trigger already fired -- + * delete-heavy workloads keep the structural triggers permanently hot, so + * gating the witness behind them means it would never get a turn. on a hit + * we capture the witness sstable's key range so the response can steer a + * targeted merge at it rather than running geometry-only spooky. 
*/ + int density_witness_level = 0; + double density_witness_value = 0.0; + int density_triggered = 0; + uint8_t *density_min_key = NULL, *density_max_key = NULL; + size_t density_min_key_size = 0, density_max_key_size = 0; + if (cf->config.tombstone_density_trigger > 0.0) + { + const uint64_t min_entries = cf->config.tombstone_density_min_entries + ? cf->config.tombstone_density_min_entries + : TDB_DEFAULT_TOMBSTONE_DENSITY_MIN_ENTRIES; + if (tidesdb_cf_dense_tombstone_witness( + cf, cf->config.tombstone_density_trigger, min_entries, &density_witness_level, + &density_witness_value, &density_min_key, &density_min_key_size, + &density_max_key, &density_max_key_size)) + { + should_compact = 1; + density_triggered = 1; + trigger_reason = "tombstone density"; + } + } + + if (should_compact) + { + if (density_witness_level > 0) + { + TDB_DEBUG_LOG( + TDB_LOG_INFO, + "CF '%s' triggering compaction (%s) witness L%d density=%.3f (threshold=%.3f)", + cf->name, trigger_reason, density_witness_level, density_witness_value, + cf->config.tombstone_density_trigger); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' level %d (first disk level) triggering compaction (%s): " + "files=%d (trigger=%d), size=%zu (capacity=%zu)", + cf->name, cf->levels[0]->level_num, trigger_reason, num_l1_sstables, + cf->config.l1_file_count_trigger, level1_size, level1_capacity); + } + + /* if the density witness fired and the dense sstable is above the + * largest level, steer a targeted merge of its key range down to the + * bottom so the regular tombstones reach where they can drop. + * otherwise (structural trigger, or already at the bottom level) run + * the geometry-driven compaction. 
*/ + const int num_levels = + atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + if (density_triggered && density_witness_level > 0 && + density_witness_level < num_levels && density_min_key && density_max_key) + { + /* ownership of the key copies passes to the steer helper */ + tidesdb_compact_steer_to_bottom(cf, density_min_key, density_min_key_size, + density_max_key, density_max_key_size); + density_min_key = NULL; + density_max_key = NULL; + } + else + { + /* auto-compaction trigger -- geometry-driven, not a full merge */ + tidesdb_enqueue_compaction(cf, 0); + } + } + + /* free the witness key copies if the steer path did not take ownership */ + free(density_min_key); + free(density_max_key); + + /* we release our reference -- the level now owns it */ + tidesdb_sstable_unref(cf->db, sst); + + /* delete the WAL only once the sstable is durably recorded in the manifest. + * a failed commit leaves the sstable in-memory only (and not in the persisted + * manifest), so recovery would orphan-delete it -- retain the WAL in that case + * so recovery can replay these entries instead of losing them. the fd is closed + * either way to release the handle. 
*/ + if (wal) + { + char *wal_path_to_delete = tdb_strdup(wal->file_path); + block_manager_close(wal); + imm->wal = NULL; + if (manifest_result == 0) + { + tdb_unlink(wal_path_to_delete); + tdb_sync_directory(cf->directory); + } + free(wal_path_to_delete); + } + + atomic_thread_fence(memory_order_seq_cst); + + atomic_store_explicit(&imm->flushed, 1, memory_order_release); + + tidesdb_immutable_memtable_unref(imm); + + /* batched cleanup only run every N flushes or when queue is large + * this reduces overhead while preventing unbounded memory growth */ + const int cleanup_threshold = TDB_IMMUTABLE_CLEANUP_THRESHOLD; + size_t max_queue_size = TDB_IMMUTABLE_MAX_QUEUE_SIZE; + size_t force_cleanup_size = TDB_IMMUTABLE_FORCE_CLEANUP_SIZE; + int counter = + atomic_fetch_add_explicit(&cf->immutable_cleanup_counter, 1, memory_order_relaxed); + size_t current_queue_size = queue_size(cf->immutable_memtables); + + int should_cleanup = + (counter % cleanup_threshold == 0) || (current_queue_size > max_queue_size); + int force_cleanup = (current_queue_size >= force_cleanup_size); + + if (force_cleanup && tdb_log_throttle(cf->db, &cf->last_imm_critical_log_sec, + TDB_BACKPRESSURE_STALL_LOG_INTERVAL_SEC)) + { + TDB_DEBUG_LOG( + TDB_LOG_WARN, + "CF '%s' immutable queue at %zu >= %zu, running cleanup (reader-pinned immutables " + "are left for a later pass)", + cf->name, current_queue_size, force_cleanup_size); + } + + /* we cleanup flushed immutables from queue if they have no active readers + * we need to keep them in queue until all reads complete to maintain MVCC correctness + * when force_cleanup is set, we block waiting for readers to finish + * + * we process items by dequeuing one at a time and immediately re-enqueuing + * items we want to keep. 
this ensures the queue is never fully drained, preventing + * a visibility gap where concurrent readers (tidesdb_txn_get) could see an empty + * immutable queue and skip searching immutable memtables entirely, losing data that + * hasn't been flushed to sstables yet. */ + if (should_cleanup || force_cleanup) + { + int cleaned = 0; + size_t items_to_process = queue_size(cf->immutable_memtables); + + /* we collect items to free -- we must publish snapshot (draining old readers) + * before actually freeing, to prevent use-after-free on skip_list pointers + * held by readers via the lock-free snapshot. sized to the queue depth; a + * NULL alloc just means this pass re-enqueues everything and a later pass + * reclaims it. */ + const size_t to_free_cap = items_to_process; + tidesdb_immutable_memtable_t **to_free = + to_free_cap ? malloc(to_free_cap * sizeof(*to_free)) : NULL; + int to_free_count = 0; + + for (size_t qi = 0; qi < items_to_process; qi++) + { + tidesdb_immutable_memtable_t *queued_imm = + (tidesdb_immutable_memtable_t *)queue_dequeue(cf->immutable_memtables); + if (!queued_imm) break; + + int is_flushed = atomic_load_explicit(&queued_imm->flushed, memory_order_acquire); + + /* we use atomic CAS to try claiming the last reference + * if refcount is 1, try to CAS it to 0 to claim ownership for cleanup + * if CAS succeeds, we own it and can free; if it fails, someone else ref'd it + */ + int expected_refcount = 1; + int can_cleanup = 0; + + if (is_flushed) + { + /* we try to claim the last reference atomically. this is a single, + * NON-BLOCKING attempt -- it succeeds only when refcount==1 (no reader holds a + * merge-source ref). a pinned immutable is left in the queue and reclaimed on a + * later pass once its readers drain. 
we must not spin-wait here, a flushed + * immutable is now excluded from the reader snapshot (see + * tidesdb_imm_snap_publish_locked), so no new reader can pin it and its + * refcount will fall to 1 on its own -- blocking the flush worker to wait for + * that is what collapsed flush throughput and wedged writes under reader load. + */ + if (atomic_compare_exchange_strong_explicit( + &queued_imm->refcount, &expected_refcount, 0, memory_order_acquire, + memory_order_relaxed)) + { + can_cleanup = 1; + } + } + + if (can_cleanup) + { + /* defer free -- we collect for post-publish cleanup */ + if (to_free && to_free_count < (int)to_free_cap) + { + to_free[to_free_count++] = queued_imm; + cleaned++; + } + else + { + /* to_free is full -- re-enqueue rather than free immediately. an + * immediate free here would bypass the publish+drain barrier below + * and could free a memtable a concurrent reader still references via + * the immutable snapshot (UAF). the next cleanup pass reclaims it. */ + queue_enqueue(cf->immutable_memtables, queued_imm); + } + } + else + { + /* keep in queue -- we immediately re-enqueue to avoid visibility gap */ + queue_enqueue(cf->immutable_memtables, queued_imm); + } + } + + if (cleaned > 0) + { + /** we republish lock-free snapshot -- non-blocking, rebuilds without + * the removed items and swaps active index immediately. + * publish + drain are held under the publisher lock as one unit so a + * concurrent publisher cannot flip the active slot between them and + * make drain wait on the wrong slot. 
*/ + pthread_mutex_lock(&cf->imm_snap_publish_lock); + tidesdb_imm_snap_publish_locked(cf); + + /** we wait for old-slot readers to drain before freeing + * this is the only path that needs blocking drain (items being freed) */ + tidesdb_imm_snap_drain_previous(cf); + pthread_mutex_unlock(&cf->imm_snap_publish_lock); + + /* the snap drain covers readers walking the immutable snapshot; + * we also drain active_mt_readers so a reader that loaded a now + * retired pointer from cf->active_memtable (rotation moved this + * memtable to immutable, the reader still has the pre swap + * pointer) cannot UAF on try_ref's refcount load. the seq_cst + * fence pairs with the matching fence in + * tidesdb_active_memtable_try_ref between its epoch bump and + * slot load */ + atomic_thread_fence(memory_order_seq_cst); + int amr_spins = 0; + while (atomic_load_explicit(&cf->active_mt_readers, memory_order_acquire) > 0) + { + if (amr_spins < TDB_IMM_SNAP_ACQUIRE_SPIN_LIMIT) + cpu_pause(); + else + cpu_yield(); + amr_spins++; + } + + /* now safe to free -- no reader can still be accessing these */ + for (int fi = 0; fi < to_free_count; fi++) + { + if (to_free[fi]->skip_list) skip_list_free(to_free[fi]->skip_list); + if (to_free[fi]->wal) block_manager_close(to_free[fi]->wal); + free(to_free[fi]); + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' cleaned up %d flushed immutable(s) with no active readers", + cf->name, cleaned); + } + + free(to_free); + } + + /* the writer cleared is_flushing after enqueue, so the worker only + * releases the active_flushes slot when its work is fully done */ + atomic_fetch_sub_explicit(&db->active_flushes, 1, memory_order_release); + free(work); + atomic_fetch_sub_explicit(&db->flush_pending_count, 1, memory_order_release); + atomic_fetch_sub_explicit(&cf->flush_pending_count, 1, memory_order_release); + } + + return NULL; +} + +/** + * tidesdb_compaction_work_signal_done + * signal a blocking caller that its work item has been serviced. 
no-op when + * work is NULL or has no done_mu attached (the common fire-and-forget case); + * otherwise done_flag is raised and done_cv broadcast while done_mu is held. + * @param work compaction work item, may be NULL + */ +static void tidesdb_compaction_work_signal_done(tidesdb_compaction_work_t *work) +{ + /* an item without done_mu has nobody waiting on it -- nothing to signal */ + if (work != NULL && work->done_mu != NULL) + { + pthread_mutex_lock(work->done_mu); + atomic_store_explicit(work->done_flag, 1, memory_order_release); + pthread_cond_broadcast(work->done_cv); + pthread_mutex_unlock(work->done_mu); + } +} + +/** + * tidesdb_compaction_worker_thread + * worker thread that processes compaction work items from the queue + * + * this allows parallel compaction across multiple column families. + * the is_compacting flag ensures only one compaction per CF at a time, + * but multiple workers can compact different CFs concurrently. + */ +static void *tidesdb_compaction_worker_thread(void *arg) +{ + tidesdb_worker_thread_arg_t *targ = (tidesdb_worker_thread_arg_t *)arg; + tidesdb_t *db = targ->db; + char tname[TDB_THREAD_NAME_LEN]; + snprintf(tname, sizeof(tname), TDB_THREAD_PREFIX "compact.%d", targ->index); + tdb_set_thread_name(tname); + free(targ); +#ifndef _WIN32 + { + sigset_t timer_signals; + sigemptyset(&timer_signals); + sigaddset(&timer_signals, SIGALRM); + sigaddset(&timer_signals, SIGVTALRM); + sigaddset(&timer_signals, SIGPROF); + pthread_sigmask(SIG_BLOCK, &timer_signals, NULL); + } +#endif + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Compaction worker thread started"); + + while (1) + { + /* we wait for work (blocking dequeue) */ + tidesdb_compaction_work_t *work = + (tidesdb_compaction_work_t *)queue_dequeue_wait(db->compaction_queue); + + if (!work) + { + /* NULL work item signals shutdown */ + break; + } + + tidesdb_column_family_t *cf = work->cf; + + if (cf == NULL) + { + tidesdb_compaction_work_signal_done(work); + free(work); + continue; + } + + /* skip queued compaction if the CF is being dropped OR background compaction + * has been cancelled (tidesdb_cancel_background_work) -- in both cases we do + * not want to start new merge work */
+ if (atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire) || + atomic_load_explicit(&db->cancel_compaction, memory_order_acquire)) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' skipping queued compaction (drop/cancel)", + cf->name); + atomic_store_explicit(&cf->is_compacting, 0, memory_order_release); + atomic_fetch_sub_explicit(&cf->compaction_pending_count, 1, memory_order_release); + /* a steered item owns heap copies of its key range, and the steer branch + * further down -- the only other place that frees them -- is never + * reached from here. release them now so a dropped or cancelled steer + * does not leak */ + if (work->steer_to_bottom) + { + free(work->steer_min_key); + free(work->steer_max_key); + } + tidesdb_compaction_work_signal_done(work); + free(work); + continue; + } + + const int space_check = + tidesdb_check_disk_space(db, cf->directory, cf->config.min_disk_space); + if (space_check <= 0) + { + TDB_DEBUG_LOG( + TDB_LOG_WARN, + "CF '%s' encountered insufficient disk space for compaction (required: %" PRIu64 + " bytes)", + cf->name, cf->config.min_disk_space); + /* we clear is_compacting flag so compaction can be retried later */ + atomic_store_explicit(&cf->is_compacting, 0, memory_order_release); + atomic_fetch_sub_explicit(&cf->compaction_pending_count, 1, memory_order_release); + /* same ownership rule as the drop/cancel path above -- a steered item + * abandoned for lack of disk space must still give back its key copies */ + if (work->steer_to_bottom) + { + free(work->steer_min_key); + free(work->steer_max_key); + } + tidesdb_compaction_work_signal_done(work); + free(work); + continue; + } + + /* compaction pause gate -- a backup in progress blocks new compactions + * so its file copy cannot race a manifest + sstable rewrite. we park + * here holding the work item until the backup lifts the pause. + * NOTE(review): the park loop re-checks only compaction_paused, not + * cancel_compaction or shutdown -- confirm the pause is always lifted + * before close.
*/ + pthread_mutex_lock(&db->compaction_gate_lock); + while (db->compaction_paused) + { + pthread_mutex_unlock(&db->compaction_gate_lock); + usleep(TDB_CLOSE_TXN_WAIT_SLEEP_US); + pthread_mutex_lock(&db->compaction_gate_lock); + } + atomic_fetch_add_explicit(&db->active_compactions, 1, memory_order_acq_rel); + pthread_mutex_unlock(&db->compaction_gate_lock); + + if (work->steer_to_bottom) + { + /* tombstone-steered compaction -- targeted-merge the dense sstable's + * key range into the largest level so its regular tombstones reach + * where they can drop, instead of the geometry-driven spooky path */ + const int num_levels = + atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' tombstone-steered compaction to largest level %d", + cf->name, num_levels); + const int result = tidesdb_compact_range_internal( + cf, work->steer_min_key, work->steer_min_key_size, work->steer_max_key, + work->steer_max_key_size, num_levels - 1); + if (result != TDB_SUCCESS && result != TDB_ERR_LOCKED) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "CF '%s' tombstone-steered compaction failed with error %d", cf->name, + result); + } + /* the item is consumed whatever the result -- unlike the geometry path + * below, a TDB_ERR_LOCKED steer is not requeued */ + free(work->steer_min_key); + free(work->steer_max_key); + atomic_fetch_sub_explicit(&cf->compaction_pending_count, 1, memory_order_release); + atomic_fetch_sub_explicit(&db->active_compactions, 1, memory_order_acq_rel); + tidesdb_compaction_work_signal_done(work); + free(work); + continue; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Compacting CF '%s'", cf->name); + const int result = tidesdb_trigger_compaction(cf, work->full_compaction); + if (result == TDB_ERR_LOCKED) + { + /* another worker is mid-compaction on this cf. requeue this item + * without signaling so a blocking caller's intent is preserved -- + * its work runs once the holder releases is_compacting.
brief + * back-off avoids a hot-loop against the lock holder */ + atomic_fetch_sub_explicit(&db->active_compactions, 1, memory_order_acq_rel); + if (queue_enqueue(db->compaction_queue, work) != 0) + { + atomic_fetch_sub_explicit(&cf->compaction_pending_count, 1, memory_order_release); + tidesdb_compaction_work_signal_done(work); + free(work); + continue; + } + usleep(TDB_BACKPRESSURE_STALL_CHECK_INTERVAL_US); + continue; + } + if (result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' compaction failed with error %d", cf->name, + result); + /* is_compacting is cleared inside tidesdb_trigger_compaction on both success and + * failure */ + } + + /* drain any auto-trigger that arrived while is_compacting was held. + * exchange-to-zero so a re-arm after this point queues another + * follow-up rather than being swallowed here */ + if (atomic_exchange_explicit(&cf->compaction_armed, 0, memory_order_acq_rel)) + tidesdb_enqueue_compaction(cf, 0); + + atomic_fetch_sub_explicit(&cf->compaction_pending_count, 1, memory_order_release); + atomic_fetch_sub_explicit(&db->active_compactions, 1, memory_order_acq_rel); + tidesdb_compaction_work_signal_done(work); + free(work); + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Compaction worker thread stopped"); + + return NULL; +} + +/** + * tidesdb_sync_worker_thread + * background thread that periodically escalates fsync on WAL files in + * TDB_SYNC_INTERVAL mode, both per column family WALs and the unified WAL + * + * each cycle recomputes the shortest configured interval, waits on + * sync_thread_cond until that deadline passes or sync_thread_active clears, + * then escalates fsync on every WAL whose memtable is still confirmed active. + * timer signals (SIGALRM/SIGVTALRM/SIGPROF) are blocked on non-windows builds. + * @param arg pointer to the database + * @return NULL + */ +static void *tidesdb_sync_worker_thread(void *arg) +{ + tidesdb_t *db = (tidesdb_t *)arg; + tdb_set_thread_name(TDB_THREAD_PREFIX "sync"); +#ifndef _WIN32 + { + sigset_t timer_signals; + sigemptyset(&timer_signals); + sigaddset(&timer_signals, SIGALRM); + sigaddset(&timer_signals, SIGVTALRM); + sigaddset(&timer_signals, SIGPROF); + pthread_sigmask(SIG_BLOCK, &timer_signals, NULL); + } +#endif + TDB_DEBUG_LOG(TDB_LOG_INFO, "Sync worker thread started"); + + while (atomic_load(&db->sync_thread_active)) + { + uint64_t
min_interval = UINT64_MAX; + + /* we scan all CFs to find minimum sync interval -- only CFs in + * TDB_SYNC_INTERVAL mode with a non-zero sync_interval_us take part, and the + * scan holds cf_list_lock for reading. UINT64_MAX is the "nothing found" + * sentinel tested further down */ + pthread_rwlock_rdlock(&db->cf_list_lock); + for (int i = 0; i < db->num_column_families; i++) + { + const tidesdb_column_family_t *cf = db->column_families[i]; + if (cf && cf->config.sync_mode == TDB_SYNC_INTERVAL && cf->config.sync_interval_us > 0) + { + if (cf->config.sync_interval_us < min_interval) + { + min_interval = cf->config.sync_interval_us; + } + } + } + pthread_rwlock_unlock(&db->cf_list_lock); + + /* the unified WAL participates in interval syncing too. its foreground + * writes are not fsynced in TDB_SYNC_INTERVAL mode, so this thread is + * the only thing that durably persists it. a configured interval of + * zero is replaced by TDB_UNIFIED_WAL_SYNC_DEFAULT_INTERVAL_US. */ + if (db->unified_mt.enabled && db->config.unified_memtable_sync_mode == TDB_SYNC_INTERVAL) + { + uint64_t uwal_interval = db->config.unified_memtable_sync_interval_us; + if (uwal_interval == 0) uwal_interval = TDB_UNIFIED_WAL_SYNC_DEFAULT_INTERVAL_US; + if (uwal_interval < min_interval) min_interval = uwal_interval; + } + + uint64_t sleep_us; + if (min_interval == UINT64_MAX) + { + /* no CFs need interval syncing, sleep longer */ + sleep_us = TDB_NO_CF_SYNC_SLEEP_US; + } + else + { + sleep_us = min_interval; + } + + struct timespec ts; /* absolute deadline for the timed wait below. + * NOTE(review): linux builds take it from CLOCK_MONOTONIC -- presumably + * sync_thread_cond is initialised with a matching condattr clock; + * confirm at the init site */ +#if defined(__linux__) + clock_gettime(CLOCK_MONOTONIC, &ts); +#else + clock_gettime(CLOCK_REALTIME, &ts); +#endif + ts.tv_sec += (time_t)(sleep_us / TDB_MICROSECONDS_PER_SECOND); + ts.tv_nsec += + (long)(sleep_us % TDB_MICROSECONDS_PER_SECOND) * TDB_NANOSECONDS_PER_MICROSECOND; + if (ts.tv_nsec >= TDB_NANOSECONDS_PER_SECOND) + { + ts.tv_sec++; + ts.tv_nsec -= TDB_NANOSECONDS_PER_SECOND; + } + + pthread_mutex_lock(&db->sync_thread_mutex); /* a wakeup that is neither a timeout nor a + * shutdown request re-enters the wait + * against the same absolute deadline */ + + while (atomic_load(&db->sync_thread_active)) + { + const int wait_result = + pthread_cond_timedwait(&db->sync_thread_cond, &db->sync_thread_mutex, &ts); + + if (wait_result == ETIMEDOUT || !atomic_load(&db->sync_thread_active)) + { + break; + } + } + const int should_exit = !atomic_load(&db->sync_thread_active); +
pthread_mutex_unlock(&db->sync_thread_mutex); + + if (should_exit) + { + break; + } + + if (min_interval == UINT64_MAX) + { + /* no CFs needed syncing, skip sync -- min_interval also folds in the + * unified WAL interval, so UINT64_MAX here means nothing was found in + * interval mode this cycle */ + continue; + } + + pthread_rwlock_rdlock(&db->cf_list_lock); + for (int i = 0; i < db->num_column_families; i++) + { + tidesdb_column_family_t *cf = db->column_families[i]; + if (cf && cf->config.sync_mode == TDB_SYNC_INTERVAL && cf->config.sync_interval_us > 0) + { + /* we pin and re-confirm mt is still the active memtable. + * only immutable wals are closed by flush workers, never an + * active one, so a confirmed-active mt is safe to fsync. if it + * rotated, the rotation path already escalated the old wal. + * the pin taken by try_ref is dropped again right after the + * fsync attempt, whether or not the fsync ran. */ + tidesdb_memtable_t *mt = NULL; + if (tidesdb_active_memtable_try_ref(&cf->active_mt_readers, &cf->active_memtable, + &mt)) + { + if (mt == atomic_load(&cf->active_memtable) && mt->wal) + { + block_manager_escalate_fsync(mt->wal); + } + tidesdb_immutable_memtable_unref(mt); + } + } + } + pthread_rwlock_unlock(&db->cf_list_lock); + + /* escalate fsync on the unified WAL when it is in interval sync mode -- + * cf->active_memtable->wal is NULL in unified mode so the per-CF loop + * above never reaches it. same pin / re-confirm / unref sequence as the + * per-CF loop, applied to unified_mt.active. */ + if (db->unified_mt.enabled && db->config.unified_memtable_sync_mode == TDB_SYNC_INTERVAL) + { + tidesdb_memtable_t *umt = NULL; + if (tidesdb_active_memtable_try_ref(&db->unified_mt.active_mt_readers, + &db->unified_mt.active, &umt)) + { + if (umt == atomic_load(&db->unified_mt.active) && umt->wal) + { + block_manager_escalate_fsync(umt->wal); + } + tidesdb_immutable_memtable_unref(umt); + } + } + + /* we check shutdown flag after sync operations to exit promptly */ + if (!atomic_load(&db->sync_thread_active)) + { + break; + } + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Sync worker thread stopped"); + return NULL; +} + +/** + * tidesdb_replica_sync_thread + * dedicated replica-mode thread that polls the object store for new MANIFESTs + * and replays remote WALs.
this work was previously done inline on the reaper + * thread, where a slow object store stalled every other reaper duty (deferred + * flush retry, memory pressure tracking, sstable eviction). a replica downloads + * and replays rather than uploads, so this thread is funded by reassigning one + * slot of the configured upload-thread budget -- the object store thread count + * is unchanged. + * + * every cycle sleeps the resolved interval in short slices, then -- only when an + * object store is attached -- syncs manifests and, if replica_replay_wal is set, + * replays remote WALs. + * @param arg pointer to the database + * @return NULL + */ +static void *tidesdb_replica_sync_thread(void *arg) +{ + tidesdb_t *db = (tidesdb_t *)arg; + + /* resolve the poll period -- a missing object store config and a configured + * value of zero both fall back to the default */ + uint64_t sync_interval_us = 0; + if (db->config.object_store_config) + { + sync_interval_us = db->config.object_store_config->replica_sync_interval_us; + } + if (sync_interval_us == 0) + { + sync_interval_us = TDB_REPLICA_SYNC_DEFAULT_INTERVAL_US; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Replica sync thread created (interval=%" PRIu64 "us)", + sync_interval_us); + + while (atomic_load_explicit(&db->replica_sync_thread_active, memory_order_acquire)) + { + /* count the interval down in bounded naps so a shutdown request is noticed + * within one slice rather than one full interval */ + uint64_t remaining_us = sync_interval_us; + while (remaining_us > 0 && + atomic_load_explicit(&db->replica_sync_thread_active, memory_order_acquire)) + { + uint64_t nap_us = TDB_REPLICA_SYNC_SLEEP_SLICE_US; + if (remaining_us < nap_us) nap_us = remaining_us; + usleep(nap_us); + remaining_us -= nap_us; + } + + if (!atomic_load_explicit(&db->replica_sync_thread_active, memory_order_acquire)) + { + break; + } + + /* nothing to poll without an object store -- fall through to the next sleep */ + if (db->object_store) + { + tdb_replica_sync_manifests(db); + + const int replay_wals = db->config.object_store_config && + db->config.object_store_config->replica_replay_wal; + if (replay_wals) + { + tdb_objstore_replay_remote_wals(db, 0); + } + } + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Replica sync thread stopped"); + return NULL; +} + +/** + * compare_sstable_candidates + * comparison function for sorting sstable candidates by last_access_time + * @param a pointer to first sstable candidate + * @param b pointer to second sstable candidate + *
@return negative if a < b, positive if a > b, zero if equal
 */
static int compare_sstable_candidates(const void *a, const void *b)
{
    /* local view of the two-field candidate record this comparator is handed;
     * only last_access is read, the sstable pointer is never dereferenced */
    struct candidate_view
    {
        void *sst;
        time_t last_access;
    };

    const struct candidate_view *lhs = (const struct candidate_view *)a;
    const struct candidate_view *rhs = (const struct candidate_view *)b;

    /* branch-free three-way compare -- yields -1, 0 or 1 */
    return (lhs->last_access > rhs->last_access) - (lhs->last_access < rhs->last_access);
}

/**
 * tidesdb_reaper_thread
 * background maintenance thread that wakes on a timer (TDB_SSTABLE_REAPER_SLEEP_US,
 * via cond_timedwait so close can wake it early) and runs a fixed sequence of
 * housekeeping duties each cycle until reaper_active clears. timer signals
 * (SIGALRM/SIGVTALRM/SIGPROF) are blocked so the timed wait is not restarted by the
 * host process; crash and termination signals stay deliverable.
 *
 * per cycle, in order:
 *   - sweep the deferred-free list, reclaiming retired sstable arrays (serialized
 *     with drop_column_family via reaper_thread_mutex to avoid a UAF on a freed level)
 *   - retry flushes deferred when the concurrent-flush cap was hit, skipping any CF
 *     already at its immutable hard cap so the reaper never blocks on a drain
 *   - backstop compaction triggers that were coalesced (compaction_armed) but left
 *     with no worker to service them
 *   - recompute global memory pressure, sum active + immutable memtables, sstable
 *     bloom/index aux memory, block/btree caches and in-flight txn memory, divide by
 *     resolved_memory_limit, publish the level for the write path, with an OS
 *     free-memory safety net that can force CRITICAL
 *   - at HIGH/CRITICAL pressure, shed memory by force-flushing (unified rotate, or
 *     nuclear-flush every CF at CRITICAL, or the largest memtable at HIGH) and kick a
 *     non-blocking compaction on the CF holding the most sstables
 *   - in unified + object-store mode, enqueue an async WAL upload once the WAL has
 *     grown past the configured sync-threshold delta
 *   - reap open 
sstable fds when the open count exceeds the reader budget, collect + * unreferenced open sstables, sort by last access (LRU) and close the oldest + * fraction (TDB_SSTABLE_REAPER_EVICT_RATIO) back toward budget + * + * every duty is non-blocking and re-checks the shutdown flag so close drains + * promptly; long operations (compaction) are only ever triggered, never awaited. + */ +static void *tidesdb_reaper_thread(void *arg) +{ + tidesdb_t *db = (tidesdb_t *)arg; + tdb_set_thread_name(TDB_THREAD_PREFIX "reaper"); + + /* block timer signals so pthread_cond_timedwait is not repeatedly + * interrupted by the host process's timer handlers (e.g. MariaDB's + * SIGALRM). without this the futex restarts on every signal delivery + * and never times out. we only block timer-related signals to keep + * crash signals (SIGSEGV, SIGBUS, SIGABRT) and termination signals + * (SIGTERM, SIGINT) deliverable for clean shutdown and diagnostics. */ +#ifndef _WIN32 + { + sigset_t timer_signals; + sigemptyset(&timer_signals); + sigaddset(&timer_signals, SIGALRM); + sigaddset(&timer_signals, SIGVTALRM); + sigaddset(&timer_signals, SIGPROF); + pthread_sigmask(SIG_BLOCK, &timer_signals, NULL); + } +#endif + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Reaper thread started"); + + while (atomic_load(&db->reaper_active)) + { + time_t now = tdb_get_current_time(); + atomic_store_explicit(&db->cached_current_time, now, memory_order_seq_cst); + + struct timespec ts; +#if defined(__linux__) + clock_gettime(CLOCK_MONOTONIC, &ts); +#else + clock_gettime(CLOCK_REALTIME, &ts); +#endif + ts.tv_sec += (TDB_SSTABLE_REAPER_SLEEP_US / TDB_MICROSECONDS_PER_SECOND); + ts.tv_nsec += (TDB_SSTABLE_REAPER_SLEEP_US % TDB_MICROSECONDS_PER_SECOND) * + TDB_NANOSECONDS_PER_MICROSECOND; + if (ts.tv_nsec >= TDB_NANOSECONDS_PER_SECOND) + { + ts.tv_sec++; + ts.tv_nsec -= TDB_NANOSECONDS_PER_SECOND; + } + + pthread_mutex_lock(&db->reaper_thread_mutex); + + if (atomic_load(&db->reaper_active)) + { + /* return value intentionally ignored 
-- a timeout and a spurious + * wakeup are handled identically by re-checking the active flag */ + (void)pthread_cond_timedwait(&db->reaper_thread_cond, &db->reaper_thread_mutex, &ts); + } + int should_exit = !atomic_load(&db->reaper_active); + pthread_mutex_unlock(&db->reaper_thread_mutex); + + if (should_exit) + { + break; + } + + /* we sweep deferred free list every cycle to reclaim retired sstable arrays. + * reaper_thread_mutex serializes us with tidesdb_drop_column_family_internal's + * targeted drain -- otherwise a drop could free a level while we hold the + * stolen list with an item pointing at it, and the next iteration would + * UAF on level->array_readers */ + pthread_mutex_lock(&db->reaper_thread_mutex); + tidesdb_deferred_free_sweep(db); + pthread_mutex_unlock(&db->reaper_thread_mutex); + + /*** retry flushes that were deferred because the global concurrent-flush + ** cap was hit. the cap frees as in-flight flushes finish, so a + * deferred flush must not be left waiting for a future write to + ** re-trigger it. we collect the deferred cfs under the list lock and + *** flush them after releasing it, the same shape the memory pressure + ** victim below uses. flush_memtable_internal clears flush_deferred + * itself once a flush actually proceeds, or re-sets it if still capped + ** so a later cycle retries again. 
*/ + { + tidesdb_column_family_t *deferred_cfs[TDB_REAPER_DEFERRED_FLUSH_BATCH]; + int deferred_count = 0; + pthread_rwlock_rdlock(&db->cf_list_lock); + for (int i = 0; + i < db->num_column_families && deferred_count < TDB_REAPER_DEFERRED_FLUSH_BATCH; + i++) + { + tidesdb_column_family_t *cf = db->column_families[i]; + if (cf && atomic_load_explicit(&cf->flush_deferred, memory_order_acquire)) + deferred_cfs[deferred_count++] = cf; + } + pthread_rwlock_unlock(&db->cf_list_lock); + for (int i = 0; i < deferred_count; i++) + { + /* skip a CF whose immutable queue is already at the hard cap -- + * flush_memtable_internal would usleep-block the reaper up to 5s + * (TDB_IMMUTABLE_HARD_CAP_MAX_WAIT) waiting for it to drain, + * stalling every other reaper duty. the CF stays flush_deferred, + * so a later reaper cycle retries it once flushes have drained + * the queue -- the retry polls instead of blocking. */ + if (queue_size(deferred_cfs[i]->immutable_memtables) >= + tdb_cf_immutable_hard_cap(deferred_cfs[i])) + continue; + tidesdb_flush_memtable_internal(deferred_cfs[i], 0, 1); + } + } + + /* drain any compaction triggers that were coalesced against an + * in-flight compaction. 
the worker that finished the compaction also + * drains the armed flag, this pass is a backstop for the case where a + * trigger arrives after the worker checked but before is_compacting + * cleared, leaving the flag set with no worker to service it */ + { + tidesdb_column_family_t *armed_cfs[TDB_REAPER_DEFERRED_FLUSH_BATCH]; + int armed_count = 0; + pthread_rwlock_rdlock(&db->cf_list_lock); + for (int i = 0; + i < db->num_column_families && armed_count < TDB_REAPER_DEFERRED_FLUSH_BATCH; i++) + { + tidesdb_column_family_t *cf = db->column_families[i]; + if (cf && atomic_load_explicit(&cf->compaction_armed, memory_order_acquire)) + armed_cfs[armed_count++] = cf; + } + pthread_rwlock_unlock(&db->cf_list_lock); + for (int i = 0; i < armed_count; i++) + { + if (atomic_exchange_explicit(&armed_cfs[i]->compaction_armed, 0, + memory_order_acq_rel)) + tidesdb_enqueue_compaction(armed_cfs[i], 0); + } + } + + /*** global memory pressure computations + * we scan all CFs to compute total memtable + cache + bloom/index memory + * we store pressure level atomically for write path to consume + * we use explicit atomic_load to guarantee cross-thread visibility + * of test overrides and runtime changes on all compilers (MSVC, MinGW) */ + const size_t mem_limit = + atomic_load_explicit(&db->resolved_memory_limit, memory_order_acquire); + if (mem_limit > 0) + { + int64_t total_mem_bytes = 0; + + /* we track CF with most sstables for aggressive compaction */ + tidesdb_column_family_t *flush_victim = NULL; + size_t flush_victim_size = 0; + tidesdb_column_family_t *compact_victim = NULL; + int compact_victim_sst_count = 0; + + pthread_rwlock_rdlock(&db->cf_list_lock); + for (int i = 0; i < db->num_column_families; i++) + { + tidesdb_column_family_t *cf = db->column_families[i]; + if (!cf) continue; + + /* active memtable -- exact size via atomic load (O(1)). 
+ * we pin under the active_mt_readers epoch so the memtable + * cannot be freed by a flush worker between the load and the + * try_ref */ + tidesdb_memtable_t *mt = NULL; + if (tidesdb_active_memtable_try_ref(&cf->active_mt_readers, &cf->active_memtable, + &mt)) + { + if (mt->skip_list) + { + size_t mt_size = skip_list_get_size(mt->skip_list); + total_mem_bytes += (int64_t)mt_size; + if (mt_size > flush_victim_size && + !atomic_load_explicit(&cf->is_flushing, memory_order_relaxed)) + { + flush_victim_size = mt_size; + flush_victim = cf; + } + } + tidesdb_immutable_memtable_unref(mt); + } + + /* immutable queue -- conservative estimate using write_buffer_size. + * each immutable's data is bounded by write_buffer_size (flush threshold). + * while arena allocates write_buffer_size * 2, the unused arena capacity + * is not meaningful for pressure accounting. + * skipped in unified mode, per-CF immutable queues there hold only + * empty rotated memtables (all data is in the unified memtable, summed + * separately below), so charging each write_buffer_size is phantom + * memory that inflates the pressure ratio and triggers spurious + * force-flushes. */ + if (!db->unified_mt.enabled) + { + size_t imm_count = queue_size(cf->immutable_memtables); + total_mem_bytes += (int64_t)(imm_count * cf->config.write_buffer_size); + } + + /* count sstables per cf for the compaction victim heuristic. bloom + * filter and block index memory is not summed here -- it is tracked + * by the sstable_aux_memory_bytes running total and added once below */ + int total_cf_ssts = 0; + int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + for (int lv = 0; lv < num_levels && lv < TDB_MAX_LEVELS; lv++) + { + tidesdb_level_t *lvl = cf->levels[lv]; + if (!lvl) continue; + total_cf_ssts += atomic_load_explicit(&lvl->num_sstables, memory_order_acquire); + } + + /* we estimate compaction temp memory for actively compacting CFs. 
+ * compaction allocates merge heaps, bloom filter builders, and temp buffers. + * we use write_buffer_size as a conservative estimate per active compaction */ + if (atomic_load_explicit(&cf->is_compacting, memory_order_relaxed)) + { + total_mem_bytes += (int64_t)cf->config.write_buffer_size; + } + + /* we track CF with most sstables for compaction */ + if (total_cf_ssts > compact_victim_sst_count && + !atomic_load_explicit(&cf->is_compacting, memory_order_relaxed)) + { + compact_victim_sst_count = total_cf_ssts; + compact_victim = cf; + } + } + pthread_rwlock_unlock(&db->cf_list_lock); + + /* in unified memtable mode, all writes land in the unified skip list + * which is not counted by the per-CF loop above. we add it here so + * memory pressure accounting reflects actual usage. */ + if (db->unified_mt.enabled) + { + tidesdb_memtable_t *umt = + atomic_load_explicit(&db->unified_mt.active, memory_order_acquire); + if (umt && umt->skip_list) + { + total_mem_bytes += (int64_t)skip_list_get_size(umt->skip_list); + } + + /* we sum each immutable's actual skip list size. 
a flushed + * immutable still holds its skip list resident, but most are far + * below write_buffer_size -- charging every entry the full buffer + * capacity over-reports total memory by an order of magnitude */ + if (db->unified_mt.immutables) + { + queue_t *uimm_q = db->unified_mt.immutables; + pthread_rwlock_rdlock(&uimm_q->read_lock); + for (queue_node_t *n = uimm_q->head->next; n != NULL; n = n->next) + { + tidesdb_memtable_t *uimm = (tidesdb_memtable_t *)n->data; + if (uimm && uimm->skip_list) + total_mem_bytes += (int64_t)skip_list_get_size(uimm->skip_list); + } + pthread_rwlock_unlock(&uimm_q->read_lock); + } + } + + /* bloom filter + block index memory across every sstable, maintained + * as a running total at level add and remove */ + total_mem_bytes += + atomic_load_explicit(&db->sstable_aux_memory_bytes, memory_order_relaxed); + + /* we add cache memory */ + if (db->clock_cache) + { + clock_cache_stats_t cache_stats; + clock_cache_get_stats(db->clock_cache, &cache_stats); + total_mem_bytes += (int64_t)cache_stats.total_bytes; + } + if (db->btree_node_cache) + { + clock_cache_stats_t cache_stats; + clock_cache_get_stats(db->btree_node_cache, &cache_stats); + total_mem_bytes += (int64_t)cache_stats.total_bytes; + } + + /* we include in-flight transaction memory in pressure accounting */ + { + int64_t txn_mem = atomic_load_explicit(&db->txn_memory_bytes, memory_order_relaxed); + if (txn_mem > 0) total_mem_bytes += txn_mem; + } + + atomic_store_explicit(&db->cached_memtable_bytes, total_mem_bytes, + memory_order_relaxed); + + /* we compute pressure level from ratio */ + double ratio = (double)total_mem_bytes / (double)mem_limit; + int level = TDB_MEMORY_PRESSURE_NORMAL; + if (ratio >= TDB_MEMORY_PRESSURE_CRITICAL_RATIO) + level = TDB_MEMORY_PRESSURE_CRITICAL; + else if (ratio >= TDB_MEMORY_PRESSURE_HIGH_RATIO) + level = TDB_MEMORY_PRESSURE_HIGH; + else if (ratio >= TDB_MEMORY_PRESSURE_ELEVATED_RATIO) + level = TDB_MEMORY_PRESSURE_ELEVATED; + + /* OS-level 
safety net -- we check real available memory every ~N seconds */ + { + if (++db->os_check_counter >= TDB_MEMORY_OS_CHECK_INTERVAL) + { + db->os_check_counter = 0; + size_t os_avail = get_available_memory(); + if (os_avail > 0 && db->total_memory > 0 && + os_avail < + (size_t)((double)db->total_memory * TDB_MEMORY_OS_CRITICAL_RATIO)) + { + if (level < TDB_MEMORY_PRESSURE_CRITICAL) + { + TDB_DEBUG_LOG( + TDB_LOG_ERROR, + "OS memory critically low %zu bytes free (%.1f%% of total) " + "-- overriding to critical pressure", + os_avail, (double)os_avail / (double)db->total_memory * 100.0); + level = TDB_MEMORY_PRESSURE_CRITICAL; + } + } + } + } + + int prev_level = + atomic_exchange_explicit(&db->memory_pressure_level, level, memory_order_release); + + /* at high or critical pressure--force-flush + aggressive compaction + * but not during shutdown -- close has already drained work and is + * joining worker threads; enqueueing new work would race with shutdown */ + if (level >= TDB_MEMORY_PRESSURE_HIGH && atomic_load(&db->is_open)) + { + if (db->unified_mt.enabled) + { + /* unified mode -- every write lands in the single shared + * unified memtable, so shedding memory means rotating THAT, + * once. the per-CF force-flushes below would only rotate + * empty per-CF memtables, they shed nothing and leave stuck + * empty immutables behind. CAS admission mirrors the rotate + * call in tidesdb_flush_memtable. */ + int expected = 0; + if (atomic_compare_exchange_strong_explicit(&db->unified_mt.is_flushing, + &expected, 1, memory_order_acquire, + memory_order_relaxed)) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Memory pressure %s rotating unified memtable " + "(global %" PRId64 "/%zu bytes, %.1f%%)", + level >= TDB_MEMORY_PRESSURE_CRITICAL ? 
"CRITICAL" : "HIGH", + total_mem_bytes, mem_limit, ratio * 100.0); + tidesdb_unified_memtable_rotate(db); + atomic_store_explicit(&db->unified_mt.is_flushing, 0, memory_order_release); + } + } + else if (level >= TDB_MEMORY_PRESSURE_CRITICAL) + { + /* nuclear flush -- at critical pressure we flush every non-flushing CF + * to shed memory as fast as possible across all column families */ + pthread_rwlock_rdlock(&db->cf_list_lock); + for (int i = 0; i < db->num_column_families; i++) + { + tidesdb_column_family_t *victim = db->column_families[i]; + if (!victim) continue; + if (atomic_load_explicit(&victim->is_flushing, memory_order_relaxed)) + continue; + + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Memory pressure CRITICAL nuclear flush CF '%s' " + "(global %" PRId64 "/%zu bytes, %.1f%%)", + victim->name, total_mem_bytes, mem_limit, ratio * 100.0); + tidesdb_flush_memtable_internal(victim, 0, 1); + } + pthread_rwlock_unlock(&db->cf_list_lock); + } + else if (flush_victim) + { + /* high pressure -- force-flush the largest non-flushing memtable */ + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Memory pressure HIGH force-flushing CF '%s' " + "(memtable %zu bytes, global %" PRId64 "/%zu bytes, %.1f%%)", + flush_victim->name, flush_victim_size, total_mem_bytes, mem_limit, + ratio * 100.0); + tidesdb_flush_memtable_internal(flush_victim, 0, 1); + } + + /* we trigger aggressive compaction on CF with most sstables + * merging N sstables into 1 frees N-1 bloom filters + block indexes + * also produces tighter indexes and bloom filters */ + if (compact_victim && compact_victim_sst_count > 1) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Memory pressure %s triggering compaction on CF '%s' " + "(%d SSTables, most in system)", + level == TDB_MEMORY_PRESSURE_CRITICAL ? 
"CRITICAL" : "HIGH", + compact_victim->name, compact_victim_sst_count); + /* non-blocking -- the reaper cannot park on a multi-minute + * compaction without starving every other duty */ + tidesdb_compact_internal(compact_victim, 1, 0); + } + } + + if (level != prev_level) + { + TDB_DEBUG_LOG(level >= TDB_MEMORY_PRESSURE_HIGH ? TDB_LOG_WARN : TDB_LOG_INFO, + "Memory pressure level changed %d -> %d " + "(%.1f%% of limit, %" PRId64 " / %zu bytes)", + prev_level, level, ratio * 100.0, total_mem_bytes, mem_limit); + } + } + + /* we periodically WAL sync to object store, we read the WAL's atomic file size + * lock-free and upload when the delta since last sync exceeds the + * configured threshold. this bounds the data loss window to the write + * volume (e.g. 1MB of new data) rather than wall clock time. during + * idle periods no syncs occur. during bursts syncs fire more frequently. + * the WAL is append-only so uploading a snapshot mid-write is safe. */ + if (db->object_store && db->unified_mt.enabled) + { + size_t threshold = db->config.object_store_config + ? db->config.object_store_config->wal_sync_threshold_bytes + : 0; + if (threshold > 0) + { + /* we pin and reconfirm the active unified memtable -- only a + * rotated immutable's wal is closed by tidesdb_unified_close_wal, + * never the active one, so a confirmed-active umt is safe to read */ + tidesdb_memtable_t *umt = NULL; + if (tidesdb_active_memtable_try_ref(&db->unified_mt.active_mt_readers, + &db->unified_mt.active, &umt)) + { + if (umt == atomic_load_explicit(&db->unified_mt.active, memory_order_acquire) && + umt->wal) + { + uint64_t wal_size = atomic_load_explicit(&umt->wal->current_file_size, + memory_order_relaxed); + if (wal_size >= db->last_wal_sync_size + threshold) + { + /* enqueue on the upload worker pool instead of uploading + * inline -- a synchronous multi-MB S3 PUT here blocks the + * reaper thread, stalling deferred-flush retry, memory + * pressure tracking and sstable eviction. 
generation 0 + * means a plain snapshot upload -- the worker must not + * fence or delete the still-active WAL. */ + tdb_objstore_enqueue_upload(db, umt->wal->file_path, 0); + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Unified WAL sync enqueued for async upload"); + db->last_wal_sync_size = wal_size; + } + } + tidesdb_immutable_memtable_unref(umt); + } + } + } + + int current_open = atomic_load(&db->num_open_sstables); + int max_open = (int)db->config.max_open_sstables; + /* evict down to the reader budget, not max_open, keeping num_open at/below the budget + * leaves the reserve free for flush/compaction and gives readers headroom to open, closing + * the [budget, max_open) starvation gap where reads back off but eviction never fired. */ + const int reap_target = tidesdb_sstable_open_budget(db); + + if (current_open < reap_target) + { + continue; /* under budget, nothing to do */ + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Reaper triggered %d open SSTables (budget %d, max %d)", + current_open, reap_target, max_open); + + /** + * sstable_candidate_t + * @param sst sstable to close + * @param last_access last access time + * collect all ssts with refcount=0 and last_access_time */ + typedef struct + { + tidesdb_sstable_t *sst; + time_t last_access; + } sstable_candidate_t; + + /* stack buffer for common case (≤N open SSTs), heap fallback for large configs */ +#define TDB_REAPER_STACK_CANDIDATES 256 + sstable_candidate_t stack_candidates[TDB_REAPER_STACK_CANDIDATES]; + sstable_candidate_t *candidates; + const int use_stack = (current_open <= TDB_REAPER_STACK_CANDIDATES); + if (use_stack) + { + candidates = stack_candidates; + } + else + { + candidates = malloc(current_open * sizeof(sstable_candidate_t)); + if (!candidates) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Reaper failed to allocate candidates array"); + continue; + } + } + + int candidate_count = 0; + + /* the candidates array is sized from current_open sampled above. 
flush and + * compaction workers can open more sstables while we scan, so the scan + * can find more closeable sstables than the array holds -- cap collection + * at this capacity and pick up any remainder on the next reaper cycle */ + const int candidate_capacity = use_stack ? TDB_REAPER_STACK_CANDIDATES : current_open; + + if (!atomic_load(&db->reaper_active)) + { + if (!use_stack) free(candidates); + break; + } + + /* we scan all column families for closeable ssts + * we check shutdown flag frequently to allow prompt exit on BSD systems + * where the scan loop may take longer due to scheduler behavior */ + int shutdown_requested = 0; + pthread_rwlock_rdlock(&db->cf_list_lock); + for (int i = 0; i < db->num_column_families && !shutdown_requested && + candidate_count < candidate_capacity; + i++) + { + tidesdb_column_family_t *cf = db->column_families[i]; + if (!cf) continue; + + /* we check shutdown inside loop to exit promptly */ + if (!atomic_load(&db->reaper_active)) + { + shutdown_requested = 1; + break; + } + + int num_levels = atomic_load(&cf->num_active_levels); + for (int level = 0; level < num_levels && level < TDB_MAX_LEVELS && + candidate_count < candidate_capacity; + level++) + { + tidesdb_level_t *lvl = cf->levels[level]; + if (!lvl) continue; + + /* we load array pointer and count with careful ordering to handle concurrent + * modifications re-load count to detect concurrent remove, use minimum to avoid OOB + */ + atomic_fetch_add_explicit(&lvl->array_readers, 1, memory_order_acq_rel); + + tidesdb_sstable_t **ssts = + atomic_load_explicit(&lvl->sstables, memory_order_acquire); + int num_ssts = atomic_load_explicit(&lvl->num_sstables, memory_order_acquire); + + /* we re-load count to detect concurrent remove */ + int num_ssts_recheck = + atomic_load_explicit(&lvl->num_sstables, memory_order_acquire); + if (num_ssts_recheck < num_ssts) num_ssts = num_ssts_recheck; + + /* we verify array hasnt changed (handles add-with-resize race) */ + 
tidesdb_sstable_t **ssts_check = + atomic_load_explicit(&lvl->sstables, memory_order_acquire); + if (ssts_check != ssts) + { + ssts = ssts_check; + num_ssts = atomic_load_explicit(&lvl->num_sstables, memory_order_acquire); + } + + for (int j = 0; j < num_ssts && candidate_count < candidate_capacity; j++) + { + tidesdb_sstable_t *sst = ssts[j]; + if (!sst) continue; + + /* we only consider ssts that are open and not in use + * we use try_ref to safely acquire reference -- if it fails, sstable is being + * freed after acquiring ref, check if refcount is now 2 (level ref + our ref) + * num_open_sstables is keyed on the klog, so a klog-open sstable is + * reclaimable even when its vlog was never lazily opened */ + if (sst->klog_bm) + { + if (!tidesdb_sstable_try_ref(sst)) + { + continue; /* sstable is being freed, skip it */ + } + + /* now we check if we're the only extra ref (refcount should be 2) */ + if (atomic_load(&sst->refcount) == 2) + { + candidates[candidate_count].sst = sst; + candidates[candidate_count].last_access = + atomic_load(&sst->last_access_time); + candidate_count++; + } + else + { + /* someone else is using it, we must release our ref */ + tidesdb_sstable_unref(db, sst); + } + } + } + + atomic_fetch_sub_explicit(&lvl->array_readers, 1, memory_order_release); + } + } + pthread_rwlock_unlock(&db->cf_list_lock); + + /* if shutdown was requested during scan, release any acquired refs and exit */ + if (shutdown_requested) + { + for (int i = 0; i < candidate_count; i++) + { + tidesdb_sstable_unref(db, candidates[i].sst); + } + if (!use_stack) free(candidates); + break; + } + + if (!atomic_load(&db->reaper_active)) + { + if (!use_stack) free(candidates); + break; + } + + if (candidate_count == 0) + { + if (!use_stack) free(candidates); + continue; + } + + qsort(candidates, candidate_count, sizeof(sstable_candidate_t), compare_sstable_candidates); + + int to_close = (int)(candidate_count * TDB_SSTABLE_REAPER_EVICT_RATIO); + if (to_close == 0 && 
candidate_count > 0) to_close = 1; /* close at least 1 */ + + int closed_count = 0; + for (int i = 0; i < to_close && i < candidate_count; i++) + { + tidesdb_sstable_t *sst = candidates[i].sst; + + /*** we atomically CAS refcount from the baseline (1 original ref + 1 reaper + ** ref still held) to TDB_REFCOUNT_EVICTING (-1). this prevents concurrent + * try_ref from succeeding during the close window, fixing the TOCTOU race + *** between refcount check and close. the baseline matches the drain path's + *** "1 original + 1 work ref" semantic, so we reuse the same constant. */ + int expected = TDB_REFCOUNT_DRAIN_BASELINE; + if (sst->klog_bm && + atomic_compare_exchange_strong(&sst->refcount, &expected, TDB_REFCOUNT_EVICTING)) + { + block_manager_close(sst->klog_bm); + sst->klog_bm = NULL; + /* the vlog is opened lazily, so it may not be open; close it if it is */ + if (sst->vlog_bm) + { + block_manager_close(sst->vlog_bm); + sst->vlog_bm = NULL; + } + atomic_fetch_sub(&db->num_open_sstables, 1); + closed_count++; + + /** we restore refcount to the baseline (base ref + reaper ref still held) + * reaper will unref in the cleanup loop below */ + atomic_store(&sst->refcount, TDB_REFCOUNT_DRAIN_BASELINE); + } + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Reaper closed %d/%d SSTables, %d now open", closed_count, + to_close, atomic_load(&db->num_open_sstables)); + + /* we release all candidate refcounts */ + for (int i = 0; i < candidate_count; i++) + { + tidesdb_sstable_unref(db, candidates[i].sst); + } + + if (!use_stack) free(candidates); + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Reaper thread stopped"); + return NULL; +} + +int tidesdb_register_comparator(tidesdb_t *db, const char *name, skip_list_comparator_fn fn, + const char *ctx_str, void *ctx) +{ + if (!db || !name || !fn) return TDB_ERR_INVALID_ARGS; + if (strlen(name) >= TDB_MAX_COMPARATOR_NAME) return TDB_ERR_INVALID_ARGS; + + while (1) + { + tidesdb_comparator_entry_t *old_array = + atomic_load_explicit(&db->comparators, 
memory_order_acquire); + int old_count = atomic_load_explicit(&db->num_comparators, memory_order_acquire); + int old_capacity = atomic_load_explicit(&db->comparators_capacity, memory_order_acquire); + + /* we check for duplicate name */ + for (int i = 0; i < old_count; i++) + { + if (strcmp(old_array[i].name, name) == 0) + { + return TDB_ERR_INVALID_ARGS; /* duplicate name */ + } + } + + int new_capacity = old_capacity; + if (old_count >= old_capacity) + { + new_capacity = old_capacity * 2; + } + + tidesdb_comparator_entry_t *new_array = + malloc(new_capacity * sizeof(tidesdb_comparator_entry_t)); + if (!new_array) return TDB_ERR_MEMORY; + + if (old_count > 0) + { + memcpy(new_array, old_array, old_count * sizeof(tidesdb_comparator_entry_t)); + } + + tidesdb_comparator_entry_t *entry = &new_array[old_count]; + strncpy(entry->name, name, TDB_MAX_COMPARATOR_NAME - 1); + entry->name[TDB_MAX_COMPARATOR_NAME - 1] = '\0'; + entry->fn = fn; + entry->ctx = ctx; + + if (ctx_str && strlen(ctx_str) > 0) + { + strncpy(entry->ctx_str, ctx_str, TDB_MAX_COMPARATOR_CTX - 1); + entry->ctx_str[TDB_MAX_COMPARATOR_CTX - 1] = '\0'; + } + else + { + entry->ctx_str[0] = '\0'; + } + + if (atomic_compare_exchange_strong_explicit(&db->comparators, &old_array, new_array, + memory_order_release, memory_order_acquire)) + { + /* success! 
update count and capacity */ + atomic_store_explicit(&db->num_comparators, old_count + 1, memory_order_release); + atomic_store_explicit(&db->comparators_capacity, new_capacity, memory_order_release); + + free(old_array); + return TDB_SUCCESS; + } + + /* CAS failed, another thread modified array, retry */ + free(new_array); + } +} + +int tidesdb_get_comparator(tidesdb_t *db, const char *name, skip_list_comparator_fn *fn, void **ctx) +{ + if (!db || !name) return TDB_ERR_INVALID_ARGS; + + tidesdb_comparator_entry_t *array = + atomic_load_explicit(&db->comparators, memory_order_acquire); + int count = atomic_load_explicit(&db->num_comparators, memory_order_acquire); + + for (int i = 0; i < count; i++) + { + if (strcmp(array[i].name, name) == 0) + { + if (fn) *fn = array[i].fn; + if (ctx) *ctx = array[i].ctx; + return TDB_SUCCESS; + } + } + + return TDB_ERR_NOT_FOUND; +} + +/** + * tidesdb_ensure_btree_node_cache + * lazily create the btree node cache the first time a btree column family is + * seen. a database with no btree column family never pays for this cache, which + * matters when block_cache_size is large since clock_cache_create preallocates + * its partition slot and hash index tables. safe to call repeatedly and from + * multiple threads -- the one time creation is guarded by btree_cache_lock. 
+ * @param db database instance + */ +static void tidesdb_ensure_btree_node_cache(tidesdb_t *db) +{ + if (!db || db->resolved_block_cache_size == 0) return; + if (db->btree_node_cache) return; /* already created -- avoid the lock */ + + pthread_mutex_lock(&db->btree_cache_lock); + if (!db->btree_node_cache) + { + db->btree_node_cache = btree_create_node_cache(db->resolved_block_cache_size); + if (db->btree_node_cache) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "B+tree node cache created on first btree column family with " + "max_bytes=%.2f MB", + (double)db->resolved_block_cache_size / (1024 * 1024)); + } + } + pthread_mutex_unlock(&db->btree_cache_lock); +} + +long tidesdb_raise_open_file_limit(long desired) +{ + return tdb_raise_max_open_files(desired); +} + +int tidesdb_open(const tidesdb_config_t *config, tidesdb_t **db) +{ + /* we auto-initialize with system allocator if not already initialized */ + tidesdb_ensure_initialized(); + + if (!config || !db) return TDB_ERR_INVALID_ARGS; + + *db = calloc(1, sizeof(tidesdb_t)); + if (!*db) + { + return TDB_ERR_MEMORY; + } + + (*db)->db_path = tdb_strdup(config->db_path); + if (!(*db)->db_path) + { + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + + memcpy(&(*db)->config, config, sizeof(tidesdb_config_t)); + + /* normalize the flush pool sizing. num_flush_threads must be positive + * and max_concurrent_flushes is pinned 1:1 to it -- a higher cap is + * meaningless because the pool is the upper bound, a lower cap leaves + * workers idle, so any deviation gets a warning and is corrected. 
+ * subsequent code reads from the owned copy via the rebind below */ + if ((*db)->config.num_flush_threads <= 0) + (*db)->config.num_flush_threads = TDB_DEFAULT_FLUSH_THREAD_POOL_SIZE; + if ((*db)->config.max_concurrent_flushes <= 0) + (*db)->config.max_concurrent_flushes = (*db)->config.num_flush_threads; + else if ((*db)->config.max_concurrent_flushes != (*db)->config.num_flush_threads) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "max_concurrent_flushes (%d) does not match num_flush_threads (%d) -- " + "pinning to num_flush_threads", + (*db)->config.max_concurrent_flushes, (*db)->config.num_flush_threads); + (*db)->config.max_concurrent_flushes = (*db)->config.num_flush_threads; + } + + /* bound the sstable fd budget to the OS open-file limit. each open sstable holds two + * descriptors; if the configured cap would need more fds than the limit can honor, opens + * fail with EMFILE under load, so clamp it down and tell the operator to raise ulimit -n. + * the reserve leaves headroom for WALs, the manifest, object-store handles, and stdio. */ + { + const long fd_limit = tdb_max_open_files(); + long fd_budget_ssts = (fd_limit - TDB_FD_RESERVE_NON_SSTABLE) / TDB_FDS_PER_SSTABLE; + if (fd_budget_ssts < TDB_MIN_OPEN_SSTABLES) fd_budget_ssts = TDB_MIN_OPEN_SSTABLES; + if ((long)(*db)->config.max_open_sstables > fd_budget_ssts) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "max_open_sstables (%zu) exceeds what the open-file limit can honor " + "(%ld sstables for fd limit %ld) -- clamping. 
raise the process fd " + "limit (ulimit -n) to keep more sstables open", + (*db)->config.max_open_sstables, fd_budget_ssts, fd_limit); + (*db)->config.max_open_sstables = (size_t)fd_budget_ssts; + } + TDB_DEBUG_LOG(TDB_LOG_INFO, + "sstable fd budget set to max_open_sstables=%zu (up to %ld fds), process fd " + "limit=%ld", + (*db)->config.max_open_sstables, + (long)(*db)->config.max_open_sstables * TDB_FDS_PER_SSTABLE, fd_limit); + } + + /* subsequent reads in tidesdb_open should see the normalized values, so + * rebind the input config alias to point at the owned copy */ + config = &(*db)->config; + + /* object_store_config is a caller-owned pointer the user typically passes + * from a stack variable -- deep-copy it so the db keeps a stable view + * even after the caller's frame is gone */ + if (config->object_store_config) + { + tidesdb_objstore_config_t *owned = malloc(sizeof(tidesdb_objstore_config_t)); + if (!owned) + { + free((*db)->db_path); + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + memcpy(owned, config->object_store_config, sizeof(tidesdb_objstore_config_t)); + (*db)->config.object_store_config = owned; + + /* wal_upload_sync only takes effect when replicate_wal is on -- the WAL-close path + * checks replicate_wal first, so the sync flag is silently ignored otherwise. warn + * rather than fail so an over-specified config still opens. */ + if (owned->wal_upload_sync && !owned->replicate_wal) + { + TDB_DEBUG_LOG( + TDB_LOG_WARN, + "object store config wal_upload_sync=1 has no effect because " + "replicate_wal=0 (WAL is not replicated); enable replicate_wal to use it"); + } + } + + /* object store mode requires unified memtable!! 
*/ + if ((*db)->config.object_store != NULL && !(*db)->config.unified_memtable) + { + (*db)->config.unified_memtable = 1; + } + + /* we store connector reference for runtime access */ + (*db)->object_store = (*db)->config.object_store; + (*db)->local_cache = NULL; + + /* we initialize replica mode from config */ + atomic_init( + &(*db)->replica_mode, + ((*db)->config.object_store_config && (*db)->config.object_store_config->replica_mode) ? 1 + : 0); + atomic_init(&(*db)->replica_sync_thread_active, 0); + + _tidesdb_log_level = config->log_level; + + /* we initialize log file to NULL (stderr) by default. the log file globals + * are read by tidesdb_log_write under tidesdb_log_mutex, so writes here take + * the same lock to stay consistent when another db instance is logging. */ + (*db)->log_file = NULL; + pthread_mutex_lock(&tidesdb_log_mutex); + _tidesdb_log_file = NULL; + _tidesdb_log_truncate = 0; + _tidesdb_log_path[0] = '\0'; + pthread_mutex_unlock(&tidesdb_log_mutex); + + if (mkdir((*db)->db_path, TDB_DIR_PERMISSIONS) != 0 && errno != EEXIST) + { + fprintf(stderr, "Failed to create database directory %s: %s\n", (*db)->db_path, + strerror(errno)); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return TDB_ERR_IO; + } + + /* if log_to_file is enabled, open the log file in the database directory */ + if (config->log_to_file) + { + char log_path[TDB_MAX_PATH_LEN]; + snprintf(log_path, sizeof(log_path), "%s" PATH_SEPARATOR TDB_LOG_FILE, (*db)->db_path); + + (*db)->log_file = fopen(log_path, "a"); + if ((*db)->log_file) + { + /* we must set line buffering for better real-time logging */ + tdb_setlinebuf((*db)->log_file); + + /* we publish the log file globals under tidesdb_log_mutex so a + * concurrent logger never reads a half-updated file/path pair */ + pthread_mutex_lock(&tidesdb_log_mutex); + _tidesdb_log_file = (*db)->log_file; + _tidesdb_log_truncate = config->log_truncation_at; + if 
(_tidesdb_log_truncate > 0) + { + snprintf(_tidesdb_log_path, sizeof(_tidesdb_log_path), "%s", log_path); + } + pthread_mutex_unlock(&tidesdb_log_mutex); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Failed to open log file %s, falling back to default.", + log_path); + } + } + + const char *level_names[] = {"DEBUG", "INFO", "WARN", "ERROR", "FATAL", "NONE"}; + const char *level_str = + (_tidesdb_log_level >= TDB_LOG_DEBUG && _tidesdb_log_level <= TDB_LOG_FATAL) + ? level_names[_tidesdb_log_level] + : (_tidesdb_log_level == TDB_LOG_NONE ? "NONE" : "UNKNOWN"); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Opening TidesDB with path=%s, log_level=%s, workers=%d%s", + config->db_path, level_str, config->num_compaction_threads, + config->log_to_file ? ", logging to file" : ""); + + char lock_path[TDB_MAX_PATH_LEN]; + snprintf(lock_path, sizeof(lock_path), "%s" PATH_SEPARATOR TDB_LOCK_FILE, (*db)->db_path); + + int lock_result; + (*db)->lock_fd = tdb_open_lock_file(lock_path, &lock_result); + if ((*db)->lock_fd < 0) + { + if (lock_result == TDB_LOCK_HELD) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Database is locked by another process. Only one process can open a " + "database directory at a time."); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to open lock file: %s", lock_path); + } + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return (lock_result == TDB_LOCK_HELD) ? TDB_ERR_LOCKED : TDB_ERR_IO; + } + + lock_result = tdb_file_lock_exclusive((*db)->lock_fd, TDB_LOCK_DEFAULT_RETRIES); + if (lock_result != TDB_LOCK_SUCCESS) + { + if (lock_result == TDB_LOCK_HELD) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Database is locked by another process. 
Only one process can open a " + "database directory at a time."); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Failed to acquire database lock due to an irrecoverable error."); + } + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return (lock_result == TDB_LOCK_HELD) ? TDB_ERR_LOCKED : TDB_ERR_IO; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Acquired exclusive lock on database directory"); + + (*db)->cf_capacity = TDB_INITIAL_CF_CAPACITY; + tidesdb_column_family_t **cfs = calloc((*db)->cf_capacity, sizeof(tidesdb_column_family_t *)); + if (!cfs) + { + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + (*db)->column_families = cfs; + (*db)->num_column_families = 0; + + atomic_init(&(*db)->is_open, 0); + atomic_init(&(*db)->cancel_compaction, 0); + atomic_init(&(*db)->is_recovering, 1); + + if (pthread_rwlock_init(&(*db)->cf_list_lock, NULL) != 0) + { + free(cfs); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + + /* initialized before recovery -- a recovered btree column family triggers + * lazy creation of btree_node_cache, which takes this lock */ + pthread_mutex_init(&(*db)->btree_cache_lock, NULL); + + pthread_mutex_init(&(*db)->compaction_gate_lock, NULL); + (*db)->compaction_paused = 0; + atomic_init(&(*db)->active_compactions, 0); + + tidesdb_comparator_entry_t *initial_comparators = + calloc(TDB_INITIAL_COMPARATOR_CAPACITY, sizeof(tidesdb_comparator_entry_t)); + if (!initial_comparators) + { + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + 
free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + atomic_init(&(*db)->comparators, initial_comparators); + atomic_init(&(*db)->num_comparators, 0); + atomic_init(&(*db)->comparators_capacity, TDB_INITIAL_COMPARATOR_CAPACITY); + + tidesdb_register_comparator(*db, "memcmp", skip_list_comparator_memcmp, NULL, NULL); + tidesdb_register_comparator(*db, "lexicographic", tidesdb_comparator_lexicographic, NULL, NULL); + tidesdb_register_comparator(*db, "uint64", tidesdb_comparator_uint64, NULL, NULL); + tidesdb_register_comparator(*db, "int64", tidesdb_comparator_int64, NULL, NULL); + tidesdb_register_comparator(*db, "reverse", tidesdb_comparator_reverse_memcmp, NULL, NULL); + tidesdb_register_comparator(*db, "case_insensitive", tidesdb_comparator_case_insensitive, NULL, + NULL); + + (*db)->flush_queue = queue_new(); + (*db)->compaction_queue = queue_new(); + /* sub-compaction helper-thread budget-- a parallel compaction round borrows up to this + * many ephemeral helpers, so total sub-merge threads across CFs stay within the pool */ + atomic_init(&(*db)->compaction_helper_budget, config->num_compaction_threads); + + if (!(*db)->flush_queue || !(*db)->compaction_queue) + { + if ((*db)->flush_queue) queue_free((*db)->flush_queue); + if ((*db)->compaction_queue) queue_free((*db)->compaction_queue); + free(initial_comparators); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + + atomic_init(&(*db)->next_txn_id, 1); + atomic_init(&(*db)->global_seq, 1); + atomic_init(&(*db)->num_open_sstables, 0); + + (*db)->commit_status = tidesdb_commit_status_create(); + if (!(*db)->commit_status) + { + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + 
free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + + if (pthread_rwlock_init(&(*db)->active_txns_lock, NULL) != 0) + { + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + /* we start with larger capacity to avoid realloc under lock */ + (*db)->active_txns_capacity = TDB_ACTIVE_TXN_INITIAL_CAPACITY; + (*db)->active_txns = calloc((*db)->active_txns_capacity, sizeof(tidesdb_txn_t *)); + if (!(*db)->active_txns) + { + pthread_rwlock_destroy(&(*db)->active_txns_lock); + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + (*db)->num_active_txns = 0; + + uint64_t initial_space = 0; + if (tdb_get_available_disk_space((*db)->db_path, &initial_space) == 0) + { + atomic_init(&(*db)->cached_available_disk_space, initial_space); + TDB_DEBUG_LOG(TDB_LOG_INFO, "Initial available disk space is %" PRIu64 " bytes", + initial_space); + } + else + { + /* failed to get disk space, set to 0 to trigger checks */ + atomic_init(&(*db)->cached_available_disk_space, 0); + TDB_DEBUG_LOG(TDB_LOG_WARN, "Failed to get initial disk space"); + } + 
atomic_init(&(*db)->last_disk_space_check, time(NULL)); + + (*db)->total_memory = get_total_memory(); + (*db)->available_memory = get_available_memory(); + if ((*db)->total_memory > 0 && (*db)->available_memory > 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "System memory is total=%" PRIu64 " bytes, available=%" PRIu64 " bytes", + (uint64_t)(*db)->total_memory, (uint64_t)(*db)->available_memory); + + /* resolve global memory limit */ + size_t min_limit = (size_t)((double)(*db)->total_memory * TDB_MEMORY_MIN_LIMIT_RATIO); + if (config->max_memory_usage > 0) + { + (*db)->resolved_memory_limit = config->max_memory_usage; + if ((*db)->resolved_memory_limit < min_limit) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "max_memory_usage %zu bytes (%.2f MB) is below minimum " + "%.0f%% of total RAM (%zu bytes, %.2f MB) -- clamping to minimum", + (*db)->resolved_memory_limit, + (double)(*db)->resolved_memory_limit / (1024.0 * 1024.0), + TDB_MEMORY_MIN_LIMIT_RATIO * 100.0, min_limit, + (double)min_limit / (1024.0 * 1024.0)); + (*db)->resolved_memory_limit = min_limit; + } + } + else + { + (*db)->resolved_memory_limit = + (size_t)((double)(*db)->total_memory * TDB_MEMORY_AUTO_LIMIT_RATIO); + } + TDB_DEBUG_LOG(TDB_LOG_INFO, "Resolved memory limit %zu bytes (%.2f MB)", + (*db)->resolved_memory_limit, + (double)(*db)->resolved_memory_limit / (1024.0 * 1024.0)); + + /* push the single-block memory-safety budget down to the block manager so + * the read path can refuse an oversized block via a pure atomic load */ + block_manager_set_max_safe_block_bytes((*db)->resolved_memory_limit / + TDB_MEMORY_MAX_BLOCK_FRACTION_DENOM); + + atomic_init(&(*db)->cached_memtable_bytes, 0); + atomic_init(&(*db)->txn_memory_bytes, 0); + atomic_init(&(*db)->memory_pressure_level, TDB_MEMORY_PRESSURE_NORMAL); + atomic_init(&(*db)->flush_pending_count, 0); + atomic_init(&(*db)->active_flushes, 0); + atomic_init(&(*db)->flush_heartbeat, 0); + (*db)->os_check_counter = 0; + } + else + { + TDB_DEBUG_LOG(TDB_LOG_WARN, 
"Failed to get system memory information"); + free((*db)->active_txns); + pthread_rwlock_destroy(&(*db)->active_txns_lock); + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + + /* we validate total cache size against resolved_memory_limit to prevent + * pathological configs where caches alone consume the entire memory budget. + * both clock_cache and btree_node_cache use block_cache_size, so total is 2x */ + size_t effective_block_cache_size = config->block_cache_size; + if (effective_block_cache_size > 0) + { + const size_t total_cache = effective_block_cache_size * TDB_BLOCK_CACHE_INSTANCES; + const size_t mem_limit = (*db)->resolved_memory_limit; + if (mem_limit > 0 && total_cache > mem_limit) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "block_cache_size %zu (total cache %zu bytes with btree cache) " + "exceeds resolved_memory_limit %zu bytes -- clamping", + effective_block_cache_size, total_cache, mem_limit); + /* we clamp so both caches together use at most TDB_BLOCK_CACHE_MEM_FRACTION + * of the memory limit, leaving headroom for memtables, bloom filters, and + * write ops */ + effective_block_cache_size = (size_t)((double)mem_limit * TDB_BLOCK_CACHE_MEM_FRACTION / + (double)TDB_BLOCK_CACHE_INSTANCES); + TDB_DEBUG_LOG(TDB_LOG_INFO, "Clamped block_cache_size to %zu bytes", + effective_block_cache_size); + } + } + + if (effective_block_cache_size > 0) + { + cache_config_t cache_config = {0}; + clock_cache_compute_config(effective_block_cache_size, &cache_config); + cache_config.evict_callback = tidesdb_cache_evict_block; /* ref-counted block cleanup */ + + (*db)->clock_cache = 
clock_cache_create(&cache_config); + if (!(*db)->clock_cache) + { + free((*db)->active_txns); + pthread_rwlock_destroy(&(*db)->active_txns_lock); + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + TDB_DEBUG_LOG(TDB_LOG_INFO, "Block clock cache created with max_bytes=%.2f MB", + (double)effective_block_cache_size / (1024 * 1024)); + } + else + { + (*db)->clock_cache = NULL; + TDB_DEBUG_LOG(TDB_LOG_INFO, "Block clock cache disabled (block_cache_size=0)"); + } + + /* the btree node cache is created lazily on the first btree column family + * (see tidesdb_ensure_btree_node_cache) -- a database with no btree column + * family must not preallocate it, which for a large block_cache_size is a + * significant amount of wasted slot and hash index memory */ + (*db)->btree_node_cache = NULL; + (*db)->resolved_block_cache_size = effective_block_cache_size; + + /*** we initialize cached_current_time before recovery so skip lists created during + ** recovery have a valid time pointer for TTL checks + * use seq_cst for strongest memory ordering on all platforms */ + atomic_store_explicit(&(*db)->cached_current_time, tdb_get_current_time(), + memory_order_seq_cst); + + /** we initialize unified memtable state (use (*db)->config which may have been + * modified by object store enforcement above, not the original config pointer) */ + (*db)->unified_mt.enabled = (*db)->config.unified_memtable; + if ((*db)->unified_mt.enabled) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Unified memtable mode enabled"); + + (*db)->unified_mt.write_buffer_size = config->unified_memtable_write_buffer_size > 0 + ? 
config->unified_memtable_write_buffer_size + : TDB_DEFAULT_WRITE_BUFFER_SIZE; + + (*db)->unified_mt.immutables = queue_new(); + if (!(*db)->unified_mt.immutables) + { + free((*db)->active_txns); + pthread_rwlock_destroy(&(*db)->active_txns_lock); + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + if ((*db)->clock_cache) clock_cache_destroy((*db)->clock_cache); + if ((*db)->btree_node_cache) clock_cache_destroy((*db)->btree_node_cache); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + + atomic_init(&(*db)->unified_mt.active_mt_readers, 0); + atomic_init(&(*db)->unified_mt.is_flushing, 0); + atomic_init(&(*db)->unified_mt.immutable_cleanup_counter, 0); + atomic_init(&(*db)->unified_mt.next_cf_index, 0); + atomic_init(&(*db)->unified_mt.wal_generation, 0); + /* we resolve skip list config with defaults */ + const int umt_max_level = config->unified_memtable_skip_list_max_level > 0 + ? config->unified_memtable_skip_list_max_level + : TDB_SKIP_LIST_MAX_LEVEL; + const float umt_probability = config->unified_memtable_skip_list_probability > 0.0f + ? 
config->unified_memtable_skip_list_probability + : TDB_SKIP_LIST_PROBABILITY; + /* the unified WAL is opened without block-manager self-sync; durability is owned by + * the commit-path group fsync (FULL) or the sync worker (INTERVAL) */ + const int umt_sync_mode = BLOCK_MANAGER_SYNC_NONE; + + /* we create the initial unified skip_list + WAL */ + skip_list_t *umt_sl = NULL; + if (skip_list_new_with_arena(&umt_sl, umt_max_level, umt_probability, + skip_list_comparator_memcmp, NULL, &(*db)->cached_current_time, + (*db)->unified_mt.write_buffer_size * 2) != 0) + { + queue_free((*db)->unified_mt.immutables); + free((*db)->active_txns); + pthread_rwlock_destroy(&(*db)->active_txns_lock); + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + if ((*db)->clock_cache) clock_cache_destroy((*db)->clock_cache); + if ((*db)->btree_node_cache) clock_cache_destroy((*db)->btree_node_cache); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + + /*** the active unified memtable's wal is the highest-generation + ** uwal_*.log in db_path -- rotation always allocates a strictly higher + * generation, so on a crash-reopen the highest existing file is the + ** wal that was active. we adopt it -- open without truncating, + *** validate to trim any preallocation tail -- so recovery can replay it + ** in place. lower-generation uwals are recovered separately. a fresh + * db has no uwal files, so we fall back to creating uwal_0.log. 
*/ + uint64_t active_uwal_gen = 0; + int have_existing_uwal = 0; + DIR *uwal_scan = opendir((*db)->db_path); + if (uwal_scan) + { + struct dirent *ue; + while ((ue = readdir(uwal_scan)) != NULL) + { + uint64_t ugen = 0; + if (tdb_parse_unified_wal_gen(ue->d_name, &ugen)) + { + if (!have_existing_uwal || ugen > active_uwal_gen) + { + active_uwal_gen = ugen; + have_existing_uwal = 1; + } + } + } + closedir(uwal_scan); + } + + char uwal_path[TDB_MAX_PATH_LEN]; + snprintf(uwal_path, sizeof(uwal_path), + "%s" PATH_SEPARATOR TDB_UNIFIED_WAL_PREFIX TDB_U64_FMT TDB_WAL_EXT, (*db)->db_path, + TDB_U64_CAST(active_uwal_gen)); + + block_manager_t *uwal = NULL; + int uwal_open_failed = (block_manager_open(&uwal, uwal_path, umt_sync_mode) != 0); + if (!uwal_open_failed) + { + /* adopt an existing uwal -- validate (permissive) to trim the + * preallocation tail; a fresh db's uwal_0.log gets truncated empty */ + if (have_existing_uwal) + uwal_open_failed = (block_manager_validate_last_block( + uwal, BLOCK_MANAGER_PERMISSIVE_BLOCK_VALIDATION) != 0); + else + uwal_open_failed = (block_manager_truncate(uwal) != 0); + } + if (uwal_open_failed) + { + if (uwal) block_manager_close(uwal); + skip_list_free(umt_sl); + queue_free((*db)->unified_mt.immutables); + free((*db)->active_txns); + pthread_rwlock_destroy(&(*db)->active_txns_lock); + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + if ((*db)->clock_cache) clock_cache_destroy((*db)->clock_cache); + if ((*db)->btree_node_cache) clock_cache_destroy((*db)->btree_node_cache); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return TDB_ERR_IO; + } + + tidesdb_memtable_t *umt = malloc(sizeof(tidesdb_memtable_t)); + if (!umt) + { + 
block_manager_close(uwal); + skip_list_free(umt_sl); + queue_free((*db)->unified_mt.immutables); + free((*db)->active_txns); + pthread_rwlock_destroy(&(*db)->active_txns_lock); + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + if ((*db)->clock_cache) clock_cache_destroy((*db)->clock_cache); + if ((*db)->btree_node_cache) clock_cache_destroy((*db)->btree_node_cache); + free((void *)(*db)->config.object_store_config); + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + umt->skip_list = umt_sl; + umt->wal = uwal; + umt->id = 0; + /* generation matches the adopted uwal file; wal_generation tracks it so + * the next rotation allocates uwal_ and never collides */ + umt->generation = active_uwal_gen; + atomic_init(&umt->refcount, 1); + atomic_init(&umt->writers, 0); + atomic_init(&umt->flushed, 0); + atomic_init(&(*db)->unified_mt.active, umt); + atomic_store_explicit(&(*db)->unified_mt.wal_generation, active_uwal_gen, + memory_order_relaxed); + + /* the unified cf index map keeps each cf's key prefix stable across + * reopen. without it a crash-reopen reassigns indexes by directory + * scan order and unified wal recovery replays under the wrong cf. 
*/ + (*db)->unified_mt.cf_index_map = NULL; + (*db)->unified_mt.cf_index_map_count = 0; + (*db)->unified_mt.cf_index_map_capacity = 0; + pthread_mutex_init(&(*db)->unified_mt.cf_index_map_lock, NULL); + pthread_mutex_init(&(*db)->unified_mt.wal_group_sync_lock, NULL); + pthread_cond_init(&(*db)->unified_mt.wal_group_sync_cond, NULL); + /* a cold-started node (no local UNIMAP) pulls the map from the object + * store so its cf indexes match the primary that wrote the uploaded + * unified wal; a node with a local map keeps its own */ + tidesdb_unimap_objstore_pull(*db, 0); + if (tidesdb_unimap_load(*db) != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Failed to load UNIMAP, unified cf indexes may be reassigned"); + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Unified memtable initialized (write_buffer=%zu bytes, WAL=%s)", + (*db)->unified_mt.write_buffer_size, uwal_path); + } + else + { + memset(&(*db)->unified_mt, 0, sizeof((*db)->unified_mt)); + } + + int rc = tidesdb_recover_database(*db); + if (rc != TDB_SUCCESS) + { + if ((*db)->unified_mt.enabled) + { + tidesdb_memtable_t *umt = atomic_load(&(*db)->unified_mt.active); + if (umt) + { + if (umt->skip_list) skip_list_free(umt->skip_list); + if (umt->wal) block_manager_close(umt->wal); + free(umt); + } + queue_free((*db)->unified_mt.immutables); + tidesdb_unimap_free(*db); + } + free((*db)->active_txns); + pthread_rwlock_destroy(&(*db)->active_txns_lock); + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + if ((*db)->clock_cache) clock_cache_destroy((*db)->clock_cache); + if ((*db)->btree_node_cache) clock_cache_destroy((*db)->btree_node_cache); + free((void *)(*db)->config.object_store_config); + if ((*db)->unified_mt.enabled) + { + 
pthread_mutex_destroy(&(*db)->unified_mt.cf_index_map_lock); + pthread_mutex_destroy(&(*db)->unified_mt.wal_group_sync_lock); + pthread_cond_destroy(&(*db)->unified_mt.wal_group_sync_cond); + } + free(*db); + *db = NULL; + return rc; + } + + (*db)->flush_threads = malloc(config->num_flush_threads * sizeof(pthread_t)); + if (!(*db)->flush_threads) + { + clock_cache_destroy((*db)->clock_cache); + if ((*db)->btree_node_cache) clock_cache_destroy((*db)->btree_node_cache); + free((*db)->active_txns); + pthread_rwlock_destroy(&(*db)->active_txns_lock); + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + if ((*db)->unified_mt.enabled) + { + pthread_mutex_destroy(&(*db)->unified_mt.cf_index_map_lock); + pthread_mutex_destroy(&(*db)->unified_mt.wal_group_sync_lock); + pthread_cond_destroy(&(*db)->unified_mt.wal_group_sync_cond); + } + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + + for (int i = 0; i < config->num_flush_threads; i++) + { + tidesdb_worker_thread_arg_t *flush_arg = malloc(sizeof(tidesdb_worker_thread_arg_t)); + if (!flush_arg) + { + queue_shutdown((*db)->flush_queue); + for (int j = 0; j < i; j++) pthread_join((*db)->flush_threads[j], NULL); + free((*db)->flush_threads); + clock_cache_destroy((*db)->clock_cache); + if ((*db)->btree_node_cache) clock_cache_destroy((*db)->btree_node_cache); + free((*db)->active_txns); + pthread_rwlock_destroy(&(*db)->active_txns_lock); + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + 
tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + if ((*db)->unified_mt.enabled) + { + pthread_mutex_destroy(&(*db)->unified_mt.cf_index_map_lock); + pthread_mutex_destroy(&(*db)->unified_mt.wal_group_sync_lock); + pthread_cond_destroy(&(*db)->unified_mt.wal_group_sync_cond); + } + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + flush_arg->db = *db; + flush_arg->index = i; + if (pthread_create(&(*db)->flush_threads[i], NULL, tidesdb_flush_worker_thread, + flush_arg) != 0) + { + free(flush_arg); + queue_shutdown((*db)->flush_queue); + for (int j = 0; j < i; j++) + { + pthread_join((*db)->flush_threads[j], NULL); + } + free((*db)->flush_threads); + clock_cache_destroy((*db)->clock_cache); + if ((*db)->btree_node_cache) clock_cache_destroy((*db)->btree_node_cache); + free((*db)->active_txns); + pthread_rwlock_destroy(&(*db)->active_txns_lock); + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + if ((*db)->unified_mt.enabled) + { + pthread_mutex_destroy(&(*db)->unified_mt.cf_index_map_lock); + pthread_mutex_destroy(&(*db)->unified_mt.wal_group_sync_lock); + pthread_cond_destroy(&(*db)->unified_mt.wal_group_sync_cond); + } + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + } + + (*db)->compaction_threads = malloc(config->num_compaction_threads * sizeof(pthread_t)); + if (!(*db)->compaction_threads) + { + queue_shutdown((*db)->flush_queue); + for (int i = 0; i < config->num_flush_threads; i++) + { + pthread_join((*db)->flush_threads[i], NULL); + } + free((*db)->flush_threads); + clock_cache_destroy((*db)->clock_cache); + if 
((*db)->btree_node_cache) clock_cache_destroy((*db)->btree_node_cache); + free((*db)->active_txns); + pthread_rwlock_destroy(&(*db)->active_txns_lock); + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + if ((*db)->unified_mt.enabled) + { + pthread_mutex_destroy(&(*db)->unified_mt.cf_index_map_lock); + pthread_mutex_destroy(&(*db)->unified_mt.wal_group_sync_lock); + pthread_cond_destroy(&(*db)->unified_mt.wal_group_sync_cond); + } + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + + for (int i = 0; i < config->num_compaction_threads; i++) + { + tidesdb_worker_thread_arg_t *compact_arg = malloc(sizeof(tidesdb_worker_thread_arg_t)); + if (!compact_arg) + { + queue_shutdown((*db)->compaction_queue); + for (int j = 0; j < i; j++) pthread_join((*db)->compaction_threads[j], NULL); + free((*db)->compaction_threads); + queue_shutdown((*db)->flush_queue); + for (int k = 0; k < config->num_flush_threads; k++) + pthread_join((*db)->flush_threads[k], NULL); + free((*db)->flush_threads); + clock_cache_destroy((*db)->clock_cache); + if ((*db)->btree_node_cache) clock_cache_destroy((*db)->btree_node_cache); + free((*db)->active_txns); + pthread_rwlock_destroy(&(*db)->active_txns_lock); + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + if ((*db)->unified_mt.enabled) + { + 
pthread_mutex_destroy(&(*db)->unified_mt.cf_index_map_lock); + pthread_mutex_destroy(&(*db)->unified_mt.wal_group_sync_lock); + pthread_cond_destroy(&(*db)->unified_mt.wal_group_sync_cond); + } + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + compact_arg->db = *db; + compact_arg->index = i; + if (pthread_create(&(*db)->compaction_threads[i], NULL, tidesdb_compaction_worker_thread, + compact_arg) != 0) + { + free(compact_arg); + queue_shutdown((*db)->compaction_queue); + for (int j = 0; j < i; j++) + { + pthread_join((*db)->compaction_threads[j], NULL); + } + free((*db)->compaction_threads); + + queue_shutdown((*db)->flush_queue); + for (int k = 0; k < config->num_flush_threads; k++) + { + pthread_join((*db)->flush_threads[k], NULL); + } + free((*db)->flush_threads); + clock_cache_destroy((*db)->clock_cache); + if ((*db)->btree_node_cache) clock_cache_destroy((*db)->btree_node_cache); + free((*db)->active_txns); + pthread_rwlock_destroy(&(*db)->active_txns_lock); + tidesdb_commit_status_destroy((*db)->commit_status); + queue_free((*db)->flush_queue); + queue_free((*db)->compaction_queue); + free(atomic_load(&(*db)->comparators)); + pthread_rwlock_destroy(&(*db)->cf_list_lock); + free((*db)->column_families); + tdb_file_unlock((*db)->lock_fd); + close((*db)->lock_fd); + free((*db)->db_path); + free((void *)(*db)->config.object_store_config); + if ((*db)->unified_mt.enabled) + { + pthread_mutex_destroy(&(*db)->unified_mt.cf_index_map_lock); + pthread_mutex_destroy(&(*db)->unified_mt.wal_group_sync_lock); + pthread_cond_destroy(&(*db)->unified_mt.wal_group_sync_cond); + } + free(*db); + *db = NULL; + return TDB_ERR_MEMORY; + } + } + + /* we check if any CF needs interval syncing and start sync thread if needed */ + int needs_sync_thread = 0; + pthread_rwlock_rdlock(&(*db)->cf_list_lock); + for (int i = 0; i < (*db)->num_column_families; i++) + { + if ((*db)->column_families[i] && + (*db)->column_families[i]->config.sync_mode == TDB_SYNC_INTERVAL && + 
(*db)->column_families[i]->config.sync_interval_us > 0) + { + needs_sync_thread = 1; + break; + } + } + pthread_rwlock_unlock(&(*db)->cf_list_lock); + + /* the unified WAL in interval sync mode also needs the sync worker */ + if ((*db)->unified_mt.enabled && (*db)->config.unified_memtable_sync_mode == TDB_SYNC_INTERVAL) + { + needs_sync_thread = 1; + } + + pthread_mutex_init(&(*db)->sync_thread_mutex, NULL); +#if defined(__linux__) + { + pthread_condattr_t cattr; + pthread_condattr_init(&cattr); + pthread_condattr_setclock(&cattr, CLOCK_MONOTONIC); + pthread_cond_init(&(*db)->sync_thread_cond, &cattr); + pthread_condattr_destroy(&cattr); + } +#else + pthread_cond_init(&(*db)->sync_thread_cond, NULL); +#endif + + /* create the btree node cache now if any recovered column family uses btree */ + for (int i = 0; i < (*db)->num_column_families; i++) + { + tidesdb_column_family_t *bcf = (*db)->column_families[i]; + if (bcf && bcf->config.use_btree) + { + tidesdb_ensure_btree_node_cache(*db); + break; + } + } + + if (needs_sync_thread && !atomic_load(&(*db)->sync_thread_active)) + { + /* we only start if not already started during recovery by tidesdb_create_column_family */ + atomic_store(&(*db)->sync_thread_active, 1); + if (pthread_create(&(*db)->sync_thread, NULL, tidesdb_sync_worker_thread, *db) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_FATAL, + "Failed to create sync worker thread -- cannot honor sync_interval_us " + "durability guarantee, refusing to open"); + atomic_store(&(*db)->sync_thread_active, 0); + /* tidesdb_close destroys sync_thread_mutex and sync_thread_cond + * unconditionally -- destroying them here too would double destroy */ + tidesdb_close(*db); + *db = NULL; + return TDB_ERR_IO; + } + else + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Sync worker thread created"); + } + } + else if (!needs_sync_thread && !atomic_load(&(*db)->sync_thread_active)) + { + atomic_store(&(*db)->sync_thread_active, 0); + } + + pthread_mutex_init(&(*db)->reaper_thread_mutex, NULL); +#if 
defined(__linux__) + { + pthread_condattr_t cattr; + pthread_condattr_init(&cattr); + pthread_condattr_setclock(&cattr, CLOCK_MONOTONIC); + pthread_cond_init(&(*db)->reaper_thread_cond, &cattr); + pthread_condattr_destroy(&cattr); + } +#else + pthread_cond_init(&(*db)->reaper_thread_cond, NULL); +#endif + atomic_init(&(*db)->deferred_free_list, NULL); + + atomic_store(&(*db)->reaper_active, 1); + if (pthread_create(&(*db)->reaper_thread, NULL, tidesdb_reaper_thread, *db) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to create reaper thread"); + atomic_store(&(*db)->reaper_active, 0); + pthread_mutex_destroy(&(*db)->reaper_thread_mutex); + pthread_cond_destroy(&(*db)->reaper_thread_cond); + /* non-fatal, continue without reaper thread */ + } + else + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Reaper thread created"); + } + + /* we initialize local file cache for object store mode */ + if ((*db)->object_store) + { + const char *cache_dir = ((*db)->config.object_store_config && + (*db)->config.object_store_config->local_cache_path) + ? (*db)->config.object_store_config->local_cache_path + : (*db)->db_path; + size_t cache_max = (*db)->config.object_store_config + ? (*db)->config.object_store_config->local_cache_max_bytes + : 0; + + (*db)->local_cache = calloc(1, sizeof(tdb_local_cache_t)); + if ((*db)->local_cache) + { + tdb_local_cache_init((*db)->local_cache, cache_dir, cache_max); + } + + /* we initialize async upload pipeline. + * in replica mode one slot of the configured upload budget funds the + * dedicated replica sync thread instead of an upload worker. a replica + * downloads and replays rather than uploads, so its upload pool is + * otherwise near-idle and the object store thread count is unchanged. + * the budget is floored so an applicable (replica) config always keeps + * at least two object store threads, one upload worker and one sync. */ + int num_upload_threads = (*db)->config.object_store_config + ? 
(*db)->config.object_store_config->max_concurrent_uploads + : 4; + if (num_upload_threads <= 0) num_upload_threads = 4; + + const int replica = atomic_load_explicit(&(*db)->replica_mode, memory_order_acquire); + if (replica && num_upload_threads > 1) num_upload_threads -= 1; + + (*db)->upload_queue = queue_new(); + atomic_init(&(*db)->last_uploaded_gen, 0); + atomic_init(&(*db)->total_uploads, 0); + atomic_init(&(*db)->total_upload_failures, 0); + (*db)->last_wal_sync_size = 0; + + if ((*db)->upload_queue) + { + (*db)->upload_threads = calloc(num_upload_threads, sizeof(pthread_t)); + if ((*db)->upload_threads) + { + /* count only the threads that actually start -- close joins + * num_upload_threads of them, and joining a never-created (zeroed) + * pthread_t is undefined behaviour */ + int created = 0; + for (int i = 0; i < num_upload_threads; i++) + { + if (pthread_create(&(*db)->upload_threads[created], NULL, + tdb_upload_worker_thread, *db) == 0) + created++; + } + (*db)->num_upload_threads = created; + if (created == 0) + { + free((*db)->upload_threads); + (*db)->upload_threads = NULL; + } + } + } + + /* replica mode -- spawn the dedicated MANIFEST/WAL sync thread that + * replaces the reaper's old inline (blocking) replica sync. 
*/
+        if (replica)
+        {
+            atomic_store(&(*db)->replica_sync_thread_active, 1);
+            if (pthread_create(&(*db)->replica_sync_thread, NULL, tidesdb_replica_sync_thread,
+                               *db) != 0)
+            {
+                TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to create replica sync thread");
+                atomic_store(&(*db)->replica_sync_thread_active, 0);
+            }
+        }
+
+        TDB_DEBUG_LOG(TDB_LOG_INFO,
+                      "Object store mode enabled (connector=%s, cache_dir=%s, "
+                      "upload_threads=%d, replica=%d)",
+                      tidesdb_objstore_backend_name((*db)->object_store->backend), cache_dir,
+                      num_upload_threads, replica);
+    }
+
+    atomic_store(&(*db)->is_open, 1);
+    atomic_store(&(*db)->is_recovering, 0);
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "Database is now open and ready for operations");
+
+    return TDB_SUCCESS;
+}
+
+/**
+ * tidesdb_close
+ * shuts a database handle down and frees it. in order -- marks the handle
+ * closed, flushes the unified and per-cf memtables, waits for queued flushes
+ * and in-progress compactions, stops the flush, compaction, sync, reaper,
+ * replica sync and upload threads, then releases the queues, column families,
+ * caches, directory lock, object store resources and the log file.
+ * the handle itself is freed, so it must not be used after this returns.
+ * @param db database handle
+ * @return TDB_SUCCESS, or TDB_ERR_INVALID_ARGS when db is NULL
+ */
+int tidesdb_close(tidesdb_t *db)
+{
+    if (!db) return TDB_ERR_INVALID_ARGS;
+
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "Closing TidesDB at path %s", db->db_path);
+    atomic_store(&db->is_open, 0);
+
+    /* we flush unified active memtable before close */
+    if (db->unified_mt.enabled)
+    {
+        tidesdb_memtable_t *umt =
+            atomic_load_explicit(&db->unified_mt.active, memory_order_acquire);
+        /* never flush on a replica close. the active memtable holds only remote-WAL-replay
+         * entries (transient, re-replayed on next open); flushing them creates an sstable +
+         * compaction that diverges from the primary's manifest. the upload gate already blocks
+         * the push, but skipping the flush also avoids the pointless local churn. */
+        if (!atomic_load_explicit(&db->replica_mode, memory_order_acquire) && umt &&
+            umt->skip_list && skip_list_count_entries(umt->skip_list) > 0)
+        {
+            TDB_DEBUG_LOG(TDB_LOG_INFO, "Flushing unified active memtable before close");
+            tidesdb_unified_flush_immutable(db, umt);
+        }
+
+        /* the same memtable pointer lives in unified_mt.immutables (for read scans)
+         * and in a flush_queue work item (for the worker). dequeueing from
+         * immutables does not remove the worker's reference, so calling
+         * tidesdb_unified_flush_immutable here without first letting the worker
+         * finish would race the worker into block_manager_close on the same WAL.
+         * wait for the worker to drain before draining immutables. */
+        if (db->flush_queue)
+        {
+            while (1)
+            {
+                size_t qsize = queue_size(db->flush_queue);
+                int pending = atomic_load_explicit(&db->flush_pending_count, memory_order_acquire);
+                if (qsize == 0 && pending <= 0) break;
+                usleep(TDB_CLOSE_TXN_WAIT_SLEEP_US);
+            }
+        }
+
+        /* we drain unified immutable queue */
+        if (db->unified_mt.immutables)
+        {
+            tidesdb_memtable_t *uimm;
+            while ((uimm = (tidesdb_memtable_t *)queue_dequeue(db->unified_mt.immutables)) != NULL)
+            {
+                if (!atomic_load_explicit(&uimm->flushed, memory_order_acquire))
+                {
+                    TDB_DEBUG_LOG(TDB_LOG_INFO, "Flushing unified immutable memtable before close");
+                    tidesdb_unified_flush_immutable(db, uimm);
+                }
+                /* drop the queue's ref -- unref frees the skip list, wal and
+                 * struct (all readers have stopped by the time close drains) */
+                tidesdb_immutable_memtable_unref(uimm);
+            }
+        }
+    }
+
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "Flushing all active memtables before close");
+    pthread_rwlock_rdlock(&db->cf_list_lock);
+    for (int i = 0; i < db->num_column_families; i++)
+    {
+        if (db->column_families[i])
+        {
+            tidesdb_column_family_t *cf = db->column_families[i];
+
+            /* we wait for any in-progress flush to complete, giving up after
+             * TDB_CLOSE_FLUSH_WAIT_MAX_ATTEMPTS sleeps */
+            int wait_count = 0;
+            while (tidesdb_is_flushing(cf) && wait_count < TDB_CLOSE_FLUSH_WAIT_MAX_ATTEMPTS)
+            {
+                usleep(TDB_CLOSE_FLUSH_WAIT_SLEEP_US);
+                wait_count++;
+                if (wait_count % 10 == 0)
+                {
+                    /* NOTE(review): the logged duration assumes each attempt sleeps
+                     * 10ms -- confirm against TDB_CLOSE_FLUSH_WAIT_SLEEP_US */
+                    TDB_DEBUG_LOG(
+                        TDB_LOG_INFO,
+                        "CF '%s' is waiting for in-progress flush to complete (waited %dms)",
+                        cf->name, wait_count * 10);
+                }
+            }
+
+            tidesdb_memtable_t *mt =
+                atomic_load_explicit(&cf->active_memtable, memory_order_acquire);
+            int entry_count = (mt && mt->skip_list) ? skip_list_count_entries(mt->skip_list) : 0;
+
+            if (entry_count > 0)
+            {
+                TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' is flushing %d entries before close", cf->name,
+                              entry_count);
+
+                /* we retry flush with backoff to prevent data loss */
+                int flush_result = TDB_ERR_UNKNOWN;
+                int retry_count = 0;
+
+                while (retry_count < TDB_MAX_FFLUSH_RETRY_ATTEMPTS)
+                {
+                    flush_result = tidesdb_flush_memtable_internal(cf, 0, 1); /* force flush */
+                    if (flush_result == TDB_SUCCESS)
+                    {
+                        TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' flush before close succeeded",
+                                      cf->name);
+                        break;
+                    }
+
+                    retry_count++;
+                    if (retry_count < TDB_MAX_FFLUSH_RETRY_ATTEMPTS)
+                    {
+                        TDB_DEBUG_LOG(
+                            TDB_LOG_ERROR,
+                            "CF '%s' flush before close failed (attempt %d/%d, error %d), "
+                            "retrying",
+                            cf->name, retry_count, TDB_MAX_FFLUSH_RETRY_ATTEMPTS, flush_result);
+                        usleep(TDB_FLUSH_RETRY_BACKOFF_US *
+                               retry_count); /* linear backoff -- TDB_FLUSH_RETRY_BACKOFF_US * N */
+                    }
+                }
+
+                /* a failed flush is logged and close carries on */
+                if (flush_result != TDB_SUCCESS)
+                {
+                    TDB_DEBUG_LOG(TDB_LOG_ERROR,
+                                  "CF '%s' flush before close failed after %d attempts (error "
+                                  "%d). "
+                                  "Data is persisted in WAL and will be recovered on next open.",
+                                  cf->name, TDB_MAX_FFLUSH_RETRY_ATTEMPTS, flush_result);
+                }
+            }
+        }
+    }
+    pthread_rwlock_unlock(&db->cf_list_lock);
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "All memtables flushed");
+
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "Waiting for background flushes to complete");
+    int flush_wait_count = 0;
+    pthread_rwlock_rdlock(&db->cf_list_lock);
+    while (1)
+    {
+        int any_flushing = 0;
+        size_t queue_size_val = 0;
+
+        for (int i = 0; i < db->num_column_families; i++)
+        {
+            if (db->column_families[i])
+            {
+                if (atomic_load_explicit(&db->column_families[i]->is_flushing,
+                                         memory_order_acquire))
+                {
+                    any_flushing = 1;
+                    break;
+                }
+            }
+        }
+
+        /* we also check if flush queue has pending work */
+        if (db->flush_queue)
+        {
+            queue_size_val = queue_size(db->flush_queue);
+        }
+
+        /* we check all conditions -- no CF admission flag, queue empty, no pending flush I/O */
+        int pending = atomic_load_explicit(&db->flush_pending_count, memory_order_acquire);
+        if (!any_flushing && queue_size_val == 0 && pending <= 0)
+        {
+            break;
+        }
+
+        if (flush_wait_count % 1000 == 0 && flush_wait_count > 0)
+        {
+            TDB_DEBUG_LOG(
+                TDB_LOG_INFO,
+                "Still waiting for background flushes (waited %d seconds, queue_size=%zu, "
+                "any_flushing=%d, pending=%d)",
+                flush_wait_count / 1000, queue_size_val, any_flushing, pending);
+        }
+
+        /* the read lock is released while sleeping and re-taken before the next check */
+        pthread_rwlock_unlock(&db->cf_list_lock);
+        usleep(TDB_CLOSE_TXN_WAIT_SLEEP_US);
+        flush_wait_count++;
+        pthread_rwlock_rdlock(&db->cf_list_lock);
+    }
+    pthread_rwlock_unlock(&db->cf_list_lock);
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "All background flushes completed (queue is empty)");
+
+    /* we wait for any in-progress compactions to complete before shutdown
+     * this prevents data loss from compaction removing old ssts while
+     * the new merged sst is not yet fully persisted */
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "Waiting for in-progress compactions to complete");
+    int compaction_wait_count = 0;
+    while (1)
+    {
+        int any_compacting = 0;
+        pthread_rwlock_rdlock(&db->cf_list_lock);
+        for (int i = 0; i < db->num_column_families; i++)
+        {
+            if (db->column_families[i])
+            {
+                if (atomic_load_explicit(&db->column_families[i]->is_compacting,
+                                         memory_order_acquire))
+                {
+                    any_compacting = 1;
+                    break;
+                }
+            }
+        }
+        pthread_rwlock_unlock(&db->cf_list_lock);
+
+        if (!any_compacting)
+        {
+            break;
+        }
+
+        if (compaction_wait_count % 100 == 0 && compaction_wait_count > 0)
+        {
+            TDB_DEBUG_LOG(TDB_LOG_INFO, "Still waiting for in-progress compactions (waited %d ms)",
+                          compaction_wait_count);
+        }
+
+        usleep(TDB_CLOSE_TXN_WAIT_SLEEP_US);
+        compaction_wait_count++;
+    }
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "All in-progress compactions completed");
+
+    if (db->flush_queue)
+    {
+        /* we set shutdown flag first, before enqueueing NULLs
+         * this ensures queue_dequeue_wait will return NULL even if
+         * a thread enters the wait after we broadcast */
+        queue_shutdown(db->flush_queue);
+
+        /* we enqueue NULL items for each thread as a courtesy
+         * (not strictly needed since shutdown=1, but maintains consistency) */
+        for (int i = 0; i < db->config.num_flush_threads; i++)
+        {
+            queue_enqueue(db->flush_queue, NULL);
+        }
+
+        for (int attempt = 0; attempt < TDB_SHUTDOWN_BROADCAST_ATTEMPTS; attempt++)
+        {
+            queue_shutdown(db->flush_queue);
+            usleep(TDB_SHUTDOWN_BROADCAST_INTERVAL_US);
+        }
+    }
+
+    if (db->compaction_queue)
+    {
+        /* we set shutdown flag first, before enqueueing NULLs
+         * this ensures queue_dequeue_wait will return NULL even if
+         * a thread enters the wait after we broadcast */
+        queue_shutdown(db->compaction_queue);
+        for (int i = 0; i < db->config.num_compaction_threads; i++)
+        {
+            queue_enqueue(db->compaction_queue, NULL);
+        }
+
+        /* we keep broadcasting periodically until all threads have exited
+         * this handles the race where a thread might be between the while loop check
+         * and pthread_cond_wait when we set shutdown=1 */
+        for (int attempt = 0; attempt < TDB_SHUTDOWN_BROADCAST_ATTEMPTS; attempt++)
+        {
+            queue_shutdown(db->compaction_queue);
+            usleep(TDB_SHUTDOWN_BROADCAST_INTERVAL_US);
+        }
+    }
+
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "Waiting for %d flush threads to finish",
+                  db->config.num_flush_threads);
+    if (db->flush_threads)
+    {
+        for (int i = 0; i < db->config.num_flush_threads; i++)
+        {
+            /* re-broadcast the shutdown before each join */
+            if (db->flush_queue)
+            {
+                for (int attempt = 0; attempt < TDB_SHUTDOWN_BROADCAST_ATTEMPTS; attempt++)
+                {
+                    queue_shutdown(db->flush_queue);
+                    usleep(TDB_SHUTDOWN_BROADCAST_INTERVAL_US);
+                }
+            }
+
+            pthread_join(db->flush_threads[i], NULL);
+        }
+        free(db->flush_threads);
+    }
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "Flush threads finished");
+
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "Waiting for %d compaction threads to finish",
+                  db->config.num_compaction_threads);
+    if (db->compaction_threads)
+    {
+        for (int i = 0; i < db->config.num_compaction_threads; i++)
+        {
+            TDB_DEBUG_LOG(TDB_LOG_INFO, "Joining compaction thread %d", i);
+
+            /* on netbsd, pthread_cond_wait can miss signals, so we keep broadcasting
+             * while waiting for each thread to exit */
+            if (db->compaction_queue)
+            {
+                for (int attempt = 0; attempt < TDB_SHUTDOWN_BROADCAST_ATTEMPTS; attempt++)
+                {
+                    queue_shutdown(db->compaction_queue);
+                    usleep(TDB_SHUTDOWN_BROADCAST_INTERVAL_US);
+                }
+            }
+
+            pthread_join(db->compaction_threads[i], NULL);
+            TDB_DEBUG_LOG(TDB_LOG_INFO, "Compaction thread %d joined", i);
+        }
+        free(db->compaction_threads);
+    }
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "Compaction threads finished");
+
+    if (atomic_load(&db->sync_thread_active))
+    {
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Stopping sync worker thread");
+
+        /* the flag is cleared while holding the mutex, then the worker is signalled */
+        pthread_mutex_lock(&db->sync_thread_mutex);
+        atomic_store(&db->sync_thread_active, 0);
+        pthread_cond_signal(&db->sync_thread_cond);
+        pthread_mutex_unlock(&db->sync_thread_mutex);
+
+        for (int attempt = 0; attempt < TDB_SHUTDOWN_BROADCAST_ATTEMPTS; attempt++)
+        {
+            pthread_mutex_lock(&db->sync_thread_mutex);
+            pthread_cond_signal(&db->sync_thread_cond);
+            pthread_mutex_unlock(&db->sync_thread_mutex);
+            usleep(TDB_SHUTDOWN_BROADCAST_INTERVAL_US);
+        }
+
+        pthread_join(db->sync_thread, NULL);
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Sync worker thread stopped");
+    }
+
+    /* we always destroy sync mutex/cond since they're always initialized */
+    pthread_mutex_destroy(&db->sync_thread_mutex);
+    pthread_cond_destroy(&db->sync_thread_cond);
+    pthread_mutex_destroy(&db->btree_cache_lock);
+    pthread_mutex_destroy(&db->compaction_gate_lock);
+
+    if (atomic_load(&db->reaper_active))
+    {
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Stopping reaper thread");
+
+        /* we set shutdown flag inside mutex to ensure proper synchronization
+         * with the worker's while loop predicate check (NetBSD PR #56275) */
+        pthread_mutex_lock(&db->reaper_thread_mutex);
+        atomic_store(&db->reaper_active, 0);
+        pthread_cond_signal(&db->reaper_thread_cond);
+        pthread_mutex_unlock(&db->reaper_thread_mutex);
+
+        /* we keep signaling periodically as a fallback for edge cases */
+        for (int attempt = 0; attempt < TDB_SHUTDOWN_BROADCAST_ATTEMPTS; attempt++)
+        {
+            pthread_mutex_lock(&db->reaper_thread_mutex);
+            pthread_cond_signal(&db->reaper_thread_cond);
+            pthread_mutex_unlock(&db->reaper_thread_mutex);
+            usleep(TDB_SHUTDOWN_BROADCAST_INTERVAL_US);
+        }
+
+        pthread_join(db->reaper_thread, NULL);
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Reaper thread stopped");
+
+        /* the reaper mutex/cond are only destroyed here, inside the reaper_active branch */
+        pthread_mutex_destroy(&db->reaper_thread_mutex);
+        pthread_cond_destroy(&db->reaper_thread_cond);
+    }
+
+    /* stop the replica sync thread (replica mode only). the exchange claims the
+     * shutdown so tidesdb_promote_to_primary and close never double-join it. */
+    if (atomic_exchange_explicit(&db->replica_sync_thread_active, 0, memory_order_acq_rel) == 1)
+    {
+        pthread_join(db->replica_sync_thread, NULL);
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Replica sync thread stopped");
+    }
+
+    /* we drain any remaining deferred frees after reaper thread has stopped */
+    tidesdb_deferred_free_drain(db);
+
+    if (db->flush_queue)
+    {
+        while (!queue_is_empty(db->flush_queue))
+        {
+            tidesdb_flush_work_t *work = (tidesdb_flush_work_t *)queue_dequeue(db->flush_queue);
+            if (work)
+            {
+                /* each flush work item holds a reference to its immutable memtable.
+                 * rotation requests (imm == NULL) have no ref to release */
+                if (work->imm) tidesdb_immutable_memtable_unref(work->imm);
+                free(work);
+            }
+        }
+        queue_free(db->flush_queue);
+    }
+
+    if (db->compaction_queue)
+    {
+        while (!queue_is_empty(db->compaction_queue))
+        {
+            tidesdb_compaction_work_t *work =
+                (tidesdb_compaction_work_t *)queue_dequeue(db->compaction_queue);
+            if (work)
+            {
+                /* signal a blocking caller before discarding so it does not
+                 * park forever on a work item the close path drained without
+                 * running */
+                tidesdb_compaction_work_signal_done(work);
+                free(work);
+            }
+        }
+        queue_free(db->compaction_queue);
+    }
+
+    /* we shut down upload pipeline before cleaning up unified memtable state,
+     * so that any async WAL uploads enqueued during flush complete before
+     * the local WAL files are deleted below */
+    if (db->upload_queue)
+    {
+        /* we send NULL poison pills to stop worker threads, then signal all waiters.
+         * queue_enqueue only signals when the queue transitions from empty to non-empty,
+         * so rapid enqueue of multiple NULLs may only wake one waiter. the shutdown
+         * broadcast ensures all blocked workers wake up immediately. */
+        for (int i = 0; i < db->num_upload_threads; i++)
+        {
+            queue_enqueue(db->upload_queue, NULL);
+        }
+        queue_shutdown(db->upload_queue);
+        /* we join all upload threads */
+        if (db->upload_threads)
+        {
+            for (int i = 0; i < db->num_upload_threads; i++)
+            {
+                pthread_join(db->upload_threads[i], NULL);
+            }
+            free(db->upload_threads);
+            db->upload_threads = NULL;
+        }
+        queue_free(db->upload_queue);
+        db->upload_queue = NULL;
+
+        TDB_DEBUG_LOG(TDB_LOG_INFO,
+                      "Upload pipeline stopped (%" PRIu64 " uploads, %" PRIu64 " failures)",
+                      atomic_load(&db->total_uploads), atomic_load(&db->total_upload_failures));
+    }
+
+    /* we clean up unified memtable state if enabled */
+    if (db->unified_mt.enabled)
+    {
+        tidesdb_memtable_t *umt = atomic_load(&db->unified_mt.active);
+        if (umt)
+        {
+            if (umt->skip_list) skip_list_free(umt->skip_list);
+            if (umt->wal)
+            {
+                /* the path is copied first because it is read from the block manager
+                 * that block_manager_close is about to release; the WAL file is then
+                 * removed from disk */
+                char *wal_path = tdb_strdup(umt->wal->file_path);
+                block_manager_close(umt->wal);
+                if (wal_path)
+                {
+                    tdb_unlink(wal_path);
+                    free(wal_path);
+                }
+            }
+            free(umt);
+        }
+
+        if (db->unified_mt.immutables)
+        {
+            while (!queue_is_empty(db->unified_mt.immutables))
+            {
+                tidesdb_immutable_memtable_t *imm =
+                    (tidesdb_immutable_memtable_t *)queue_dequeue(db->unified_mt.immutables);
+                if (imm) tidesdb_immutable_memtable_unref(imm);
+            }
+            queue_free(db->unified_mt.immutables);
+        }
+
+        tidesdb_unimap_free(db);
+
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Unified memtable state cleaned up");
+    }
+
+    /* we clean up the immutable memtables that remain in CF queues now that
+     * the flush workers have exited */
+    pthread_rwlock_wrlock(&db->cf_list_lock);
+    for (int i = 0; i < db->num_column_families; i++)
+    {
+        tidesdb_column_family_t *cf = db->column_families[i];
+        if (cf && cf->immutable_memtables)
+        {
+            int queue_count = (int)queue_size(cf->immutable_memtables);
+            TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' %d immutables in queue before shutdown cleanup",
+                          cf->name, queue_count);
+            int cleaned = 0;
+            int skipped = 0;
+
+            /* only immutable memtables that have been flushed are released here.
+             * unflushed immutables still contain data that needs to be persisted;
+             * they are put back on the queue and will be recovered from WAL on
+             * next startup */
+            size_t queue_size_before = queue_size(cf->immutable_memtables);
+            for (size_t idx = 0; idx < queue_size_before; idx++)
+            {
+                tidesdb_immutable_memtable_t *imm =
+                    (tidesdb_immutable_memtable_t *)queue_dequeue(cf->immutable_memtables);
+                if (imm)
+                {
+                    int is_flushed = atomic_load_explicit(&imm->flushed, memory_order_acquire);
+                    int refcount = atomic_load_explicit(&imm->refcount, memory_order_acquire);
+
+                    if (is_flushed)
+                    {
+                        TDB_DEBUG_LOG(TDB_LOG_INFO,
+                                      "CF '%s' cleaning up flushed immutable with refcount=%d",
+                                      cf->name, refcount);
+                        tidesdb_immutable_memtable_unref(imm);
+                        cleaned++;
+                    }
+                    else
+                    {
+                        TDB_DEBUG_LOG(
+                            TDB_LOG_WARN,
+                            "CF '%s' skipping unflushed immutable with refcount=%d (data in WAL)",
+                            cf->name, refcount);
+                        queue_enqueue(cf->immutable_memtables, imm);
+                        skipped++;
+                    }
+                }
+            }
+            if (cleaned > 0 || skipped > 0)
+            {
+                TDB_DEBUG_LOG(TDB_LOG_INFO,
+                              "CF '%s' cleaned up %d flushed immutables, skipped %d unflushed "
+                              "during shutdown",
+                              cf->name, cleaned, skipped);
+            }
+        }
+    }
+    for (int i = 0; i < db->num_column_families; i++)
+    {
+        tidesdb_column_family_free(db->column_families[i]);
+    }
+    free(db->column_families);
+    pthread_rwlock_unlock(&db->cf_list_lock);
+
+    pthread_rwlock_destroy(&db->cf_list_lock);
+
+    tidesdb_comparator_entry_t *comparators =
+        atomic_load_explicit(&db->comparators, memory_order_relaxed);
+    if (comparators)
+    {
+        free(comparators);
+    }
+
+    free(db->db_path);
+    /* free the owned copy of object_store_config created in tidesdb_open */
+    if (db->config.object_store_config)
+    {
+        free((tidesdb_objstore_config_t *)db->config.object_store_config);
+        db->config.object_store_config = NULL;
+    }
+
+    if (db->clock_cache)
+    {
+        clock_cache_stats_t stats;
+        clock_cache_get_stats(db->clock_cache, &stats);
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Freeing clock cache (bytes: %zu, entries: %zu)",
+                      stats.total_bytes, stats.total_entries);
+        clock_cache_destroy(db->clock_cache);
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Clock cache freed");
+    }
+
+    if (db->btree_node_cache)
+    {
+        clock_cache_stats_t stats;
+        clock_cache_get_stats(db->btree_node_cache, &stats);
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Freeing btree node cache (bytes: %zu, entries: %zu)",
+                      stats.total_bytes, stats.total_entries);
+        clock_cache_destroy(db->btree_node_cache);
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "B+tree node cache freed");
+    }
+
+    if (db->commit_status)
+    {
+        tidesdb_commit_status_destroy(db->commit_status);
+    }
+
+    if (db->active_txns)
+    {
+        free(db->active_txns);
+        pthread_rwlock_destroy(&db->active_txns_lock);
+    }
+
+    if (db->lock_fd >= 0)
+    {
+        tdb_file_unlock(db->lock_fd);
+        close(db->lock_fd);
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Released database directory lock");
+    }
+
+    /* we clean up object store resources */
+    if (db->local_cache)
+    {
+        tdb_local_cache_destroy(db->local_cache);
+        free(db->local_cache);
+        db->local_cache = NULL;
+    }
+    if (db->object_store)
+    {
+        if (db->object_store->destroy)
+        {
+            db->object_store->destroy(db->object_store->ctx);
+        }
+        free(db->object_store);
+        db->object_store = NULL;
+    }
+
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "TidesDB closed successfully");
+
+    /* we close log file if it was opened (protected by log mutex) */
+    pthread_mutex_lock(&tidesdb_log_mutex);
+    if (_tidesdb_log_file)
+    {
+        fflush(_tidesdb_log_file);
+        fclose(_tidesdb_log_file);
+        _tidesdb_log_file = NULL;
+        _tidesdb_log_truncate = 0;
+        _tidesdb_log_path[0] = '\0';
+    }
+    db->log_file = NULL;
+    pthread_mutex_unlock(&tidesdb_log_mutex);
+
+    free(db);
+
+    /* db is a by-value parameter, so this only clears the local copy --
+     * the caller's pointer is left untouched */
+    db = NULL;
+
+    return TDB_SUCCESS;
+}
+
+/**
+ * tidesdb_promote_to_primary
+ * switches a replica handle to primary mode. stops the replica sync thread,
+ * runs a final MANIFEST sync and optional remote WAL replay in object store
+ * mode, gives the unified active memtable a local WAL if it has none, then
+ * clears replica_mode.
+ * @param db database handle
+ * @return TDB_SUCCESS once promoted, otherwise an error code
+ *         (TDB_ERR_INVALID_ARGS when db is NULL or the handle is already primary)
+ */
+int tidesdb_promote_to_primary(tidesdb_t *db)
+{
+    if (!db) return TDB_ERR_INVALID_ARGS;
+    if (!atomic_load_explicit(&db->replica_mode, memory_order_acquire))
+        return TDB_ERR_INVALID_ARGS; /* already primary */
+
+
TDB_DEBUG_LOG(TDB_LOG_INFO, "Promoting replica to primary mode"); + + /* stop the dedicated replica sync thread before flipping replica_mode. + * joining it drains any in-flight MANIFEST sync / WAL replay -- flipping + * replica_mode mid-sync would let a query thread waiting on cf_list_lock as + * wrlock block behind the sync's rdlock, an apparent hang on the first + * query after promotion. the exchange claims the shutdown so a later close + * does not double-join the thread. */ + if (atomic_exchange_explicit(&db->replica_sync_thread_active, 0, memory_order_acq_rel) == 1) + { + pthread_join(db->replica_sync_thread, NULL); + } + + /* final MANIFEST sync and WAL replay to catch last writes from old primary */ + if (db->object_store) + { + tdb_replica_sync_manifests(db); + + if (db->unified_mt.enabled && db->config.object_store_config && + db->config.object_store_config->replica_replay_wal) + { + tdb_objstore_replay_remote_wals(db, 0); + } + } + + /* we create local WAL for the unified memtable if it does not have one. + * replicas do not write local WALs, but as primary we need one for + * crash recovery of new writes. */ + if (db->unified_mt.enabled) + { + tidesdb_memtable_t *umt = + atomic_load_explicit(&db->unified_mt.active, memory_order_acquire); + if (umt && !umt->wal) + { + char uwal_path[TDB_MAX_PATH_LEN]; + uint64_t gen = + atomic_load_explicit(&db->unified_mt.wal_generation, memory_order_relaxed); + snprintf(uwal_path, sizeof(uwal_path), + "%s" PATH_SEPARATOR TDB_UNIFIED_WAL_PREFIX TDB_U64_FMT TDB_WAL_EXT, + db->db_path, TDB_U64_CAST(gen)); + + block_manager_t *new_wal = NULL; + if (block_manager_open(&new_wal, uwal_path, TDB_SYNC_FULL) == 0) + { + block_manager_truncate(new_wal); + umt->wal = new_wal; + TDB_DEBUG_LOG(TDB_LOG_INFO, "Created WAL for promoted primary: %s", uwal_path); + } + } + } + + /* we switch to primary mode.. 
*/ + atomic_store_explicit(&db->replica_mode, 0, memory_order_release); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Replica promoted to primary successfully"); + return TDB_SUCCESS; +} + +/** + * tidesdb_unimap_persist + * atomically rewrites the UNIMAP file from the in-memory cf index map. + * the caller must hold unified_mt.cf_index_map_lock. + * @param db database handle + * @return error code + */ +static int tidesdb_unimap_persist(tidesdb_t *db) +{ + char tmp_path[TDB_MAX_PATH_LEN]; + char final_path[TDB_MAX_PATH_LEN]; + snprintf(tmp_path, sizeof(tmp_path), "%s" PATH_SEPARATOR TDB_UNIFIED_CF_INDEX_MAP_TMP, + db->db_path); + snprintf(final_path, sizeof(final_path), "%s" PATH_SEPARATOR TDB_UNIFIED_CF_INDEX_MAP_FILE, + db->db_path); + + FILE *fp = fopen(tmp_path, TDB_CNF_FILE_MODE); + if (!fp) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Failed to open %s for write", tmp_path); + return TDB_ERR_IO; + } + + for (int i = 0; i < db->unified_mt.cf_index_map_count; i++) + { + fprintf(fp, "%u %s\n", db->unified_mt.cf_index_map[i].index, + db->unified_mt.cf_index_map[i].name); + } + + if (fflush(fp) != 0 || tdb_fsync(fileno(fp)) != 0) + { + fclose(fp); + tdb_unlink(tmp_path); + return TDB_ERR_IO; + } + fclose(fp); + + /* atomic_rename_file replaces the target and syncs the parent directory */ + if (atomic_rename_file(tmp_path, final_path) != 0) + { + tdb_unlink(tmp_path); + return TDB_ERR_IO; + } + + /* in object store mode the map must reach the store like config.ini and + * MANIFEST so replicas reconstruct cf indexes the same way the primary did */ + if (db->object_store) + { + tdb_objstore_upload_file_sync(db, final_path); + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_unimap_objstore_pull + * downloads the UNIMAP file from the object store to db_path. when overwrite + * is 0 the download is skipped if a local UNIMAP already exists, so a + * primary's authoritative local map is never clobbered; replicas pass 1 to + * always track the primary. 
best effort -- a missing remote object is not an
+ * error, the node may be the first to write it.
+ * @param db database handle
+ * @param overwrite 1 to always download, 0 to skip when a local copy exists
+ */
+static void tidesdb_unimap_objstore_pull(tidesdb_t *db, int overwrite)
+{
+    /* nothing to pull from without an object store */
+    if (db->object_store == NULL) return;
+
+    char unimap_path[TDB_MAX_PATH_LEN];
+    snprintf(unimap_path, sizeof(unimap_path), "%s" PATH_SEPARATOR TDB_UNIFIED_CF_INDEX_MAP_FILE,
+             db->db_path);
+
+    /* without overwrite an existing local file wins and nothing is downloaded */
+    struct STAT_STRUCT probe;
+    if (!overwrite && STAT_FUNC(unimap_path, &probe) == 0) return;
+
+    const int fetched = (db->object_store->get(db->object_store->ctx,
+                                               TDB_UNIFIED_CF_INDEX_MAP_FILE, unimap_path) == 0);
+    if (fetched) return;
+
+    /* best effort -- a failed get is only logged */
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "No UNIMAP in object store yet (or download failed)");
+}
+
+/**
+ * tidesdb_unimap_load
+ * (re)reads the UNIMAP file into the in-memory cf index map and advances
+ * next_cf_index past every persisted index. the map is cleared first so the
+ * call is idempotent and usable as a replica re-sync reload. a missing file
+ * is a fresh database and not an error. takes cf_index_map_lock.
+ * lines that cannot be parsed (no separator, empty or oversized name, an
+ * index that is not a plain decimal below UINT32_MAX) are skipped.
+ * next_cf_index only ever moves forward, so a reload cannot hand out an
+ * index that was already assigned earlier in this session.
+ * @param db database handle
+ * @return TDB_SUCCESS (also when the file is missing), or TDB_ERR_MEMORY when
+ *         the map cannot be grown, in which case the map is left empty
+ */
+static int tidesdb_unimap_load(tidesdb_t *db)
+{
+    char path[TDB_MAX_PATH_LEN];
+    snprintf(path, sizeof(path), "%s" PATH_SEPARATOR TDB_UNIFIED_CF_INDEX_MAP_FILE, db->db_path);
+
+    pthread_mutex_lock(&db->unified_mt.cf_index_map_lock);
+
+    const int prev_count = db->unified_mt.cf_index_map_count;
+
+    /* a reload (replica re-sync) starts from a clean map */
+    free(db->unified_mt.cf_index_map);
+    db->unified_mt.cf_index_map = NULL;
+    db->unified_mt.cf_index_map_count = 0;
+    db->unified_mt.cf_index_map_capacity = 0;
+
+    FILE *fp = fopen(path, "r");
+    if (!fp)
+    {
+        pthread_mutex_unlock(&db->unified_mt.cf_index_map_lock);
+        return TDB_SUCCESS; /* fresh database, no map yet */
+    }
+
+    uint32_t max_index = 0;
+    int have_entry = 0;
+    char line[TDB_UNIFIED_CF_INDEX_MAP_LINE_MAX];
+    while (fgets(line, sizeof(line), fp))
+    {
+        /* each line is "<index> <name>" with the name running to end of line */
+        char *sep = strchr(line, ' ');
+        if (!sep) continue;
+        *sep = '\0';
+        char *name = sep + 1;
+        size_t name_len = strlen(name);
+        while (name_len > 0 && (name[name_len - 1] == '\n' || name[name_len - 1] == '\r'))
+        {
+            name[--name_len] = '\0';
+        }
+        if (name_len == 0 || name_len >= TDB_MAX_CF_NAME_LEN) continue;
+
+        /* the index must be a plain decimal number. strtoul alone would turn
+         * garbage into index 0, accept a sign or leading whitespace, and the
+         * cast would silently truncate values that do not fit 32 bits.
+         * UINT32_MAX itself is rejected so max_index + 1 below cannot wrap. */
+        if (line[0] < '0' || line[0] > '9') continue;
+        char *end = NULL;
+        const unsigned long parsed = strtoul(line, &end, 10);
+        if (*end != '\0' || parsed >= UINT32_MAX) continue;
+        const uint32_t index = (uint32_t)parsed;
+
+        if (db->unified_mt.cf_index_map_count >= db->unified_mt.cf_index_map_capacity)
+        {
+            int new_cap = db->unified_mt.cf_index_map_capacity == 0
+                              ? TDB_UNIFIED_CF_INDEX_MAP_INITIAL_CAP
+                              : db->unified_mt.cf_index_map_capacity * 2;
+            tidesdb_unified_cf_index_entry_t *grown = realloc(
+                db->unified_mt.cf_index_map, new_cap * sizeof(tidesdb_unified_cf_index_entry_t));
+            if (!grown)
+            {
+                /* realloc left the old array intact, so it is released here */
+                free(db->unified_mt.cf_index_map);
+                db->unified_mt.cf_index_map = NULL;
+                db->unified_mt.cf_index_map_count = 0;
+                db->unified_mt.cf_index_map_capacity = 0;
+                fclose(fp);
+                pthread_mutex_unlock(&db->unified_mt.cf_index_map_lock);
+                return TDB_ERR_MEMORY;
+            }
+            db->unified_mt.cf_index_map = grown;
+            db->unified_mt.cf_index_map_capacity = new_cap;
+        }
+
+        tidesdb_unified_cf_index_entry_t *e =
+            &db->unified_mt.cf_index_map[db->unified_mt.cf_index_map_count++];
+        snprintf(e->name, sizeof(e->name), "%s", name);
+        e->index = index;
+
+        if (!have_entry || index > max_index)
+        {
+            max_index = index;
+            have_entry = 1;
+        }
+    }
+    fclose(fp);
+
+    if (have_entry)
+    {
+        /* only advance -- an unconditional store could move next_cf_index
+         * backwards on a reload and let a later resolve reuse an index */
+        const uint32_t wanted = max_index + 1;
+        if (wanted > atomic_load_explicit(&db->unified_mt.next_cf_index, memory_order_relaxed))
+        {
+            atomic_store_explicit(&db->unified_mt.next_cf_index, wanted, memory_order_relaxed);
+        }
+    }
+
+    /* steady-state replica re-syncs reload an unchanged map every tick -- log only on a change */
+    if (db->unified_mt.cf_index_map_count != prev_count)
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "Loaded UNIMAP with %d column family index entries",
+                      db->unified_mt.cf_index_map_count);
+    pthread_mutex_unlock(&db->unified_mt.cf_index_map_lock);
+    return TDB_SUCCESS;
+}
+
+/**
+ * tidesdb_unimap_resolve
+ * returns the unified_cf_index for a column family name. an existing name
+ * keeps the index it was first assigned; a new name is assigned the next
+ * index and appended to the in-memory map. the caller persists the map with
+ * tidesdb_unimap_persist when out_is_new is set. takes cf_index_map_lock.
+ * @param db database handle
+ * @param name column family name to look up or register
+ * @param out_index receives the resolved index
+ * @param out_is_new receives 1 only when a new index was assigned and recorded
+ *                   in the map, 0 otherwise
+ */
+static void tidesdb_unimap_resolve(tidesdb_t *db, const char *name, uint32_t *out_index,
+                                   int *out_is_new)
+{
+    pthread_mutex_t *const map_lock = &db->unified_mt.cf_index_map_lock;
+
+    *out_is_new = 0;
+    pthread_mutex_lock(map_lock);
+
+    /* a name that is already mapped keeps the index it was given first */
+    const int known = db->unified_mt.cf_index_map_count;
+    int slot = 0;
+    while (slot < known && strcmp(db->unified_mt.cf_index_map[slot].name, name) != 0)
+    {
+        slot++;
+    }
+    if (slot < known)
+    {
+        *out_index = db->unified_mt.cf_index_map[slot].index;
+        pthread_mutex_unlock(map_lock);
+        return;
+    }
+
+    /* unknown name -- hand out the next index before trying to record it */
+    const uint32_t fresh_index =
+        atomic_fetch_add_explicit(&db->unified_mt.next_cf_index, 1, memory_order_relaxed);
+    *out_index = fresh_index;
+
+    if (known >= db->unified_mt.cf_index_map_capacity)
+    {
+        int wider_cap = TDB_UNIFIED_CF_INDEX_MAP_INITIAL_CAP;
+        if (db->unified_mt.cf_index_map_capacity != 0)
+        {
+            wider_cap = db->unified_mt.cf_index_map_capacity * 2;
+        }
+
+        tidesdb_unified_cf_index_entry_t *resized = realloc(
+            db->unified_mt.cf_index_map, wider_cap * sizeof(tidesdb_unified_cf_index_entry_t));
+        if (!resized)
+        {
+            /* the caller can still use the index for this session, but with no
+             * room to record it the entry never reaches the persisted map, so
+             * out_is_new stays 0 */
+            TDB_DEBUG_LOG(TDB_LOG_WARN, "Failed to grow UNIMAP for CF '%s'", name);
+            pthread_mutex_unlock(map_lock);
+            return;
+        }
+        db->unified_mt.cf_index_map = resized;
+        db->unified_mt.cf_index_map_capacity = wider_cap;
+    }
+
+    /* append the new entry at the end of the map */
+    tidesdb_unified_cf_index_entry_t *added = &db->unified_mt.cf_index_map[known];
+    db->unified_mt.cf_index_map_count = known + 1;
+    snprintf(added->name, sizeof(added->name), "%s", name);
+    added->index = fresh_index;
+    *out_is_new = 1;
+
+    pthread_mutex_unlock(map_lock);
+}
+
+/**
+ * tidesdb_unimap_remove
+ * removes the entry for a column family name from the in-memory map and, when
+ * an entry was found, rewrites UNIMAP through tidesdb_unimap_persist.
+ * next_cf_index is not modified here. takes cf_index_map_lock.
+ * @param db database handle
+ * @param name column family name
+ */
+static void tidesdb_unimap_remove(tidesdb_t *db, const char *name)
+{
+    pthread_mutex_t *const map_lock = &db->unified_mt.cf_index_map_lock;
+    pthread_mutex_lock(map_lock);
+
+    tidesdb_unified_cf_index_entry_t *const entries = db->unified_mt.cf_index_map;
+    const int total = db->unified_mt.cf_index_map_count;
+
+    /* locate the first entry carrying this name, if any */
+    int victim = 0;
+    while (victim < total && strcmp(entries[victim].name, name) != 0)
+    {
+        victim++;
+    }
+
+    if (victim < total)
+    {
+        /* close the gap by sliding every later entry down one slot */
+        for (int src = victim + 1; src < total; src++)
+        {
+            entries[src - 1] = entries[src];
+        }
+        db->unified_mt.cf_index_map_count = total - 1;
+        tidesdb_unimap_persist(db);
+    }
+
+    pthread_mutex_unlock(map_lock);
+}
+
+/**
+ * tidesdb_unimap_rename
+ * replaces a column family name in the in-memory map while leaving its index
+ * as it was, then rewrites UNIMAP through tidesdb_unimap_persist. does nothing
+ * when old_name is not in the map. takes cf_index_map_lock.
+ * @param db database handle
+ * @param old_name current column family name
+ * @param new_name new column family name
+ */
+static void tidesdb_unimap_rename(tidesdb_t *db, const char *old_name, const char *new_name)
+{
+    pthread_mutex_t *const map_lock = &db->unified_mt.cf_index_map_lock;
+    pthread_mutex_lock(map_lock);
+
+    /* only the first entry matching old_name is renamed */
+    tidesdb_unified_cf_index_entry_t *match = NULL;
+    for (int pos = 0; pos < db->unified_mt.cf_index_map_count && !match; pos++)
+    {
+        if (strcmp(db->unified_mt.cf_index_map[pos].name, old_name) == 0)
+        {
+            match = &db->unified_mt.cf_index_map[pos];
+        }
+    }
+
+    if (match)
+    {
+        snprintf(match->name, sizeof(match->name), "%s", new_name);
+        tidesdb_unimap_persist(db);
+    }
+
+    pthread_mutex_unlock(map_lock);
+}
+
+/**
+ * tidesdb_unimap_free
+ * releases the in-memory cf index map and its lock, called from
+ * tidesdb_close for unified mode databases.
+ * @param db database handle + */ +static void tidesdb_unimap_free(tidesdb_t *db) +{ + free(db->unified_mt.cf_index_map); + db->unified_mt.cf_index_map = NULL; + db->unified_mt.cf_index_map_count = 0; + db->unified_mt.cf_index_map_capacity = 0; + pthread_mutex_destroy(&db->unified_mt.cf_index_map_lock); + pthread_mutex_destroy(&db->unified_mt.wal_group_sync_lock); + pthread_cond_destroy(&db->unified_mt.wal_group_sync_cond); +} + +int tidesdb_create_column_family(tidesdb_t *db, const char *name, + const tidesdb_column_family_config_t *config) +{ + if (!db || !name || !config) return TDB_ERR_INVALID_ARGS; + + /* reject names that would truncate into cf->config.name (TDB_MAX_CF_NAME_LEN) + * -- mirrors the guard in tidesdb_rename_column_family so cf->name, the + * registry key, and cf->config.name can never disagree */ + const size_t name_len = strlen(name); + if (name_len == 0 || name_len >= TDB_MAX_CF_NAME_LEN) return TDB_ERR_INVALID_ARGS; + + if (!atomic_load(&db->is_recovering)) + { + int wait_result = wait_for_open(db); + if (wait_result != TDB_SUCCESS) return wait_result; + + if (atomic_load(&db->replica_mode)) return TDB_ERR_READONLY; + } + + if (config->sync_mode == TDB_SYNC_INTERVAL && config->sync_interval_us == 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Invalid config TDB_SYNC_INTERVAL requires sync_interval_us > 0"); + return TDB_ERR_INVALID_ARGS; + } + + /** unified memtable mode requires all CFs to use memcmp comparator + * because the single shared skip list uses a single comparator.. */ + if (db->unified_mt.enabled) + { + int has_custom = + (config->comparator_name[0] != '\0' && strcmp(config->comparator_name, "memcmp") != 0); + if (has_custom) + { + TDB_DEBUG_LOG( + TDB_LOG_ERROR, + "CF '%s' requires comparator '%s' but unified memtable mode requires memcmp. 
" + "Disable unified_memtable or use memcmp comparator.", + name, config->comparator_name); + return TDB_ERR_INVALID_ARGS; + } + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Creating column family %s", name); + + pthread_rwlock_rdlock(&db->cf_list_lock); + for (int i = 0; i < db->num_column_families; i++) + { + if (db->column_families[i] && strcmp(db->column_families[i]->name, name) == 0) + { + pthread_rwlock_unlock(&db->cf_list_lock); + TDB_DEBUG_LOG(TDB_LOG_WARN, "Column family %s already exists", name); + return TDB_ERR_EXISTS; + } + } + pthread_rwlock_unlock(&db->cf_list_lock); + + tidesdb_column_family_t *cf = calloc(1, sizeof(tidesdb_column_family_t)); + if (!cf) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to allocate memory for column family structure"); + return TDB_ERR_MEMORY; + } + + cf->name = tdb_strdup(name); + if (!cf->name) + { + free(cf); + return TDB_ERR_MEMORY; + } + + char dir_path[TDB_MAX_PATH_LEN]; + snprintf(dir_path, sizeof(dir_path), "%s" PATH_SEPARATOR "%s", db->db_path, name); + + struct stat st = {0}; + if (stat(dir_path, &st) == -1) + { + if (mkdir(dir_path, TDB_DIR_PERMISSIONS) != 0) + { + free(cf->name); + free(cf); + return TDB_ERR_IO; + } + + /*** we sync parent directory to ensure directory entry is persisted + ** without this, the directory might not survive a crash/close + * uses cross-platform tdb_sync_directory (no-op on Windows, fsync on POSIX) */ + tdb_sync_directory(db->db_path); + } + + cf->directory = tdb_strdup(dir_path); + if (!cf->directory) + { + free(cf->name); + free(cf); + return TDB_ERR_MEMORY; + } + + cf->config = *config; + snprintf(cf->config.name, sizeof(cf->config.name), "%s", name); + cf->db = db; + + /* in unified memtable mode the cf needs a stable index that prefixes its + * keys in the shared skip_list and wal. tidesdb_unimap_resolve hands back + * the index this name was first assigned (persisted in UNIMAP) or assigns + * a fresh one. a freshly assigned index is persisted after the cf is + * registered, below. 
*/ + int unimap_is_new = 0; + if (db->unified_mt.enabled) + { + tidesdb_unimap_resolve(db, name, &cf->unified_cf_index, &unimap_is_new); + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' assigned unified_cf_index=%u", name, + cf->unified_cf_index); + } + + /* we validate and fix index_sample_ratio (must be at least 1 to avoid division by zero) */ + if (cf->config.index_sample_ratio < 1) + { + cf->config.index_sample_ratio = TDB_DEFAULT_INDEX_SAMPLE_RATIO; + } + + /* we validate and fix block_index_prefix_len */ + if (cf->config.block_index_prefix_len < TDB_BLOCK_INDEX_PREFIX_MIN || + cf->config.block_index_prefix_len > TDB_BLOCK_INDEX_PREFIX_MAX) + { + cf->config.block_index_prefix_len = TDB_DEFAULT_BLOCK_INDEX_PREFIX_LEN; + } + + /**** we validate write_buffer_size against resolved_memory_limit to prevent + *** creating CFs that would immediately cause critical memory pressure. + ** arena allocation is write_buffer_size * 2, so a single CF's arena + * must not exceed the global memory budget */ + { + const size_t mem_limit = + atomic_load_explicit(&db->resolved_memory_limit, memory_order_relaxed); + const size_t arena_size = cf->config.write_buffer_size * 2; + if (mem_limit > 0 && arena_size > mem_limit) + { + TDB_DEBUG_LOG(TDB_LOG_FATAL, + "CF '%s' write_buffer_size %zu (arena %zu bytes) exceeds " + "resolved_memory_limit %zu bytes", + name, cf->config.write_buffer_size, arena_size, mem_limit); + free(cf->directory); + free(cf->name); + free(cf); + return TDB_ERR_INVALID_ARGS; + } + + /* we warn if cumulative active memtable arenas would exceed memory limit */ + if (mem_limit > 0) + { + size_t cumulative_arena = arena_size; + pthread_rwlock_rdlock(&db->cf_list_lock); + for (int i = 0; i < db->num_column_families; i++) + { + if (db->column_families[i]) + cumulative_arena += db->column_families[i]->config.write_buffer_size * 2; + } + pthread_rwlock_unlock(&db->cf_list_lock); + + if (cumulative_arena > mem_limit) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "CF '%s' creation brings 
cumulative arena overhead to %zu bytes " + "which exceeds resolved_memory_limit %zu bytes -- " + "memory pressure may be frequent", + name, cumulative_arena, mem_limit); + } + } + } + + skip_list_t *new_memtable = NULL; + + skip_list_comparator_fn comparator_fn = NULL; + void *comparator_ctx = NULL; + + /* we check if a custom comparator is specified */ + int has_custom_comparator = + (config->comparator_name[0] != '\0' && strcmp(config->comparator_name, "memcmp") != 0); + + if (tidesdb_get_comparator(db, config->comparator_name, &comparator_fn, &comparator_ctx) != + TDB_SUCCESS) + { + if (has_custom_comparator) + { + TDB_DEBUG_LOG( + TDB_LOG_FATAL, + "Column family '%s' requires comparator '%s' but it is not registered. " + "Register comparator with tidesdb_register_comparator() before opening database.", + name, config->comparator_name); + free(cf->directory); + free(cf->name); + free(cf); + return TDB_ERR_NOT_FOUND; + } + + /* no comparator specified or explicitly requested memcmp, we use default */ + comparator_fn = skip_list_comparator_memcmp; + comparator_ctx = NULL; + } + + cf->config.comparator_fn_cached = comparator_fn; + cf->config.comparator_ctx_cached = comparator_ctx; + + if (skip_list_new_with_arena(&new_memtable, config->skip_list_max_level, + config->skip_list_probability, comparator_fn, comparator_ctx, + &db->cached_current_time, config->write_buffer_size * 2) != 0) + { + free(cf->directory); + free(cf->name); + free(cf); + return TDB_ERR_MEMORY; + } + + cf->immutable_memtables = queue_new(); + if (!cf->immutable_memtables) + { + skip_list_free(new_memtable); + free(cf->directory); + free(cf->name); + free(cf); + return TDB_ERR_MEMORY; + } + + /* we init lock-free immutable snapshot (both slots empty). the per-slot items + * arrays are allocated later, once the cf is otherwise fully built (see below), + * so the inline error paths between here and there need not free them -- items + * stays NULL until then. 
*/ + for (int s = 0; s < TDB_IMM_SNAP_SLOTS; s++) + { + memset(&cf->imm_snaps[s], 0, sizeof(tidesdb_imm_snap_t)); + atomic_init(&cf->imm_snaps[s].count, 0); + atomic_init(&cf->imm_snaps[s].readers, 0); + } + atomic_init(&cf->imm_snap_active, 0); + atomic_init(&cf->active_mt_readers, 0); + pthread_mutex_init(&cf->imm_snap_publish_lock, NULL); + pthread_mutex_init(&cf->compaction_commit_lock, NULL); + + /*** in unified memtable mode, writes go through the unified WAL so + ** per-CF WAL files are not needed. skip creation to avoid wasted + * I/O, file descriptors, and confusing artifacts on disk. */ + block_manager_t *new_wal = NULL; + uint64_t active_wal_id = 0; + if (!db->unified_mt.enabled) + { + /*** the active memtable's wal is the highest-id wal_*.log in the cf + ** directory--rotation always allocates a strictly higher id, so on a + * crash-reopen the highest existing file is the wal that was active. + ** we adopt it -- open without truncating, validate to trim any + *** preallocation tail -- so recovery can replay it in place. lower-id + ** wals are immutables recovery handles separately. a fresh cf has no + * wal files, so we fall back to creating wal_0.log. 
*/ + int have_existing_wal = 0; + DIR *wal_scan = opendir(cf->directory); + if (wal_scan) + { + struct dirent *we; + while ((we = readdir(wal_scan)) != NULL) + { + uint64_t wid = 0; + if (tdb_parse_wal_id(we->d_name, &wid)) + { + if (!have_existing_wal || wid > active_wal_id) + { + active_wal_id = wid; + have_existing_wal = 1; + } + } + } + closedir(wal_scan); + } + + char wal_path[TDB_MAX_PATH_LEN]; + snprintf(wal_path, sizeof(wal_path), + "%s" PATH_SEPARATOR TDB_WAL_PREFIX TDB_U64_FMT TDB_WAL_EXT, cf->directory, + TDB_U64_CAST(active_wal_id)); + + if (block_manager_open(&new_wal, wal_path, config->sync_mode) != 0) + { + queue_free(cf->immutable_memtables); + skip_list_free(new_memtable); + free(cf->directory); + free(cf->name); + free(cf); + return TDB_ERR_IO; + } + + if (have_existing_wal) + { + /* adopt an existing wal -- validate (permissive) to trim the + * preallocation tail so the block manager's logical size is the + * real data extent and appends land in the right place. recovery + * replays this file's entries into this memtable's skip list. 
*/ + if (block_manager_validate_last_block(new_wal, + BLOCK_MANAGER_PERMISSIVE_BLOCK_VALIDATION) != 0) + { + block_manager_close(new_wal); + queue_free(cf->immutable_memtables); + skip_list_free(new_memtable); + free(cf->directory); + free(cf->name); + free(cf); + return TDB_ERR_IO; + } + } + else if (block_manager_truncate(new_wal) != 0) + { + /* fresh cf -- start wal_0.log empty */ + block_manager_close(new_wal); + queue_free(cf->immutable_memtables); + skip_list_free(new_memtable); + free(cf->directory); + free(cf->name); + free(cf); + return TDB_ERR_IO; + } + } + + tidesdb_memtable_t *initial_mt = malloc(sizeof(tidesdb_memtable_t)); + if (!initial_mt) + { + if (new_wal) block_manager_close(new_wal); + queue_free(cf->immutable_memtables); + skip_list_free(new_memtable); + free(cf->directory); + free(cf->name); + free(cf); + return TDB_ERR_MEMORY; + } + initial_mt->skip_list = new_memtable; + initial_mt->wal = new_wal; /* NULL in unified mode */ + /* mt->id matches the backing wal's file id -- the highest existing wal_*.log + * on a crash-reopen, or 0 for the fresh wal_0.log of a brand-new cf */ + initial_mt->id = active_wal_id; + initial_mt->generation = 0; + atomic_init(&initial_mt->refcount, 1); + atomic_init(&initial_mt->writers, 0); + atomic_init(&initial_mt->flushed, 0); + atomic_init(&cf->active_memtable, initial_mt); + + int min_levels = cf->config.min_levels; + + /* the engine assumes at least one disk level exists -- apply_backpressure, flush, and the + * read path all dereference cf->levels[0]. clamp a misconfigured 0/negative min_levels up + * to 1 so a bad config value cannot null-deref on the first write. 
*/ + if (min_levels < 1) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' min_levels %d below floor clamped to 1", cf->name, + min_levels); + min_levels = 1; + cf->config.min_levels = 1; + } + + /* we check if directory already has existing levels from disk */ + DIR *existing_dir = opendir(cf->directory); + int max_existing_level = 0; + if (existing_dir) + { + struct dirent *entry; + while ((entry = readdir(existing_dir)) != NULL) + { + if (strstr(entry->d_name, TDB_SSTABLE_KLOG_EXT) != NULL) + { + int level_num = 0; + if (tdb_parse_level_num(entry->d_name, &level_num)) + { + if (level_num > max_existing_level) + { + max_existing_level = level_num; + } + } + } + } + closedir(existing_dir); + } + + /* we ensure we have enough levels for existing data */ + if (max_existing_level > min_levels) + { + min_levels = max_existing_level; + } + + /* we validate we dont exceed max levels */ + if (min_levels > TDB_MAX_LEVELS) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Cannot create CF requires %d levels but max is %d", min_levels, + TDB_MAX_LEVELS); + tidesdb_memtable_t *mt_cleanup = atomic_load(&cf->active_memtable); + if (mt_cleanup) + { + if (mt_cleanup->skip_list) skip_list_free(mt_cleanup->skip_list); + if (mt_cleanup->wal) block_manager_close(mt_cleanup->wal); + free(mt_cleanup); + } + queue_free(cf->immutable_memtables); + free(cf->directory); + free(cf->name); + free(cf); + return TDB_ERR_INVALID_ARGS; + } + + /* we initialize fixed levels array and create min_levels, rest are NULL */ + for (int i = 0; i < min_levels; i++) + { + /* base capacity is the buffer size B -- spooky DCA formula is + * C_i = B * T^(i-1), and tidesdb_add_level passes B as well. 
passing + * B*T here inflated every initial level by one ratio step */ + size_t level_capacity = tidesdb_calculate_level_capacity(i + 1, config->write_buffer_size, + config->level_size_ratio); + + cf->levels[i] = tidesdb_level_create(i + 1, level_capacity); + if (!cf->levels[i]) + { + /* we cleanup already created levels */ + for (int cleanup_idx = 0; cleanup_idx < i; cleanup_idx++) + { + if (cf->levels[cleanup_idx]) + { + tidesdb_level_free(db, cf->levels[cleanup_idx]); + } + } + tidesdb_memtable_t *mt_cleanup2 = atomic_load(&cf->active_memtable); + if (mt_cleanup2) + { + if (mt_cleanup2->skip_list) skip_list_free(mt_cleanup2->skip_list); + if (mt_cleanup2->wal) block_manager_close(mt_cleanup2->wal); + free(mt_cleanup2); + } + queue_free(cf->immutable_memtables); + free(cf->directory); + free(cf->name); + free(cf); + return TDB_ERR_MEMORY; + } + TDB_DEBUG_LOG(TDB_LOG_INFO, "Creating level %d with capacity %zu", i + 1, level_capacity); + } + + /* we initialize remaining slots to NULL */ + for (int i = min_levels; i < TDB_MAX_LEVELS; i++) + { + cf->levels[i] = NULL; + } + + atomic_init(&cf->num_active_levels, min_levels); + + atomic_init(&cf->next_sstable_id, 0); + atomic_init(&cf->sstable_layout_version, 0); + atomic_init(&cf->is_compacting, 0); + atomic_init(&cf->is_flushing, 0); + atomic_init(&cf->flush_pending_count, 0); + atomic_init(&cf->flush_deferred, 0); + atomic_init(&cf->compaction_pending_count, 0); + atomic_init(&cf->compaction_armed, 0); + atomic_init(&cf->immutable_cleanup_counter, 0); + atomic_init(&cf->pending_commits, 0); + + char manifest_path[TDB_MAX_PATH_LEN]; + snprintf(manifest_path, sizeof(manifest_path), "%s" PATH_SEPARATOR "%s", cf->directory, + TDB_COLUMN_FAMILY_MANIFEST_NAME); + cf->manifest = tidesdb_manifest_open(manifest_path); + if (!cf->manifest) + { + /* we cleanup all created levels */ + for (int cleanup_idx = 0; cleanup_idx < min_levels; cleanup_idx++) + { + if (cf->levels[cleanup_idx]) + { + tidesdb_level_free(db, 
cf->levels[cleanup_idx]); + } + } + + tidesdb_memtable_t *mt_cleanup4 = atomic_load(&cf->active_memtable); + if (mt_cleanup4) + { + if (mt_cleanup4->skip_list) skip_list_free(mt_cleanup4->skip_list); + if (mt_cleanup4->wal) block_manager_close(mt_cleanup4->wal); + free(mt_cleanup4); + } + queue_free(cf->immutable_memtables); + free(cf->directory); + free(cf->name); + free(cf); + return TDB_ERR_MEMORY; + } + + /* allocate the lock-free immutable snapshot slots now that the cf is fully built + * but not yet registered. doing it here means every inline error path above ran + * while items were NULL (nothing to free), and a failure here unwinds through the + * full tidesdb_column_family_free, which frees both slots. each slot is sized to + * the hard cap; the publisher grows it on demand if a raised threshold needs more. */ + const size_t imm_snap_init_cap = tdb_cf_immutable_hard_cap(cf); + for (int s = 0; s < TDB_IMM_SNAP_SLOTS; s++) + { + cf->imm_snaps[s].items = malloc(imm_snap_init_cap * sizeof(tidesdb_memtable_t *)); + if (!cf->imm_snaps[s].items) + { + tidesdb_column_family_free(cf); + return TDB_ERR_MEMORY; + } + cf->imm_snaps[s].cap = imm_snap_init_cap; + } + + pthread_rwlock_wrlock(&db->cf_list_lock); + + /* the earlier duplicate scan ran under a read lock; re-check under the write + * lock so two concurrent creates of the same name cannot both append */ + for (int i = 0; i < db->num_column_families; i++) + { + if (db->column_families[i] && strcmp(db->column_families[i]->name, name) == 0) + { + pthread_rwlock_unlock(&db->cf_list_lock); + tidesdb_column_family_free(cf); + TDB_DEBUG_LOG(TDB_LOG_WARN, "Column family %s already exists (lost create race)", name); + return TDB_ERR_EXISTS; + } + } + + if (db->num_column_families >= db->cf_capacity) + { + int new_cap = db->cf_capacity * 2; + tidesdb_column_family_t **new_array = + realloc(db->column_families, new_cap * sizeof(tidesdb_column_family_t *)); + if (!new_array) + { + 
pthread_rwlock_unlock(&db->cf_list_lock); + tidesdb_column_family_free(cf); + return TDB_ERR_MEMORY; + } + + for (int i = db->cf_capacity; i < new_cap; i++) + { + new_array[i] = NULL; + } + + db->column_families = new_array; + db->cf_capacity = new_cap; + } + + db->column_families[db->num_column_families] = cf; + db->num_column_families++; + pthread_rwlock_unlock(&db->cf_list_lock); + + /* persist a freshly assigned unified index now that the cf is registered */ + if (unimap_is_new) + { + pthread_mutex_lock(&db->unified_mt.cf_index_map_lock); + tidesdb_unimap_persist(db); + pthread_mutex_unlock(&db->unified_mt.cf_index_map_lock); + } + + /* we save configuration to disk for recovery */ + char config_path[MAX_FILE_PATH_LENGTH]; + snprintf(config_path, sizeof(config_path), + "%s" PATH_SEPARATOR TDB_COLUMN_FAMILY_CONFIG_NAME TDB_COLUMN_FAMILY_CONFIG_EXT, + cf->directory); + + int save_result = tidesdb_cf_config_save_to_ini(config_path, name, config); + if (save_result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Failed to save CF config for '%s' (error: %d)", name, + save_result); + /* non-fatal, continue */ + } + + /* we upload config.ini to object store (sync -- small file, must be visible immediately) */ + if (db->object_store && save_result == TDB_SUCCESS) + { + tdb_objstore_upload_file_sync(db, config_path); + + /* commit + upload the empty MANIFEST so replicas can discover this CF + * before its first flush -- discovery keys off /MANIFEST */ + if (tidesdb_manifest_commit(cf->manifest, cf->manifest->path) == 0) + { + tdb_objstore_upload_file_sync(db, cf->manifest->path); + } + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Created CF '%s' (total: %d)", name, db->num_column_families); + + /* a btree column family needs the btree node cache -- create it lazily here + * so a database that never uses btree mode does not allocate it */ + if (config->use_btree) tidesdb_ensure_btree_node_cache(db); + + /** we start sync thread if this CF needs interval syncing and thread 
isn't running + * but not during recovery -- tidesdb_open will handle thread creation after recovery */ + if (config->sync_mode == TDB_SYNC_INTERVAL && config->sync_interval_us > 0 && + !atomic_load(&db->is_recovering)) + { + if (!atomic_load(&db->sync_thread_active)) + { + atomic_store(&db->sync_thread_active, 1); + if (pthread_create(&db->sync_thread, NULL, tidesdb_sync_worker_thread, db) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to create sync worker thread for new CF"); + atomic_store(&db->sync_thread_active, 0); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Sync worker thread started for CF '%s'", name); + } + } + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_drop_column_family_internal + * shared implementation for dropping a column family by name or pointer + * exactly one of name or cf must be non-NULL + * @param db database handle + * @param name column family name (NULL when dropping by pointer) + * @param cf column family pointer (NULL when dropping by name) + * @return 0 on success, -n on failure + */ +static int tidesdb_drop_column_family_internal(tidesdb_t *db, const char *name, + const tidesdb_column_family_t *cf) +{ + if (!db) return TDB_ERR_INVALID_ARGS; + if (atomic_load(&db->replica_mode)) return TDB_ERR_READONLY; + + tidesdb_column_family_t *cf_to_drop = NULL; + + pthread_rwlock_wrlock(&db->cf_list_lock); + + /* we find the CF to drop */ + int found_idx = -1; + for (int i = 0; i < db->num_column_families; i++) + { + if (!db->column_families[i]) continue; + + /** when cf pointer is provided we match by pointer (skip name search) + * otherwise we match by name string */ + if ((cf && db->column_families[i] == cf) || + (name && strcmp(db->column_families[i]->name, name) == 0)) + { + found_idx = i; + cf_to_drop = db->column_families[i]; + break; + } + } + + if (found_idx == -1) + { + pthread_rwlock_unlock(&db->cf_list_lock); + return TDB_ERR_NOT_FOUND; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Dropping column family %s", cf_to_drop->name); + + /* 
we mark CF for deletion first -- workers will check this flag and skip processing */ + atomic_store_explicit(&cf_to_drop->marked_for_deletion, 1, memory_order_release); + + /* we shift remaining CFs down */ + for (int i = found_idx; i < db->num_column_families - 1; i++) + { + db->column_families[i] = db->column_families[i + 1]; + } + db->column_families[db->num_column_families - 1] = NULL; + db->num_column_families--; + + pthread_rwlock_unlock(&db->cf_list_lock); + + /* we sweep queued work targeting this CF out of both worker queues before waiting. + * without this, drop blocks on head-of-line, workers stuck on other CFs' long + * compactions cannot dequeue and skip this CF's items until they finish their + * current work. removing the items inline mirrors the worker's marked-for-deletion + * skip path so counters stay balanced */ + const size_t swept_flush = + queue_remove_if(db->flush_queue, tdb_cf_flush_match, cf_to_drop, tdb_cf_flush_release); + const size_t swept_compact = queue_remove_if(db->compaction_queue, tdb_cf_compaction_match, + cf_to_drop, tdb_cf_compaction_release); + if (swept_flush > 0 || swept_compact > 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' drop swept %zu queued flush + %zu queued compaction items", + cf_to_drop->name, swept_flush, swept_compact); + } + + /* we wait for any in-progress flush to complete before freeing CF + * workers check marked_for_deletion and will skip new work, but we must + * wait for any work that started before we set the flag + * this wait must be unbounded -- the flush worker holds a live pointer to cf + * and will dereference it until flush I/O completes */ + int wait_count = 0; + while (tidesdb_is_flushing(cf_to_drop)) + { + usleep(TDB_CLOSE_FLUSH_WAIT_SLEEP_US); + /* re-sweep the umt dispatcher may have enqueued new per-CF split work for this CF + * after our initial sweep (its phase 1 ran with cf still resolvable, phase 2 lands + * the split now). 
pulling it out here avoids the wait dragging while workers + * shuffle through unrelated work to dequeue and skip the marked items */ + queue_remove_if(db->flush_queue, tdb_cf_flush_match, cf_to_drop, tdb_cf_flush_release); + wait_count++; + if (wait_count % 100 == 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' drop waiting for flush to complete (waited %d ms)", + cf_to_drop->name, wait_count * (TDB_CLOSE_FLUSH_WAIT_SLEEP_US / 1000)); + } + } + + /** we wait for any in-progress compaction to complete and for queued compaction work + * to drain -- the worker holds a live cf pointer and a queued item that has not yet + * been dequeued cannot see marked_for_deletion until the worker reaches it */ + wait_count = 0; + while (tidesdb_is_compacting(cf_to_drop)) + { + usleep(TDB_COMPACTION_FLUSH_WAIT_SLEEP_US); + queue_remove_if(db->compaction_queue, tdb_cf_compaction_match, cf_to_drop, + tdb_cf_compaction_release); + wait_count++; + if (wait_count % 100 == 0) + { + TDB_DEBUG_LOG( + TDB_LOG_INFO, "CF '%s' drop waiting for compaction to complete (waited %d ms)", + cf_to_drop->name, wait_count * (TDB_COMPACTION_FLUSH_WAIT_SLEEP_US / 1000)); + } + } + + /* we drain in-flight commit-path writers before tearing the cf down -- a + * committer that bumped the active memtable's writers before + * marked_for_deletion became visible is still writing through the memtable + * and its WAL, both of which tidesdb_column_family_free is about to release. 
+ * the seq_cst fence pairs with the one in the commit path between its + * writers bump and its marked_for_deletion check */ + tidesdb_memtable_t *drop_active_mt = atomic_load(&cf_to_drop->active_memtable); + if (drop_active_mt) + { + atomic_thread_fence(memory_order_seq_cst); + wait_count = 0; + while (atomic_load_explicit(&drop_active_mt->writers, memory_order_acquire) > 0) + { + usleep(TDB_REFCOUNT_DRAIN_SLEEP_US); + if (++wait_count % 100 == 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' drop waiting for in-flight WAL writers to drain", + cf_to_drop->name); + } + } + } + + /* we drain readers pinning cf->active_memtable through the active_mt_readers + * epoch. tidesdb_column_family_free will release the cf struct that holds + * the counter, so a reader still mid try_ref would UAF on the counter as + * well as on the memtable */ + atomic_thread_fence(memory_order_seq_cst); + wait_count = 0; + while (atomic_load_explicit(&cf_to_drop->active_mt_readers, memory_order_acquire) > 0) + { + usleep(TDB_REFCOUNT_DRAIN_SLEEP_US); + if (++wait_count % 100 == 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' drop waiting for active_memtable readers to drain", + cf_to_drop->name); + } + } + + /* drain deferred-free items for this cf's levels before column_family_free + * releases them. the reaper's periodic sweep could otherwise be holding + * items pointing at our levels in its locally-stolen list and UAF on the + * next iteration's array_readers load. 
reaper_thread_mutex serializes us + * with the sweep so we cannot race a mid-walk reaper */ + pthread_mutex_lock(&db->reaper_thread_mutex); + tidesdb_deferred_free_drain_for_cf(db, cf_to_drop); + pthread_mutex_unlock(&db->reaper_thread_mutex); + + /* we invalidate all block cache entries for this column family before freeing */ + tidesdb_invalidate_block_cache_for_cf(db, cf_to_drop->name); + + /* we delete all objects for this CF from object store */ + if (db->object_store) + { + char prefix[TDB_MAX_PATH_LEN]; + snprintf(prefix, sizeof(prefix), "%s/", cf_to_drop->name); + db->object_store->list(db->object_store->ctx, prefix, tdb_objstore_delete_listed_cb, + db->object_store); + } + + /* we drop the cf from the unified index map before removing its directory + * so a crash between the two leaves the map describing more than exists, + * never less -- a stale entry is harmless, a missing one is not */ + if (db->unified_mt.enabled) + { + tidesdb_unimap_remove(db, cf_to_drop->name); + } + + const int result = remove_directory(cf_to_drop->directory); + TDB_DEBUG_LOG(TDB_LOG_INFO, "Deleted column family directory: %s (result: %d)", + cf_to_drop->directory, result); + + /* we sync parent directory to persist the directory removal */ + tdb_sync_directory(db->db_path); + + tidesdb_column_family_free(cf_to_drop); + + return TDB_SUCCESS; +} + +int tidesdb_drop_column_family(tidesdb_t *db, const char *name) +{ + if (!name) return TDB_ERR_INVALID_ARGS; + + return tidesdb_drop_column_family_internal(db, name, NULL); +} + +int tidesdb_delete_column_family(tidesdb_t *db, tidesdb_column_family_t *cf) +{ + if (!cf) return TDB_ERR_INVALID_ARGS; + + return tidesdb_drop_column_family_internal(db, NULL, cf); +} + +int tidesdb_rename_column_family(tidesdb_t *db, const char *old_name, const char *new_name) +{ + if (!db || !old_name || !new_name) return TDB_ERR_INVALID_ARGS; + + /* we validate new name length */ + if (strlen(new_name) == 0 || strlen(new_name) >= TDB_MAX_CF_NAME_LEN) + { + 
return TDB_ERR_INVALID_ARGS; + } + + /** we check for same name */ + if (strcmp(old_name, new_name) == 0) + { + return TDB_SUCCESS; /* no-op */ + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Renaming column family %s -> %s", old_name, new_name); + + pthread_rwlock_wrlock(&db->cf_list_lock); + + /* we find the CF to rename */ + tidesdb_column_family_t *cf = tidesdb_get_column_family_internal(db, old_name); + + if (!cf) + { + pthread_rwlock_unlock(&db->cf_list_lock); + return TDB_ERR_NOT_FOUND; + } + + /* we check if new name already exists */ + if (tidesdb_get_column_family_internal(db, new_name)) + { + pthread_rwlock_unlock(&db->cf_list_lock); + return TDB_ERR_EXISTS; + } + + /* we mark CF for deletion to reject new writes while draining in-flight + * operations. the flag is cleared after rename completes. in unified mode + * this prevents new txn_put calls from targeting this CF. in per-CF mode + * it also prevents new memtable writes. */ + atomic_store_explicit(&cf->marked_for_deletion, 1, memory_order_release); + + /* in per-CF mode, we flush the active memtable to rotate the WAL. this ensures + * any in-flight commit that already loaded active_mt->wal finishes writing + * to the old WAL before we close it. we release cf_list_lock during flush + * so other CFs are not blocked. in unified mode the per-CF WAL is dormant + * (commits go through the unified WAL) so the flush is only needed to + * persist memtable data before directory rename. */ + pthread_rwlock_unlock(&db->cf_list_lock); + + /* we sweep queued compaction work targeting this CF out of the queue. a + * compaction enqueued by tidesdb_compact but not yet picked up by a worker + * is invisible to the is_compacting wait below, and would otherwise run + * after the rename closes the sstable handles, creating an orphan sstable + * whose file handle is never closed -- on windows that leaked handle blocks + * the directory from being removed. 
queued flush work is left in place + * because the rename relies on it to persist memtable data before the move */ + queue_remove_if(db->compaction_queue, tdb_cf_compaction_match, cf, tdb_cf_compaction_release); + + tidesdb_flush_memtable_internal(cf, 0, 1); + + /*** an unbounded flush wait matching tidesdb_drop_column_family -- the flush + ** worker holds live pointers to cf and will dereference them until flush + * I/O completes. a bounded wait risks use-after-free. */ + int wait_count = 0; + while (tidesdb_is_flushing(cf)) + { + usleep(TDB_CLOSE_FLUSH_WAIT_SLEEP_US); + wait_count++; + if (wait_count % 100 == 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' rename waiting for flush to complete (waited %d ms)", cf->name, + wait_count * (TDB_CLOSE_FLUSH_WAIT_SLEEP_US / 1000)); + } + } + + /* unbounded compaction wait */ + wait_count = 0; + while (tidesdb_is_compacting(cf)) + { + usleep(TDB_COMPACTION_FLUSH_WAIT_SLEEP_US); + wait_count++; + if (wait_count % 100 == 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' rename waiting for compaction to complete (waited %d ms)", + cf->name, wait_count * (TDB_COMPACTION_FLUSH_WAIT_SLEEP_US / 1000)); + } + } + + /* we drain flush queue so all pending work is done before closing handles */ + for (int i = 0; i < TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS * 4; i++) + { + const size_t fq = db->flush_queue ? queue_size(db->flush_queue) : 0; + int pending = atomic_load_explicit(&db->flush_pending_count, memory_order_acquire); + if (fq == 0 && pending == 0) break; + usleep(TDB_COMPACTION_FLUSH_WAIT_SLEEP_US); + } + + /* a flush completing in the drain above can enqueue a fresh compaction for + * this CF. 
we sweep and wait once more so no compaction is still queued or + * running when we close the sstable handles below */ + queue_remove_if(db->compaction_queue, tdb_cf_compaction_match, cf, tdb_cf_compaction_release); + while (tidesdb_is_compacting(cf)) + { + usleep(TDB_COMPACTION_FLUSH_WAIT_SLEEP_US); + queue_remove_if(db->compaction_queue, tdb_cf_compaction_match, cf, + tdb_cf_compaction_release); + } + + pthread_rwlock_wrlock(&db->cf_list_lock); + + /* we invalidate all block cache entries for the old CF name before renaming */ + tidesdb_invalidate_block_cache_for_cf(db, old_name); + + /* we build new directory path */ + char new_directory[MAX_FILE_PATH_LENGTH]; + int written = snprintf(new_directory, sizeof(new_directory), "%s%s%s", db->db_path, + PATH_SEPARATOR, new_name); + if (written < 0 || (size_t)written >= sizeof(new_directory)) + { + atomic_store_explicit(&cf->marked_for_deletion, 0, memory_order_release); + pthread_rwlock_unlock(&db->cf_list_lock); + return TDB_ERR_INVALID_ARGS; + } + + struct STAT_STRUCT st; + if (STAT_FUNC(new_directory, &st) == 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "Cannot rename CF '%s' to '%s', destination directory already exists", + old_name, new_name); + atomic_store_explicit(&cf->marked_for_deletion, 0, memory_order_release); + pthread_rwlock_unlock(&db->cf_list_lock); + return TDB_ERR_EXISTS; + } + + /*** we close the active memtable's WAL. a concurrent tidesdb_txn_commit can + ** still be writing through this WAL handle, so we drain in-flight writers + ** first. marked_for_deletion is already set, so the commit path refuses to + ** bump writers on this cf and a committer that bumped writers before the + * flag became visible decrements it and bails on its way out. 
*/ + tidesdb_memtable_t *active_mt = atomic_load(&cf->active_memtable); + block_manager_t *old_wal = NULL; + uint64_t old_wal_id = 0; + if (active_mt && active_mt->wal) + { + /* the seq_cst fence pairs with the one the commit path runs between its + * writers bump and its marked_for_deletion check, so a writer we do not + * observe here is guaranteed to observe the flag and back off */ + atomic_thread_fence(memory_order_seq_cst); + int wal_drain_iters = 0; + while (atomic_load_explicit(&active_mt->writers, memory_order_acquire) > 0) + { + usleep(TDB_REFCOUNT_DRAIN_SLEEP_US); + if (++wal_drain_iters % 100 == 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' rename waiting for in-flight WAL writers to drain", + cf->name); + } + } + + old_wal = active_mt->wal; + old_wal_id = active_mt->id; + block_manager_close(old_wal); + active_mt->wal = NULL; + } + + /* we close all sst file handles before rename (required on Windows) */ + const int num_levels = atomic_load(&cf->num_active_levels); + for (int lvl = 0; lvl < num_levels; lvl++) + { + tidesdb_level_t *level = cf->levels[lvl]; + if (!level) continue; + + const int num_sst = atomic_load(&level->num_sstables); + tidesdb_sstable_t **sstables = atomic_load(&level->sstables); + for (int s = 0; s < num_sst; s++) + { + tidesdb_sstable_t *sst = sstables[s]; + if (!sst) continue; + + /* num_open_sstables is keyed on the klog; a klog-open sstable counts one, so dropping + * its handle here must decrement or the rename leaks the count for every open sstable + */ + const int had_open_klog = (sst->klog_bm != NULL); + if (sst->klog_bm) + { + block_manager_close(sst->klog_bm); + sst->klog_bm = NULL; + } + if (sst->vlog_bm) + { + block_manager_close(sst->vlog_bm); + sst->vlog_bm = NULL; + } + if (had_open_klog) atomic_fetch_sub(&cf->db->num_open_sstables, 1); + } + } + + /* we close manifest file handle before rename (required on Windows) */ + if (cf->manifest) + { + pthread_rwlock_wrlock(&cf->manifest->lock); + if (cf->manifest->fp) + 
{ + fclose(cf->manifest->fp); + cf->manifest->fp = NULL; + } + pthread_rwlock_unlock(&cf->manifest->lock); + } + + /* we rename directory on disk (use atomic_rename_dir for Windows compatibility) */ + if (atomic_rename_dir(cf->directory, new_directory) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to rename directory %s to %s, %s", cf->directory, + new_directory, strerror(errno)); + /* we try to reopen WAL at old location */ + if (old_wal) + { + char wal_path[MAX_FILE_PATH_LENGTH]; + snprintf(wal_path, sizeof(wal_path), + "%s" PATH_SEPARATOR TDB_WAL_PREFIX TDB_U64_FMT TDB_WAL_EXT, cf->directory, + TDB_U64_CAST(old_wal_id)); + block_manager_t *reopened = NULL; + block_manager_open(&reopened, wal_path, cf->config.sync_mode); + atomic_store_explicit(&active_mt->wal, reopened, memory_order_release); + } + atomic_store_explicit(&cf->marked_for_deletion, 0, memory_order_release); + pthread_rwlock_unlock(&db->cf_list_lock); + return TDB_ERR_IO; + } + + /* we reopen WAL at new location */ + if (old_wal) + { + char new_wal_path[MAX_FILE_PATH_LENGTH]; + int wal_written = snprintf(new_wal_path, sizeof(new_wal_path), + "%s" PATH_SEPARATOR TDB_WAL_PREFIX TDB_U64_FMT TDB_WAL_EXT, + new_directory, TDB_U64_CAST(old_wal_id)); + if (wal_written > 0 && (size_t)wal_written < sizeof(new_wal_path)) + { + block_manager_t *reopened = NULL; + if (block_manager_open(&reopened, new_wal_path, cf->config.sync_mode) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to reopen WAL at %s after rename", + new_wal_path); + } + atomic_store_explicit(&active_mt->wal, reopened, memory_order_release); + } + } + + /* we update CF name */ + char *new_name_copy = tdb_strdup(new_name); + if (!new_name_copy) + { + /* we try to revert directory rename */ + atomic_rename_dir(new_directory, cf->directory); + atomic_store_explicit(&cf->marked_for_deletion, 0, memory_order_release); + pthread_rwlock_unlock(&db->cf_list_lock); + return TDB_ERR_MEMORY; + } + + /* we update CF directory */ + char *new_dir_copy = 
tdb_strdup(new_directory); + if (!new_dir_copy) + { + free(new_name_copy); + /* we try to revert directory rename */ + atomic_rename_dir(new_directory, cf->directory); + atomic_store_explicit(&cf->marked_for_deletion, 0, memory_order_release); + pthread_rwlock_unlock(&db->cf_list_lock); + return TDB_ERR_MEMORY; + } + + /* we swap in new values */ + char *old_name_ptr = cf->name; + char *old_dir_ptr = cf->directory; + cf->name = new_name_copy; + cf->directory = new_dir_copy; + + /*** we update all sst file paths in all levels + ** note that we already hold cf_list_lock and waited for flush/compaction to complete, + * so it's safe to modify sstable paths without additional locking */ + for (int lvl = 0; lvl < num_levels; lvl++) + { + tidesdb_level_t *level = cf->levels[lvl]; + if (!level) continue; + + const int num_sst = atomic_load(&level->num_sstables); + tidesdb_sstable_t **sstables = atomic_load(&level->sstables); + for (int s = 0; s < num_sst; s++) + { + tidesdb_sstable_t *sst = sstables[s]; + if (!sst) continue; + + /* we build new klog path */ + char new_klog_path[MAX_FILE_PATH_LENGTH]; + int path_written = snprintf(new_klog_path, sizeof(new_klog_path), + "%s" PATH_SEPARATOR TDB_LEVEL_PREFIX + "%d_" TDB_U64_FMT TDB_SSTABLE_KLOG_EXT, + new_directory, lvl + 1, TDB_U64_CAST(sst->id)); + if (path_written > 0 && (size_t)path_written < sizeof(new_klog_path)) + { + char *new_klog = tdb_strdup(new_klog_path); + if (new_klog) + { + free(sst->klog_path); + sst->klog_path = new_klog; + + /* recompute klog_filename as it points into klog_path */ + const char *last_fwd = strrchr(new_klog, '/'); + const char *last_back = strrchr(new_klog, '\\'); + const char *last_sep = (last_fwd > last_back) ? last_fwd : last_back; + sst->klog_filename = last_sep ? 
last_sep + 1 : new_klog; + } + } + + /* we build new vlog path */ + char new_vlog_path[MAX_FILE_PATH_LENGTH]; + path_written = snprintf(new_vlog_path, sizeof(new_vlog_path), + "%s" PATH_SEPARATOR TDB_LEVEL_PREFIX + "%d_" TDB_U64_FMT TDB_SSTABLE_VLOG_EXT, + new_directory, lvl + 1, TDB_U64_CAST(sst->id)); + if (path_written > 0 && (size_t)path_written < sizeof(new_vlog_path)) + { + char *new_vlog = tdb_strdup(new_vlog_path); + if (new_vlog) + { + free(sst->vlog_path); + sst->vlog_path = new_vlog; + } + } + } + } + + /* we update config file with new name */ + char config_path[MAX_FILE_PATH_LENGTH]; + written = + snprintf(config_path, sizeof(config_path), + "%s" PATH_SEPARATOR TDB_COLUMN_FAMILY_CONFIG_NAME TDB_COLUMN_FAMILY_CONFIG_EXT, + new_directory); + if (written > 0 && (size_t)written < sizeof(config_path)) + { + tidesdb_cf_config_save_to_ini(config_path, new_name, &cf->config); + } + + /* we update manifest path, thus must update internal path before commit! */ + if (cf->manifest) + { + char manifest_path[MAX_FILE_PATH_LENGTH]; + written = snprintf(manifest_path, sizeof(manifest_path), + "%s" PATH_SEPARATOR TDB_COLUMN_FAMILY_MANIFEST_NAME, new_directory); + if (written > 0 && (size_t)written < sizeof(manifest_path)) + { + /* we update the manifest's internal path to the new location + *** note -- fp was already closed before rename for Windows compatibility. + * only the formatted string and its terminator are copied -- the rest of + * manifest_path is uninitialised stack and must not be read */ + pthread_rwlock_wrlock(&cf->manifest->lock); + memcpy(cf->manifest->path, manifest_path, (size_t)written + 1); + pthread_rwlock_unlock(&cf->manifest->lock); + + /* we commit manifest to new location to ensure it's written */ + tidesdb_manifest_commit(cf->manifest, manifest_path); + } + } + + pthread_rwlock_unlock(&db->cf_list_lock); + + /* we clear the deletion mark now that rename is complete */ + atomic_store_explicit(&cf->marked_for_deletion, 0, memory_order_release); + + /* the unified index map is keyed on cf name, so the rename must follow */ + if 
(db->unified_mt.enabled) + { + tidesdb_unimap_rename(db, old_name, new_name); + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Successfully renamed column family %s -> %s", old_name, new_name); + + /* we release the previous name and directory strings last -- a caller may pass + * cf->name itself as old_name, so freeing them before the unimap rename and the + * log above would leave old_name dangling */ + free(old_name_ptr); + free(old_dir_ptr); + + return TDB_SUCCESS; +} + +/** + * tidesdb_get_column_family_internal + * looks up a column family by name without locking or open-check + * @param db database handle + * @param name column family name + * @return pointer to column family, or NULL if not found + */ +static tidesdb_column_family_t *tidesdb_get_column_family_internal(tidesdb_t *db, const char *name) +{ + if (!db || !name) return NULL; + tidesdb_column_family_t *result = NULL; + for (int i = 0; i < db->num_column_families; i++) + { + tidesdb_column_family_t *candidate = db->column_families[i]; + /* a slot without a name can never match and must not reach strcmp */ + if (candidate && candidate->name && strcmp(candidate->name, name) == 0) + { + result = candidate; + break; + } + } + return result; +} + +/** + * tidesdb_get_column_family + * looks up a column family by name once the database is open, holding the + * cf_list_lock read lock for the duration of the scan + * @param db database handle + * @param name column family name + * @return pointer to column family, or NULL on NULL arguments, if wait_for_open + * fails, or if no column family has that name + */ +tidesdb_column_family_t *tidesdb_get_column_family(tidesdb_t *db, const char *name) +{ + if (!db || !name) return NULL; + + const int wait_result = wait_for_open(db); + if (wait_result != TDB_SUCCESS) return NULL; + + /* the scan is shared with the internal lookup -- this wrapper only adds the + * open-check and the read lock around it */ + pthread_rwlock_rdlock(&db->cf_list_lock); + tidesdb_column_family_t *result = tidesdb_get_column_family_internal(db, name); + pthread_rwlock_unlock(&db->cf_list_lock); + return result; +} + +/** + * wait_for_open + * blocks until the database is fully open and recovery is complete + * @param db database handle + * @return TDB_SUCCESS when open, TDB_ERR_INVALID_DB on timeout or close + */ +static int wait_for_open(tidesdb_t *db) +{ + /*** we wait for database to open and finish recovery, but timeout if it's closing + ** this prevents threads from hanging forever when database is being closed + * and prevents transactions from starting during recovery */ + int wait_count = 0; + + while (!atomic_load_explicit(&db->is_open, memory_order_acquire) || + 
atomic_load_explicit(&db->is_recovering, memory_order_acquire)) + { + if (wait_count >= TDB_OPENING_WAIT_MAX_MS) + { + /** the database is not open and has not opened after the timeout + * it's likely closing or closed */ + return TDB_ERR_INVALID_DB; + } + + /** we spin-wait with small sleep to avoid busy loop + * we use same interval as transaction wait for consistency */ + usleep(TDB_CLOSE_TXN_WAIT_SLEEP_US); + wait_count++; + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_list_column_families + * copies the names of all column families into a newly allocated array while + * holding the cf_list_lock read lock + * @param db database handle + * @param names out -- malloc-allocated array of tdb_strdup copies, NULL when there are + * no column families or on allocation failure; a slot is NULL when its column + * family or its name is missing + * @param count out -- number of slots in names, 0 on allocation failure + * @return TDB_SUCCESS, TDB_ERR_INVALID_ARGS on NULL arguments, TDB_ERR_MEMORY on + * allocation failure + */ +int tidesdb_list_column_families(tidesdb_t *db, char ***names, int *count) +{ + if (!db || !names || !count) return TDB_ERR_INVALID_ARGS; + + pthread_rwlock_rdlock(&db->cf_list_lock); + + *count = db->num_column_families; + if (*count == 0) + { + *names = NULL; + pthread_rwlock_unlock(&db->cf_list_lock); + return TDB_SUCCESS; + } + + *names = malloc(sizeof(char *) * (*count)); + if (!*names) + { + /* we report an empty result, matching the tdb_strdup failure path below, so a + * caller never sees a non-zero count paired with a NULL array */ + *count = 0; + pthread_rwlock_unlock(&db->cf_list_lock); + return TDB_ERR_MEMORY; + } + + for (int i = 0; i < *count; i++) + { + if (db->column_families[i] && db->column_families[i]->name) + { + (*names)[i] = tdb_strdup(db->column_families[i]->name); + if (!(*names)[i]) + { + /* we unwind the copies made so far; NULL slots are safe to free */ + for (int j = 0; j < i; j++) + { + free((*names)[j]); + } + free(*names); + *names = NULL; + *count = 0; + pthread_rwlock_unlock(&db->cf_list_lock); + return TDB_ERR_MEMORY; + } + } + else + { + (*names)[i] = NULL; + } + } + + pthread_rwlock_unlock(&db->cf_list_lock); + return TDB_SUCCESS; +} + +int tidesdb_flush_memtable(tidesdb_column_family_t *cf) +{ + if (!cf) return TDB_ERR_INVALID_ARGS; + + /* in unified memtable mode the cf->active_memtable is a per-cf wrapper but + * the real active memtable lives on db->unified_mt. we rotate the unified + * memtable so the current contents enqueue for flush, and then fall through + * to the per-cf flush path to cover any stragglers or immutable wrappers. + * the rotate function requires unified_mt.is_flushing admission to prevent + * concurrent rotators from enqueueing the same memtable twice. 
if CAS fails + * another rotator is in progress and will cover this flush */ + if (cf->db && cf->db->config.unified_memtable) + { + int expected = 0; + if (atomic_compare_exchange_strong_explicit(&cf->db->unified_mt.is_flushing, &expected, 1, + memory_order_acquire, memory_order_relaxed)) + { + const int rot_rc = tidesdb_unified_memtable_rotate(cf->db); + atomic_store_explicit(&cf->db->unified_mt.is_flushing, 0, memory_order_release); + if (rot_rc != TDB_SUCCESS && rot_rc != TDB_ERR_LOCKED) + { + return rot_rc; + } + } + } + + return tidesdb_flush_memtable_internal(cf, 0, 1); +} + +/** + * tidesdb_is_flushing + * reports whether a column family has a memtable rotation in progress or flush + * work still pending + * @param cf column family + * @return 1 if is_flushing is set or flush_pending_count is above zero, 0 otherwise + * or if cf is NULL + */ int tidesdb_is_flushing(tidesdb_column_family_t *cf) +{ + if (!cf) return 0; + + /* is_flushing covers the memtable-swap-to-enqueue window. flush_pending_count + * is incremented before enqueue and decremented after the flush worker fully + * completes, so it covers queued + in-flight work with no TOCTOU gaps. the + * per-CF counter lets drop_column_family wait only for this CF's pending + * work instead of every CF's */ + if (atomic_load_explicit(&cf->is_flushing, memory_order_acquire) != 0) return 1; + return atomic_load_explicit(&cf->flush_pending_count, memory_order_acquire) > 0; +} + +/** + * tidesdb_is_compacting + * reports whether a column family has compaction work running or pending + * @param cf column family + * @return 1 if is_compacting is set or compaction_pending_count is above zero, + * 0 otherwise or if cf is NULL + */ int tidesdb_is_compacting(tidesdb_column_family_t *cf) +{ + if (!cf) return 0; + + if (atomic_load_explicit(&cf->is_compacting, memory_order_acquire) != 0) return 1; + return atomic_load_explicit(&cf->compaction_pending_count, memory_order_acquire) > 0; +} + +/** + * tidesdb_flush_memtable_internal + * rotates the active memtable and enqueues the old one for flush to disk + * creates a new memtable + WAL, swaps the active pointer, publishes immutable snapshot + * returns TDB_SUCCESS without rotating when the cf is marked for deletion, another + * rotate holds is_flushing, the global flush cap is reached (flush_deferred is set), + * the memtable is empty, or it is below write_buffer_size and force is 0 + * NOTE(review): several early-exit paths further down clear is_flushing + * unconditionally, even when already_holds_lock is 1, while the deferral path and + * the final success path leave it to the caller -- confirm that callers passing 1 + * expect the flag to be released on those exits + * @param cf column family + * @param already_holds_lock 1 if caller already holds is_flushing lock + * @param force 1 to flush regardless of size threshold + * @return TDB_SUCCESS or error code + */ +static int tidesdb_flush_memtable_internal(tidesdb_column_family_t *cf, + const int already_holds_lock, const int force) +{ + if (!cf) return 
TDB_ERR_INVALID_ARGS; + + /* we check if CF is marked for deletion -- skip flush if so */ + if (atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire)) + { + return TDB_SUCCESS; + } + + if (!already_holds_lock) + { + int expected = 0; + if (!atomic_compare_exchange_strong_explicit(&cf->is_flushing, &expected, 1, + memory_order_acquire, memory_order_relaxed)) + { + /* another rotate is in progress for this cf, we skip this attempt */ + return TDB_SUCCESS; + } + } + + /*** is_flushing now serialises only the rotate critical section. the global + ** active_flushes counter caps how many memtable flushes can be in flight + * across all column families so a hot cf cannot starve workers nor make + *** transient memory grow without bound when many cfs flush at once. */ + int slot_max = cf->db->config.max_concurrent_flushes; + if (slot_max <= 0) slot_max = TDB_DEFAULT_MAX_CONCURRENT_FLUSHES; + int prev_slots = atomic_fetch_add_explicit(&cf->db->active_flushes, 1, memory_order_acq_rel); + if (prev_slots >= slot_max) + { + atomic_fetch_sub_explicit(&cf->db->active_flushes, 1, memory_order_release); + /* mark the flush deferred so the reaper retries it once a slot frees -- + * a deferred flush must not be left waiting for a future write to + * re-trigger it, or an idle cf could sit over its threshold forever */ + atomic_store_explicit(&cf->flush_deferred, 1, memory_order_release); + if (!already_holds_lock) + { + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + } + if (tdb_log_throttle(cf->db, &cf->last_backpressure_log_sec, + TDB_BACKPRESSURE_STALL_LOG_INTERVAL_SEC)) + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' deferring flush, global cap %d reached", cf->name, + slot_max); + return TDB_SUCCESS; + } + + /* a flush slot was acquired -- any pending deferral for this cf is now served */ + atomic_store_explicit(&cf->flush_deferred, 0, memory_order_release); + + /* we check again after acquiring is_flushing in case drop happened between checks */ + if 
(atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire)) + { + atomic_fetch_sub_explicit(&cf->db->active_flushes, 1, memory_order_release); + if (!already_holds_lock) + { + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + } + return TDB_SUCCESS; + } + + /* we update cached_current_time to ensure TTL checks during flush use fresh time */ + atomic_store(&cf->db->cached_current_time, tdb_get_current_time()); + + tidesdb_memtable_t *old_mt = atomic_load_explicit(&cf->active_memtable, memory_order_acquire); + skip_list_t *old_memtable = old_mt ? old_mt->skip_list : NULL; + size_t current_size = old_memtable ? (size_t)skip_list_get_size(old_memtable) : 0; + int current_entries = old_memtable ? skip_list_count_entries(old_memtable) : 0; + + if (current_entries == 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' memtable is empty, skipping flush", cf->name); + atomic_fetch_sub_explicit(&cf->db->active_flushes, 1, memory_order_release); + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + return TDB_SUCCESS; + } + + /* we only check size threshold if not forcing flush */ + if (!force && current_size < cf->config.write_buffer_size) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' memtable size %zu < threshold %zu and force=0, skipping flush", + cf->name, current_size, cf->config.write_buffer_size); + atomic_fetch_sub_explicit(&cf->db->active_flushes, 1, memory_order_release); + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + return TDB_SUCCESS; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' is flushing memtable (entries: %d, size: %zu bytes / %.2f MB, " + "threshold: %zu bytes " + "/ %.2f MB)", + cf->name, current_entries, current_size, current_size / (1024.0 * 1024.0), + cf->config.write_buffer_size, cf->config.write_buffer_size / (1024.0 * 1024.0)); + + block_manager_t *old_wal = old_mt ? 
old_mt->wal : NULL; + uint64_t sst_id = atomic_fetch_add(&cf->next_sstable_id, 1); + + /* if using TDB_SYNC_INTERVAL, sync the old WAL before rotation + * this essentially ensures WAL durability before it becomes immutable */ + if (cf->config.sync_mode == TDB_SYNC_INTERVAL && old_wal) + { + block_manager_escalate_fsync(old_wal); + } + + skip_list_comparator_fn comparator_fn = NULL; + void *comparator_ctx = NULL; + if (tidesdb_resolve_comparator(cf->db, &cf->config, &comparator_fn, &comparator_ctx) != 0) + { + /* comparator not found, use default memcmp */ + comparator_fn = skip_list_comparator_memcmp; + comparator_ctx = NULL; + } + + /* we check marked_for_deletion again before allocating resources + * this prevents leaking memtable/WAL if CF is being dropped */ + if (atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire)) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' is marked for deletion, aborting flush before resource allocation", + cf->name); + atomic_fetch_sub_explicit(&cf->db->active_flushes, 1, memory_order_release); + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + return TDB_SUCCESS; + } + + skip_list_t *new_memtable; + if (skip_list_new_with_arena(&new_memtable, cf->config.skip_list_max_level, + cf->config.skip_list_probability, comparator_fn, comparator_ctx, + &cf->db->cached_current_time, + cf->config.write_buffer_size * 2) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' failed to create new memtable", cf->name); + atomic_fetch_sub_explicit(&cf->db->active_flushes, 1, memory_order_release); + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + return TDB_ERR_MEMORY; + } + + /* in unified memtable mode, per-CF WALs are not used */ + block_manager_t *new_wal = NULL; + if (!cf->db->unified_mt.enabled) + { + const uint64_t wal_id = sst_id + 1; + char wal_path[MAX_FILE_PATH_LENGTH]; + snprintf(wal_path, sizeof(wal_path), + "%s" PATH_SEPARATOR TDB_WAL_PREFIX TDB_U64_FMT TDB_WAL_EXT, cf->directory, + 
TDB_U64_CAST(wal_id)); + + if (tidesdb_bm_open(cf->db, &new_wal, wal_path, convert_sync_mode(cf->config.sync_mode)) != + 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' failed to open new WAL '%s', %s", cf->name, + wal_path, strerror(errno)); + skip_list_free(new_memtable); + atomic_fetch_sub_explicit(&cf->db->active_flushes, 1, memory_order_release); + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + return TDB_ERR_IO; + } + + if (block_manager_truncate(new_wal) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' failed to truncate new WAL, %s", cf->name, + wal_path); + block_manager_close(new_wal); + skip_list_free(new_memtable); + atomic_fetch_sub_explicit(&cf->db->active_flushes, 1, memory_order_release); + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + return TDB_ERR_IO; + } + } + + /* we sync CF directory to persist new WAL file entry */ + if (new_wal) tdb_sync_directory(cf->directory); + + /* we create new tidesdb_memtable_t structure pairing skip_list and wal */ + tidesdb_memtable_t *new_mt = malloc(sizeof(tidesdb_memtable_t)); + if (!new_mt) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' failed to allocate new memtable structure", cf->name); + skip_list_free(new_memtable); + if (new_wal) block_manager_close(new_wal); + atomic_fetch_sub_explicit(&cf->db->active_flushes, 1, memory_order_release); + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + return TDB_ERR_MEMORY; + } + new_mt->skip_list = new_memtable; + new_mt->wal = new_wal; /* NULL in unified mode */ + new_mt->id = sst_id + 1; + new_mt->generation = old_mt ? 
old_mt->generation + 1 : 1; + atomic_init(&new_mt->refcount, 1); + atomic_init(&new_mt->writers, 0); + atomic_init(&new_mt->flushed, 0); + + /** we check marked_for_deletion again after allocating resources + * this handles the race where CF is dropped while we were allocating */ + if (atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire)) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' is marked for deletion, cleaning up newly allocated resources", + cf->name); + skip_list_free(new_memtable); + if (new_wal) block_manager_close(new_wal); + free(new_mt); + atomic_fetch_sub_explicit(&cf->db->active_flushes, 1, memory_order_release); + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + return TDB_SUCCESS; + } + + /* we reuse old_mt directly as the immutable memtable instead of allocating + * a new structure. another thread that loaded cf->active_memtable before + * the swap below still holds the old_mt pointer and will try_ref it via + * the active_mt_readers epoch. 
the immutable-cleanup loop drains that + * epoch before free()ing the struct so the late try_ref's refcount load + * is on live memory (and correctly returns 0 if cleanup already CAS'd + * refcount to 0) */ + tidesdb_immutable_memtable_t *immutable = old_mt; + if (!immutable) + { + /** no old memtable to flush -- this shouldnt happen but handle gracefully + * store new_mt as active before returning so the CF has a usable memtable */ + TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' no old memtable to flush", cf->name); + atomic_store_explicit(&cf->active_memtable, new_mt, memory_order_release); + atomic_fetch_sub_explicit(&cf->db->active_flushes, 1, memory_order_release); + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + return TDB_SUCCESS; + } + + /** old_mt already has correct skip_list, wal, id, generation, and refcount + * just reset flushed flag */ + atomic_store_explicit(&immutable->flushed, 0, memory_order_release); + + /* we enforce a hard cap on the immutable queue to prevent truly unbounded growth. + * if the queue is already at the hard cap, we block briefly to let the flush worker + * drain it. 
this is a last-resort safety net -- normal backpressure should prevent + * reaching this point */ + { + const size_t hard_cap = tdb_cf_immutable_hard_cap(cf); + const size_t imm_qsize = queue_size(cf->immutable_memtables); + if (imm_qsize >= hard_cap) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "CF '%s' immutable queue at hard cap %zu >= %zu, blocking until drained", + cf->name, imm_qsize, hard_cap); + int wait_iters = 0; + while (queue_size(cf->immutable_memtables) >= hard_cap && + wait_iters < TDB_IMMUTABLE_HARD_CAP_MAX_WAIT) + { + usleep(TDB_IMMUTABLE_HARD_CAP_WAIT_US); + wait_iters++; + } + if (wait_iters >= TDB_IMMUTABLE_HARD_CAP_MAX_WAIT) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "CF '%s' immutable queue hard cap wait timeout after %d ms", cf->name, + wait_iters * (TDB_IMMUTABLE_HARD_CAP_WAIT_US / 1000)); + } + } + } + + /* we enqueue immutable and publish snapshot before swapping the active pointer. + * this eliminates a visibility gap where the old memtable is neither active nor in + * the immutable snapshot. readers seeing old_mt in both active and immutable is + * harmless because active is always checked first. old_mt has flushed=0, so the + * cleanup code will not free it while it is still the active memtable. + * is_flushing CAS ensures only one flush runs per CF at a time. */ + if (queue_enqueue(cf->immutable_memtables, immutable) != 0) + { + TDB_DEBUG_LOG( + TDB_LOG_ERROR, + "CF '%s' CRITICAL, failed to enqueue immutable memtable - data in WAL for recovery", + cf->name); + + /* the active pointer has not been swapped yet, so old_mt (aliased by immutable) + * is still cf->active_memtable and may be in use by writers and readers -- + * freeing it, its skip list or its WAL here would leave the cf with a dangling + * active memtable. we release only the replacement memtable, its WAL handle and + * its wrapper, which nothing else can see yet; the old memtable keeps serving + * the cf and a later flush attempt retries the rotation */ + skip_list_free(new_memtable); + if (new_wal) block_manager_close(new_wal); + free(new_mt); + atomic_fetch_sub_explicit(&cf->db->active_flushes, 1, memory_order_release); + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + return TDB_ERR_MEMORY; + } + + (void)tidesdb_imm_snap_publish(cf); + + /* we swap active_memtable pointer after publishing the immutable snapshot. 
+ * new writers will use the new memtable. the old memtable is already visible + * in the immutable snapshot, so readers will always find committed data. + * no need to wait for old memtable refcount to drain here because: + * -- old memtable is now immutable and enqueued for background flush + * -- refcount naturally drains as in-flight writers finish + * -- tidesdb_immutable_memtable_unref() handles cleanup when refcount hits 0 */ + atomic_store_explicit(&cf->active_memtable, new_mt, memory_order_release); + atomic_thread_fence(memory_order_seq_cst); + + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' memtable swapped, allocating flush work for SSTable %" PRIu64, cf->name, + sst_id); + + tidesdb_flush_work_t *work = malloc(sizeof(tidesdb_flush_work_t)); + if (!work) + { + /** immutable is already queued but flush will never happen + * we must clean it up to prevent memory leak */ + tidesdb_immutable_memtable_unref(immutable); + atomic_fetch_sub_explicit(&cf->db->active_flushes, 1, memory_order_release); + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + return TDB_ERR_MEMORY; + } + + work->cf = cf; + work->imm = immutable; + work->sst_id = sst_id; + work->unified_sl = NULL; + work->unified_barrier = NULL; + + tidesdb_immutable_memtable_ref(immutable); + + size_t queue_size_before = queue_size(cf->db->flush_queue); + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' is enqueueing flush work for SSTable %" PRIu64 + " (queue size before: %zu)", + cf->name, sst_id, queue_size_before); + + /*** we increment flush_pending_count before enqueue so that checkpoint/close + ** can never see a window where the item is in the queue (or dequeued by a worker) + * but the counter is still 0. the worker decrements after completing I/O. 
+ * per-CF mirror lets drop_column_family wait only for this CF's pending work */ + atomic_fetch_add_explicit(&cf->db->flush_pending_count, 1, memory_order_release); + atomic_fetch_add_explicit(&cf->flush_pending_count, 1, memory_order_release); + + /** we retry enqueue with backoff -- we must not lose this flush work + * the WAL has been rotated and data is only in the immutable memtable */ + int enqueue_attempts = 0; + while (queue_enqueue(cf->db->flush_queue, work) != 0) + { + enqueue_attempts++; + if (enqueue_attempts >= TDB_FLUSH_ENQUEUE_MAX_ATTEMPTS) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "CF '%s' failed to enqueue flush work after %d attempts for SSTable " + "%" PRIu64, + cf->name, TDB_FLUSH_ENQUEUE_MAX_ATTEMPTS, sst_id); + tidesdb_immutable_memtable_unref(immutable); /* remove work ref */ + free(work); + atomic_fetch_sub_explicit(&cf->db->flush_pending_count, 1, memory_order_release); + atomic_fetch_sub_explicit(&cf->flush_pending_count, 1, memory_order_release); + atomic_fetch_sub_explicit(&cf->db->active_flushes, 1, memory_order_release); + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + return TDB_ERR_MEMORY; + } + TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' flush queue full, retry %d/%d for SSTable %" PRIu64, + cf->name, enqueue_attempts, TDB_FLUSH_ENQUEUE_MAX_ATTEMPTS, sst_id); + usleep(TDB_FLUSH_ENQUEUE_BACKOFF_US); + } + + const size_t queue_size_after = queue_size(cf->db->flush_queue); + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' has successfully enqueued flush work for SSTable %" PRIu64 + " (queue size after: %zu)", + cf->name, sst_id, queue_size_after); + + /* rotate critical section is done. the worker holds the active_flushes slot + * until the sstable is committed and releases it from the flush worker loop. 
*/ + if (!already_holds_lock) + { + atomic_store_explicit(&cf->is_flushing, 0, memory_order_release); + } + return TDB_SUCCESS; +} + +static int tidesdb_enqueue_compaction(tidesdb_column_family_t *cf, int full_compaction) +{ + if (!cf) return TDB_ERR_INVALID_ARGS; + + if (atomic_load_explicit(&cf->is_compacting, memory_order_acquire)) + { + /* compaction already running. arm a follow-up so the worker + * re-enqueues once it finishes this round, otherwise a trigger that + * arrived mid-compaction is silently coalesced into nothing */ + atomic_store_explicit(&cf->compaction_armed, 1, memory_order_release); + return TDB_SUCCESS; + } + + /* we enqueue compaction work -- calloc so the steer fields default to zero + * (no tombstone steering) */ + tidesdb_compaction_work_t *work = calloc(1, sizeof(tidesdb_compaction_work_t)); + if (!work) + { + return TDB_ERR_MEMORY; + } + + work->cf = cf; + work->full_compaction = full_compaction; + atomic_fetch_add_explicit(&cf->compaction_pending_count, 1, memory_order_release); + if (queue_enqueue(cf->db->compaction_queue, work) != 0) + { + atomic_fetch_sub_explicit(&cf->compaction_pending_count, 1, memory_order_release); + free(work); + return TDB_ERR_MEMORY; + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_compact_internal + * shared body for manual full compaction. blocking=0 enqueues and returns + * immediately, the auto-trigger and reaper paths use this shape. 
blocking=1 + * parks the caller on a per-call done signal that the worker fires on every + * exit path that consumes the work item, and never coalesces against an + * in-flight compaction -- the caller's request runs as its own work item + * @param cf column family + * @param full_compaction 1 for a true full merge, 0 for geometry-driven + * @param blocking 1 to wait until the work item is serviced + * @return TDB_SUCCESS once the work has been serviced (blocking) or enqueued + * (non-blocking); error codes on alloc/queue failure + */ +static int tidesdb_compact_internal(tidesdb_column_family_t *cf, int full_compaction, int blocking) +{ + if (!cf) return TDB_ERR_INVALID_ARGS; + + if (!blocking) return tidesdb_enqueue_compaction(cf, full_compaction); + + pthread_mutex_t done_mu; + pthread_cond_t done_cv; + _Atomic(int) done_flag; + pthread_mutex_init(&done_mu, NULL); + pthread_cond_init(&done_cv, NULL); + atomic_init(&done_flag, 0); + + tidesdb_compaction_work_t *work = calloc(1, sizeof(tidesdb_compaction_work_t)); + if (!work) + { + pthread_cond_destroy(&done_cv); + pthread_mutex_destroy(&done_mu); + return TDB_ERR_MEMORY; + } + work->cf = cf; + work->full_compaction = full_compaction; + work->done_mu = &done_mu; + work->done_cv = &done_cv; + work->done_flag = &done_flag; + + atomic_fetch_add_explicit(&cf->compaction_pending_count, 1, memory_order_release); + if (queue_enqueue(cf->db->compaction_queue, work) != 0) + { + atomic_fetch_sub_explicit(&cf->compaction_pending_count, 1, memory_order_release); + free(work); + pthread_cond_destroy(&done_cv); + pthread_mutex_destroy(&done_mu); + return TDB_ERR_MEMORY; + } + + pthread_mutex_lock(&done_mu); + while (!atomic_load_explicit(&done_flag, memory_order_acquire)) + pthread_cond_wait(&done_cv, &done_mu); + pthread_mutex_unlock(&done_mu); + pthread_cond_destroy(&done_cv); + pthread_mutex_destroy(&done_mu); + return TDB_SUCCESS; +} + +int tidesdb_compact(tidesdb_column_family_t *cf) +{ + /* manual full compaction. 
merges every level into the largest so all + * garbage (tombstones, single-delete pairs, superseded puts) is + * reclaimed. blocks until the worker has finished servicing the request, + * including any in-flight compaction the worker is already running on + * this cf */ + return tidesdb_compact_internal(cf, 1, 1); +} + +/** + * tidesdb_compact_steer_to_bottom + * enqueues a tombstone-steered compaction -- the worker will targeted-merge the + * [min_key, max_key] range down to the largest level so a tombstone-dense + * sstable's regular tombstones reach where they can finally drop. takes + * ownership of the malloc'd min_key/max_key copies (worker frees them, or this + * frees them on an enqueue failure). a no-op if either key copy is missing or + * a compaction is already running. + * @param cf the column family + * @param min_key malloc'd copy of the dense sstable's min key + * @param min_key_size size of min_key + * @param max_key malloc'd copy of the dense sstable's max key + * @param max_key_size size of max_key + * @return TDB_SUCCESS (enqueued or skipped), TDB_ERR_MEMORY on alloc failure + */ +static int tidesdb_compact_steer_to_bottom(tidesdb_column_family_t *cf, uint8_t *min_key, + size_t min_key_size, uint8_t *max_key, + size_t max_key_size) +{ + if (!cf || !min_key || !max_key || min_key_size == 0 || max_key_size == 0) + { + free(min_key); + free(max_key); + return TDB_SUCCESS; + } + + if (atomic_load_explicit(&cf->is_compacting, memory_order_acquire)) + { + /* compaction already running -- skip, the keys are no longer needed. 
+ * arm a follow-up so the worker schedules a geometry round once it + * finishes; the density witness state cannot survive the drop but + * the next flush's witness check will re-detect it if still dense */ + atomic_store_explicit(&cf->compaction_armed, 1, memory_order_release); + free(min_key); + free(max_key); + return TDB_SUCCESS; + } + + tidesdb_compaction_work_t *work = calloc(1, sizeof(tidesdb_compaction_work_t)); + if (!work) + { + free(min_key); + free(max_key); + return TDB_ERR_MEMORY; + } + + work->cf = cf; + work->steer_to_bottom = 1; + work->steer_min_key = min_key; + work->steer_min_key_size = min_key_size; + work->steer_max_key = max_key; + work->steer_max_key_size = max_key_size; + + atomic_fetch_add_explicit(&cf->compaction_pending_count, 1, memory_order_release); + if (queue_enqueue(cf->db->compaction_queue, work) != 0) + { + atomic_fetch_sub_explicit(&cf->compaction_pending_count, 1, memory_order_release); + free(work->steer_min_key); + free(work->steer_max_key); + free(work); + return TDB_ERR_MEMORY; + } + + return TDB_SUCCESS; +} + +/** + * tdb_range_overlap_check + * checks whether an sstable's [min_key, max_key] intersects [start_key, end_key). + * NULL endpoints mean unbounded on that side. uses the cf comparator so custom + * orderings behave correctly. 
+ *
+ * @return 1 if the sstable overlaps the range, 0 otherwise (also 0 when either
+ *         sstable bound is NULL)
+ */
+static int tdb_range_overlap_check(skip_list_comparator_fn cmp_fn, void *cmp_ctx,
+                                   const uint8_t *sst_min, size_t sst_min_size,
+                                   const uint8_t *sst_max, size_t sst_max_size,
+                                   const uint8_t *start_key, size_t start_key_size,
+                                   const uint8_t *end_key, size_t end_key_size)
+{
+    /* an sstable without both bounds can never be matched */
+    if (sst_min == NULL || sst_max == NULL) return 0;
+
+    /* lower bound is inclusive -- the sstable is out when its max sorts before start_key */
+    const int ends_before_start =
+        start_key != NULL &&
+        cmp_fn(sst_max, sst_max_size, start_key, start_key_size, cmp_ctx) < 0;
+    if (ends_before_start) return 0;
+
+    /* upper bound is exclusive -- the sstable is out when its min sorts at or after end_key */
+    const int begins_at_or_after_end =
+        end_key != NULL && cmp_fn(sst_min, sst_min_size, end_key, end_key_size, cmp_ctx) >= 0;
+    return begins_at_or_after_end ? 0 : 1;
+}
+
+/**
+ * tidesdb_compact_range_internal
+ * collects every sstable whose key range overlaps [start_key, end_key) and
+ * targeted-merges them. target_level_override >= 0 forces that target level --
+ * used to steer a tombstone-dense range down to the largest level, the one
+ * place regular (non single-delete) tombstones can finally drop. a negative
+ * override keeps the default of merging into max_input_level.
+ *
+ * @param cf the column family
+ * @param start_key range start (NULL = unbounded); start_key and end_key must
+ *        not both be NULL
+ * @param start_key_size size of start_key, must be non-zero when start_key is set
+ * @param end_key range end, exclusive (NULL = unbounded)
+ * @param end_key_size size of end_key, must be non-zero when end_key is set
+ * @param target_level_override forced 0-based target level, or < 0 for default.
+ *        a value >= the active level count is ignored, and a value below the
+ *        shallowest input level is raised to that level
+ * @return TDB_SUCCESS (also when no sstable overlaps the range),
+ *         TDB_ERR_INVALID_ARGS on bad arguments or a cf marked for deletion,
+ *         TDB_ERR_LOCKED when is_compacting cannot be claimed,
+ *         TDB_ERR_MEMORY when the input array cannot grow,
+ *         otherwise whatever tidesdb_targeted_merge returns
+ */
+static int tidesdb_compact_range_internal(tidesdb_column_family_t *cf, const uint8_t *start_key,
+                                          size_t start_key_size, const uint8_t *end_key,
+                                          size_t end_key_size, int target_level_override)
+{
+    if (!cf) return TDB_ERR_INVALID_ARGS;
+    if (!start_key && !end_key) return TDB_ERR_INVALID_ARGS; /* a fully unbounded range is rejected */
+    if (start_key && start_key_size == 0) return TDB_ERR_INVALID_ARGS;
+    if (end_key && end_key_size == 0) return TDB_ERR_INVALID_ARGS;
+
+    if (atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire))
+        return TDB_ERR_INVALID_ARGS;
+
+    /** we wait briefly for any in-progress compaction to drain so we don't immediately
+     * reject when the system is otherwise idle. the wait is bounded by
+     * TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS polls; the CAS below is what
+     * actually decides ownership */
+    for (int i = 0; i < TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS; i++)
+    {
+        if (!atomic_load_explicit(&cf->is_compacting, memory_order_acquire)) break;
+        usleep(TDB_COMPACTION_FLUSH_WAIT_SLEEP_US);
+    }
+
+    int expected = 0; /* claim is_compacting 0 -> 1; every exit below must store 0 again */
+    if (!atomic_compare_exchange_strong_explicit(&cf->is_compacting, &expected, 1,
+                                                 memory_order_acquire, memory_order_relaxed))
+    {
+        return TDB_ERR_LOCKED;
+    }
+
+    if (atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire)) /* re-check now that the flag is held */
+    {
+        atomic_store_explicit(&cf->is_compacting, 0, memory_order_release);
+        return TDB_ERR_INVALID_ARGS;
+    }
+
+    atomic_store(&cf->db->cached_current_time, tdb_get_current_time());
+
+    /* we force flush so any in-memory data joins the merge. the flush result is
+     * not checked and the wait below gives up after
+     * TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS polls, so the merge proceeds
+     * either way.
+     * NOTE(review): queue_size(cf->db->flush_queue) is the db-wide queue, so
+     * other column families' flushes presumably extend this wait -- confirm */
+    tidesdb_flush_memtable_internal(cf, 0, 1);
+    for (int i = 0; i < TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS; i++)
+    {
+        if (queue_size(cf->db->flush_queue) == 0 &&
+            !atomic_load_explicit(&cf->is_flushing, memory_order_acquire))
+        {
+            break;
+        }
+        usleep(TDB_COMPACTION_FLUSH_WAIT_SLEEP_US);
+    }
+
+    skip_list_comparator_fn cmp_fn = NULL;
+    void *cmp_ctx = NULL;
+    tidesdb_resolve_comparator(cf->db, &cf->config, &cmp_fn, &cmp_ctx);
+
+    const int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire);
+
+    /* we collect every sstable whose min/max key range overlaps the user range, ref each
+     * one so cleanup_merged_sstables can hand the ref back when the merge finishes.
+     * inputs grows by doubling, starting at TDB_STACK_SSTS entries */
+    tidesdb_sstable_t **inputs = NULL;
+    int input_capacity = 0;
+    int input_count = 0;
+    int min_input_level = num_levels; /* shallowest level that contributed an input */
+    int max_input_level = -1;         /* deepest level that contributed an input */
+
+    for (int lv = 0; lv < num_levels; lv++)
+    {
+        tidesdb_level_t *lvl = cf->levels[lv];
+        if (!lvl) continue;
+
+        atomic_fetch_add_explicit(&lvl->array_readers, 1, memory_order_acq_rel); /* held while the array is walked */
+
+        const int num_ssts = atomic_load_explicit(&lvl->num_sstables, memory_order_acquire);
+        tidesdb_sstable_t **ssts = atomic_load_explicit(&lvl->sstables, memory_order_acquire);
+
+        for (int i = 0; ssts && i < num_ssts; i++)
+        {
+            tidesdb_sstable_t *sst = ssts[i];
+            if (!sst) continue;
+            if (!tdb_range_overlap_check(cmp_fn, cmp_ctx, sst->min_key, sst->min_key_size,
+                                         sst->max_key, sst->max_key_size, start_key, start_key_size,
+                                         end_key, end_key_size))
+                continue;
+
+            if (input_count == input_capacity)
+            {
+                int new_cap = input_capacity == 0 ? TDB_STACK_SSTS : input_capacity * 2;
+                tidesdb_sstable_t **bigger =
+                    realloc(inputs, (size_t)new_cap * sizeof(tidesdb_sstable_t *));
+                if (!bigger) /* inputs is still valid here: unwind the refs taken so far */
+                {
+                    atomic_fetch_sub_explicit(&lvl->array_readers, 1, memory_order_release);
+                    for (int j = 0; j < input_count; j++) tidesdb_sstable_unref(cf->db, inputs[j]);
+                    free(inputs);
+                    atomic_store_explicit(&cf->is_compacting, 0, memory_order_release);
+                    return TDB_ERR_MEMORY;
+                }
+                inputs = bigger;
+                input_capacity = new_cap;
+            }
+
+            tidesdb_sstable_ref(sst);
+            inputs[input_count++] = sst;
+            if (lv < min_input_level) min_input_level = lv;
+            if (lv > max_input_level) max_input_level = lv;
+        }
+
+        atomic_fetch_sub_explicit(&lvl->array_readers, 1, memory_order_release);
+    }
+
+    if (input_count == 0) /* nothing overlaps: not an error */
+    {
+        free(inputs);
+        atomic_store_explicit(&cf->is_compacting, 0, memory_order_release);
+        TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' no sstables overlap requested range", cf->name);
+        return TDB_SUCCESS;
+    }
+
+    /* merge into the largest level affected so any tombstones in the range that
+     * meet their dead puts get a shot at dropping when the target is the bottom.
+     * a caller can override this to force the largest level of the whole cf --
+     * regular tombstones only drop there, so steering a dense range down is the
+     * difference between the tombstones dying and lingering forever.
+     * an override outside [0, num_levels) is ignored, and one below
+     * min_input_level is raised to min_input_level.
+     * NOTE(review): an override between min_input_level and max_input_level
+     * is accepted as-is, so the target can be shallower than the deepest
+     * input -- confirm tidesdb_targeted_merge supports that */
+    int target_level = max_input_level;
+    if (target_level_override >= 0 && target_level_override < num_levels)
+    {
+        target_level = target_level_override;
+        if (target_level < min_input_level) target_level = min_input_level;
+    }
+
+    const int merge_result = tidesdb_targeted_merge(cf, inputs, input_count, min_input_level,
+                                                    max_input_level, target_level);
+    free(inputs); /* frees only the pointer array; the sstable refs are not dropped here */
+
+    atomic_store_explicit(&cf->is_compacting, 0, memory_order_release);
+    return merge_result;
+}
+
+int tidesdb_compact_range(tidesdb_column_family_t *cf, const uint8_t *start_key,
+                          size_t start_key_size, const uint8_t *end_key, size_t end_key_size)
+{
+    /* public api keeps the default behavior -- merge into max_input_level.
+     * arguments are validated by tidesdb_compact_range_internal, which also
+     * supplies the return code; -1 means "no target level override" */
+    return tidesdb_compact_range_internal(cf, start_key, start_key_size, end_key, end_key_size, -1);
+}
+
+/**
+ * tidesdb_apply_backpressure
+ * checks L0 queue and L1 file count and applies coordinated backpressure
+ * implements stall mechanism when L0 queue exceeds threshold (blocking flush)
+ * @param cf the column family
+ * @return TDB_SUCCESS or error code
+ */
+static int tidesdb_apply_backpressure(tidesdb_column_family_t *cf)
+{
+    if (!cf) return TDB_ERR_INVALID_ARGS;
+
+    /* L0 depth -- in unified mode every write lands in the shared unified
+     * memtable, so the per-CF immutable queue stays empty and the unified
+     * immutable queue is the one to watch */
+    queue_t *l0_queue = (cf->db && cf->db->unified_mt.enabled && cf->db->unified_mt.immutables)
+                            ?
cf->db->unified_mt.immutables + : cf->immutable_memtables; + const size_t l0_queue_depth = queue_size(l0_queue); + + /* we check L1 file count */ + int l1_file_count = atomic_load_explicit(&cf->levels[0]->num_sstables, memory_order_acquire); + + const size_t effective_stall = tdb_cf_effective_stall(cf); + const int effective_l1_trigger = tdb_cf_effective_l1_trigger(cf); + + /** l0 queue exceeds threshold -- force blocking flush of all immutables + * this prevents unbounded memory growth when flush worker falls behind */ + int l0_delayed = 0; /* track if L0/L1 already applied a delay */ + if (l0_queue_depth >= effective_stall) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "CF '%s' L0 queue stall triggered %zu immutables (effective_threshold=%zu, " + "configured=%d) - blocking until flushes complete", + cf->name, l0_queue_depth, effective_stall, + cf->config.l0_queue_stall_threshold); + + /** flow-control wait in which we block while the flush worker drains the queue below the + * threshold. we keep waiting as long as progress is happening -- either the + * queue depth shrinks or the global flush heartbeat advances (a worker is + * actively flushing). we only give up after TDB_BACKPRESSURE_STALL_MAX_ITERATIONS + * consecutive polls with zero progress, which means the flush engine is genuinely + * wedged rather than merely slow. a healthy but saturated system simply paces the + * writer here instead of failing the commit. 
*/ + int total_iterations = 0; + int no_progress = 0; + size_t best_depth = queue_size(l0_queue); + uint64_t last_heartbeat = + atomic_load_explicit(&cf->db->flush_heartbeat, memory_order_relaxed); + while (queue_size(l0_queue) >= effective_stall) + { + usleep(TDB_BACKPRESSURE_STALL_CHECK_INTERVAL_US); + total_iterations++; + + const size_t cur_depth = queue_size(l0_queue); + const uint64_t cur_heartbeat = + atomic_load_explicit(&cf->db->flush_heartbeat, memory_order_relaxed); + + if (cur_depth < best_depth || cur_heartbeat != last_heartbeat) + { + /* queue prog is draining or a flush worker is actively working */ + best_depth = cur_depth; + last_heartbeat = cur_heartbeat; + no_progress = 0; + } + else if (++no_progress >= TDB_BACKPRESSURE_STALL_MAX_ITERATIONS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "CF '%s' L0 queue stall, no flush progress for %dms - " + "flush engine appears wedged", + cf->name, + no_progress * (TDB_BACKPRESSURE_STALL_CHECK_INTERVAL_US / 1000)); + return TDB_ERR_BUSY; + } + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' L0 queue stall resolved after %dms", cf->name, + total_iterations * (TDB_BACKPRESSURE_STALL_CHECK_INTERVAL_US / 1000)); + l0_delayed = 1; + } + + /* L1 file count does NOT gate writes. compaction (L1->L2+) is serialized per CF and is + * structurally slower than flush inflow, so L1 settles at a workload-dependent count; blocking + * the write/flush pipeline on it only converts a compaction-throughput limit into a stop-start + * stall and starves flushing (which is independent of compaction). the L1 graduated *delays* + * below pace writes gently without stopping them, memtable memory is bounded by the L0 queue + * stall and the active-memtable ceiling, and the open-fd working set is bounded by the reader + * fd reserve plus the reaper -- none of which need L1 file count as a write gate. */ + + /* per-cf active memtable ceiling. 
tidesdb_flush_memtable_internal silently + * defers the rotate when the active_flushes slot cap is reached, so the L0 + * queue stall and L1 file-count delays above are not enough to bound the + * active memtable when writes outpace the flush slots. stall the writer + * here when the active exceeds ACTIVE_MT_CEILING_MULT x write_buffer_size + * until rotation completes. unified mode uses its own branch below */ + if (cf->db && !cf->db->unified_mt.enabled && cf->config.write_buffer_size > 0) + { + const size_t ceiling = + TDB_BACKPRESSURE_ACTIVE_MT_CEILING_MULT * cf->config.write_buffer_size; + size_t active_size = 0; + tidesdb_memtable_t *amt = NULL; + if (tidesdb_active_memtable_try_ref(&cf->active_mt_readers, &cf->active_memtable, &amt)) + { + if (amt->skip_list) active_size = (size_t)skip_list_get_size(amt->skip_list); + tidesdb_immutable_memtable_unref(amt); + } + if (active_size >= ceiling) + { + if (tdb_log_throttle(cf->db, &cf->last_ceiling_stall_log_sec, + TDB_BACKPRESSURE_STALL_LOG_INTERVAL_SEC)) + TDB_DEBUG_LOG(TDB_LOG_WARN, + "CF '%s' active memtable ceiling stall %zu bytes >= %zu (%dx wbuf)", + cf->name, active_size, ceiling, + TDB_BACKPRESSURE_ACTIVE_MT_CEILING_MULT); + + /* kick a force-flush so rotation runs as soon as a slot frees, instead + * of waiting for the reaper's deferred-flush retry cycle. 
if the slot + * cap is hit this returns SUCCESS after setting flush_deferred=1 */ + if (!atomic_load_explicit(&cf->is_flushing, memory_order_relaxed)) + tidesdb_flush_memtable_internal(cf, 0, 1); + + int total_iterations = 0; + int no_progress = 0; + size_t best_size = active_size; + uint64_t last_heartbeat = + atomic_load_explicit(&cf->db->flush_heartbeat, memory_order_relaxed); + while (1) + { + size_t cur_size = 0; + tidesdb_memtable_t *cur_amt = NULL; + if (tidesdb_active_memtable_try_ref(&cf->active_mt_readers, &cf->active_memtable, + &cur_amt)) + { + if (cur_amt->skip_list) + cur_size = (size_t)skip_list_get_size(cur_amt->skip_list); + tidesdb_immutable_memtable_unref(cur_amt); + } + if (cur_size < ceiling) break; + + usleep(TDB_BACKPRESSURE_STALL_CHECK_INTERVAL_US); + total_iterations++; + + const uint64_t cur_heartbeat = + atomic_load_explicit(&cf->db->flush_heartbeat, memory_order_relaxed); + if (cur_size < best_size || cur_heartbeat != last_heartbeat) + { + best_size = cur_size; + last_heartbeat = cur_heartbeat; + no_progress = 0; + } + else if (++no_progress >= TDB_BACKPRESSURE_STALL_MAX_ITERATIONS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "CF '%s' active memtable ceiling stall, no rotate progress for " + "%dms - flush engine appears wedged", + cf->name, + no_progress * (TDB_BACKPRESSURE_STALL_CHECK_INTERVAL_US / 1000)); + return TDB_ERR_BUSY; + } + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' active memtable ceiling stall resolved after %dms", + cf->name, + total_iterations * (TDB_BACKPRESSURE_STALL_CHECK_INTERVAL_US / 1000)); + l0_delayed = 1; + } + } + + /* unified active memtable ceiling. tidesdb_unified_memtable_rotate runs + * under a single-rotator CAS on unified_mt.is_flushing -- every writer + * that loses the CAS skips the rotate and proceeds, so a burst of writers + * crossing the threshold simultaneously can pile data into the active + * before the winner publishes the new one. 
same shape as the per-cf + * stall above but rotation is kicked through the same CAS+rotate path + * tidesdb_txn_commit uses, not through flush_memtable_internal */ + if (cf->db && cf->db->unified_mt.enabled && cf->db->unified_mt.write_buffer_size > 0) + { + const size_t u_ceiling = + TDB_BACKPRESSURE_ACTIVE_MT_CEILING_MULT * cf->db->unified_mt.write_buffer_size; + size_t u_size = 0; + tidesdb_memtable_t *umt = NULL; + if (tidesdb_active_memtable_try_ref(&cf->db->unified_mt.active_mt_readers, + &cf->db->unified_mt.active, &umt)) + { + if (umt->skip_list) u_size = (size_t)skip_list_get_size(umt->skip_list); + tidesdb_immutable_memtable_unref(umt); + } + if (u_size >= u_ceiling) + { + if (tdb_log_throttle(cf->db, &cf->db->unified_mt.last_ceiling_stall_log_sec, + TDB_BACKPRESSURE_STALL_LOG_INTERVAL_SEC)) + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Unified active memtable ceiling stall %zu bytes >= %zu (%dx wbuf)", + u_size, u_ceiling, TDB_BACKPRESSURE_ACTIVE_MT_CEILING_MULT); + + int expected = 0; + if (atomic_compare_exchange_strong_explicit(&cf->db->unified_mt.is_flushing, &expected, + 1, memory_order_acquire, + memory_order_relaxed)) + { + tidesdb_unified_memtable_rotate(cf->db); + atomic_store_explicit(&cf->db->unified_mt.is_flushing, 0, memory_order_release); + } + + int total_iterations = 0; + int no_progress = 0; + size_t best_size = u_size; + uint64_t last_heartbeat = + atomic_load_explicit(&cf->db->flush_heartbeat, memory_order_relaxed); + while (1) + { + size_t cur_size = 0; + tidesdb_memtable_t *cur_umt = NULL; + if (tidesdb_active_memtable_try_ref(&cf->db->unified_mt.active_mt_readers, + &cf->db->unified_mt.active, &cur_umt)) + { + if (cur_umt->skip_list) + cur_size = (size_t)skip_list_get_size(cur_umt->skip_list); + tidesdb_immutable_memtable_unref(cur_umt); + } + if (cur_size < u_ceiling) break; + + usleep(TDB_BACKPRESSURE_STALL_CHECK_INTERVAL_US); + total_iterations++; + + const uint64_t cur_heartbeat = + atomic_load_explicit(&cf->db->flush_heartbeat, 
memory_order_relaxed); + if (cur_size < best_size || cur_heartbeat != last_heartbeat) + { + best_size = cur_size; + last_heartbeat = cur_heartbeat; + no_progress = 0; + } + else if (++no_progress >= TDB_BACKPRESSURE_STALL_MAX_ITERATIONS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "unified active memtable ceiling stall: no rotate progress for " + "%dms - flush engine appears wedged", + no_progress * (TDB_BACKPRESSURE_STALL_CHECK_INTERVAL_US / 1000)); + return TDB_ERR_BUSY; + } + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Unified active memtable ceiling stall resolved after %dms", + total_iterations * (TDB_BACKPRESSURE_STALL_CHECK_INTERVAL_US / 1000)); + l0_delayed = 1; + } + } + + /* L0/L1 graduated delays. skip if any stall above already paced this + * commit */ + if (!l0_delayed) + { + if (l0_queue_depth >= + (size_t)((double)effective_stall * TDB_BACKPRESSURE_HIGH_THRESHOLD_RATIO) || + l1_file_count >= (effective_l1_trigger * TDB_BACKPRESSURE_L1_HIGH_MULTIPLIER)) + { + /** high pressure -- TDB_BACKPRESSURE_HIGH_THRESHOLD_RATIO of stall threshold or + * TDB_BACKPRESSURE_L1_HIGH_MULTIPLIER x effective L1 trigger */ + usleep(TDB_BACKPRESSURE_HIGH_DELAY_US); + if (tdb_log_throttle(cf->db, &cf->last_backpressure_log_sec, + TDB_BACKPRESSURE_STALL_LOG_INTERVAL_SEC)) + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' high backpressure L0=%zu L1=%d - %dus delay", + cf->name, l0_queue_depth, l1_file_count, + TDB_BACKPRESSURE_HIGH_DELAY_US); + l0_delayed = 1; + } + else if (l0_queue_depth >= (size_t)((double)effective_stall * + TDB_BACKPRESSURE_MODERATE_THRESHOLD_RATIO) || + l1_file_count >= (effective_l1_trigger * TDB_BACKPRESSURE_L1_MODERATE_MULTIPLIER)) + { + /** moderate pressure -- TDB_BACKPRESSURE_MODERATE_THRESHOLD_RATIO of stall threshold or + * TDB_BACKPRESSURE_L1_MODERATE_MULTIPLIER x effective L1 trigger */ + usleep(TDB_BACKPRESSURE_MODERATE_DELAY_US); + if (tdb_log_throttle(cf->db, &cf->last_backpressure_log_sec, + TDB_BACKPRESSURE_STALL_LOG_INTERVAL_SEC)) + TDB_DEBUG_LOG(TDB_LOG_INFO, + 
"CF '%s' moderate backpressure L0=%zu L1=%d - %dus delay", cf->name, + l0_queue_depth, l1_file_count, TDB_BACKPRESSURE_MODERATE_DELAY_US); + l0_delayed = 1; + } + } + + /**** global memory pressure (computed by reaper every Nms, single atomic_load) + *** critical blocking and self-help flushes always fire regardless of L0 delay. + ** high/elevated delays are skipped if L0/L1 already applied a delay to avoid + * double-sleeping on the same commit (the L0 delay already throttled ingestion). */ + if (cf->db) + { + int pressure = atomic_load_explicit(&cf->db->memory_pressure_level, memory_order_relaxed); + if (pressure >= TDB_MEMORY_PRESSURE_CRITICAL) + { + /* critical -- self-help flush before blocking if this CF isnt already flushing */ + if (!atomic_load_explicit(&cf->is_flushing, memory_order_relaxed)) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "CF '%s' global memory pressure CRITICAL - self-flush before stall", + cf->name); + tidesdb_flush_memtable_internal(cf, 0, 1); + } + + TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' global memory pressure CRITICAL - blocking writes", + cf->name); + int wait = 0; + while (atomic_load_explicit(&cf->db->memory_pressure_level, memory_order_relaxed) >= + TDB_MEMORY_PRESSURE_CRITICAL) + { + usleep(TDB_BACKPRESSURE_STALL_CHECK_INTERVAL_US); + if (++wait >= TDB_BACKPRESSURE_STALL_MAX_ITERATIONS) + { + TDB_DEBUG_LOG( + TDB_LOG_ERROR, + "CF '%s' global memory pressure stall timeout after %d iterations", + cf->name, wait); + return TDB_ERR_BUSY; + } + } + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' global memory pressure stall resolved after %d iterations", + cf->name, wait); + } + else if (pressure >= TDB_MEMORY_PRESSURE_HIGH) + { + /* high -- we force flush this CF; skip delay if L0 already throttled */ + tidesdb_flush_memtable_internal(cf, 0, 1); + if (!l0_delayed) usleep(TDB_BACKPRESSURE_HIGH_DELAY_US); + } + else if (pressure >= TDB_MEMORY_PRESSURE_ELEVATED) + { + /* elevated -- proactive flush + tiny yield (skip yield if L0 already throttled) */ + if 
(!atomic_load_explicit(&cf->is_flushing, memory_order_relaxed)) + tidesdb_flush_memtable_internal(cf, 0, 0); + if (!l0_delayed) usleep(TDB_BACKPRESSURE_ELEVATED_DELAY_US); + } + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_txn_add_cf_internal + * internal helper to add a CF to transaction and take snapshot + * @param txn the transaction + * @param cf the column family + */ +static int tidesdb_txn_add_cf_internal(tidesdb_txn_t *txn, tidesdb_column_family_t *cf); + +/** + * tidesdb_txn_remove_from_active_list + * internal helper to remove a SERIALIZABLE transaction from the active list + * @param txn the transaction to remove + */ +/** + * tidesdb_min_active_snapshot_seq + * scans active_txns for the smallest snapshot_seq still in use. compaction uses + * this to decide whether an older same-key version is still needed by some + * reader. returns UINT64_MAX when no snapshot-fixed txn is active, which means + * compaction may drop any superseded version. + */ +static uint64_t tidesdb_min_active_snapshot_seq(tidesdb_t *db) +{ + if (!db) return UINT64_MAX; + + uint64_t min_seq = UINT64_MAX; + pthread_rwlock_rdlock(&db->active_txns_lock); + const int count = db->num_active_txns; + tidesdb_txn_t **active = db->active_txns; + for (int i = 0; i < count; i++) + { + tidesdb_txn_t *other = active[i]; + if (!other || other->is_committed || other->is_aborted) continue; + if (other->isolation_level < TDB_ISOLATION_REPEATABLE_READ) continue; + if (other->snapshot_seq < min_seq) min_seq = other->snapshot_seq; + } + pthread_rwlock_unlock(&db->active_txns_lock); + return min_seq; +} + +static void tidesdb_txn_remove_from_active_list(tidesdb_txn_t *txn) +{ + if (!txn || !txn->db) return; + if (txn->isolation_level < TDB_ISOLATION_REPEATABLE_READ) return; + + pthread_rwlock_wrlock(&txn->db->active_txns_lock); + for (int i = 0; i < txn->db->num_active_txns; i++) + { + if (txn->db->active_txns[i] == txn) + { + /* the list is scanned as an unordered set, so swap the last entry into + * 
this slot for O(1) removal instead of shifting the tail down */ + txn->db->active_txns[i] = txn->db->active_txns[txn->db->num_active_txns - 1]; + txn->db->num_active_txns--; + break; + } + } + pthread_rwlock_unlock(&txn->db->active_txns_lock); +} + +/** + * tidesdb_txn_add_to_read_set + * internal helper to add a key to the read set for conflict detection + * @param txn the transaction + * @param cf the column family + * @param key the key + * @param key_size the key size + * @param seq the sequence number + * @return 0 on success, -1 on failure + */ +static int tidesdb_txn_add_to_read_set(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, + const uint8_t *key, const size_t key_size, + const uint64_t seq) +{ + /*** we skip read tracking for isolation levels that dont need conflict detection + ** SNAPSHOT only needs write-write conflict detection (no read set tracking) + * only REPEATABLE_READ and SERIALIZABLE need read tracking */ + if (txn->isolation_level != TDB_ISOLATION_REPEATABLE_READ && + txn->isolation_level != TDB_ISOLATION_SERIALIZABLE) + { + return 0; + } + + /** we check last few entries first (hot cache, likely duplicates) + * most iterators read sequentially, so recent keys are often duplicates */ + const int check_recent = (txn->read_set_count < 8) ? 
txn->read_set_count : 8; + for (int i = txn->read_set_count - 1; i >= txn->read_set_count - check_recent; i--) + { + if (txn->read_cfs[i] == cf && txn->read_key_sizes[i] == key_size && + memcmp(txn->read_keys[i], key, key_size) == 0) + { + /* already in read set, we update sequence if newer */ + if (seq > txn->read_seqs[i]) + { + txn->read_seqs[i] = seq; + } + return 0; + } + } + + if (txn->read_set_count >= txn->read_set_capacity) + { + int new_cap = txn->read_set_capacity * 2; + if (new_cap < txn->read_set_capacity + TDB_TXN_READ_SET_BATCH_GROW) + { + new_cap = txn->read_set_capacity + TDB_TXN_READ_SET_BATCH_GROW; + } + + uint8_t **new_keys = realloc(txn->read_keys, new_cap * sizeof(uint8_t *)); + if (!new_keys) return -1; + + size_t *new_sizes = realloc(txn->read_key_sizes, new_cap * sizeof(size_t)); + if (!new_sizes) + { + /* new_keys succeeded, so we need to keep it */ + txn->read_keys = new_keys; + return -1; + } + + uint64_t *new_seqs = realloc(txn->read_seqs, new_cap * sizeof(uint64_t)); + if (!new_seqs) + { + txn->read_keys = new_keys; + txn->read_key_sizes = new_sizes; + return -1; + } + + tidesdb_column_family_t **new_cfs = + realloc(txn->read_cfs, new_cap * sizeof(tidesdb_column_family_t *)); + if (!new_cfs) + { + txn->read_keys = new_keys; + txn->read_key_sizes = new_sizes; + txn->read_seqs = new_seqs; + return -1; + } + + txn->read_keys = new_keys; + txn->read_key_sizes = new_sizes; + txn->read_seqs = new_seqs; + txn->read_cfs = new_cfs; + txn->read_set_capacity = new_cap; + } + + /* we utilize arena allocation for read keys to reduce malloc overhead */ + uint8_t *key_ptr = NULL; + + /* we check if current arena has space */ + if (txn->read_key_arenas && txn->read_key_arena_count > 0) + { + const size_t remaining = TDB_TXN_READ_KEY_ARENA_SIZE - txn->read_key_arena_used; + if (key_size <= remaining) + { + /* bump allocate from current arena */ + key_ptr = + txn->read_key_arenas[txn->read_key_arena_count - 1] + txn->read_key_arena_used; + 
txn->read_key_arena_used += key_size; + } + } + + /* we need new arena or first allocation */ + if (!key_ptr) + { + const size_t arena_size = + (key_size > TDB_TXN_READ_KEY_ARENA_SIZE) ? key_size : TDB_TXN_READ_KEY_ARENA_SIZE; + uint8_t *new_arena = malloc(arena_size); + if (!new_arena) return -1; + + /* we grow arena array if needed */ + if (!txn->read_key_arenas) + { + txn->read_key_arenas = + malloc(TDB_TXN_READ_KEY_ARENA_INITIAL_CAPACITY * sizeof(uint8_t *)); + if (!txn->read_key_arenas) + { + free(new_arena); + return -1; + } + } + else if ((txn->read_key_arena_count & (txn->read_key_arena_count - 1)) == 0 && + txn->read_key_arena_count >= TDB_TXN_READ_KEY_ARENA_INITIAL_CAPACITY) + { + /* power of 2 and >= initial capacity, double the array */ + const int new_cap = txn->read_key_arena_count * 2; + uint8_t **new_arenas = realloc(txn->read_key_arenas, new_cap * sizeof(uint8_t *)); + if (!new_arenas) + { + free(new_arena); + return -1; + } + txn->read_key_arenas = new_arenas; + } + + txn->read_key_arenas[txn->read_key_arena_count++] = new_arena; + key_ptr = new_arena; + txn->read_key_arena_used = key_size; + + /* account the newly allocated read-key arena (amortized per arena, off the per-read path) + */ + txn->mem_bytes += (int64_t)arena_size; + tidesdb_txn_mem_publish(txn); + } + + memcpy(key_ptr, key, key_size); + txn->read_keys[txn->read_set_count] = key_ptr; + txn->read_key_sizes[txn->read_set_count] = key_size; + txn->read_seqs[txn->read_set_count] = seq; + txn->read_cfs[txn->read_set_count] = cf; + + txn->read_set_count++; + if (txn->read_set_count == TDB_TXN_READ_HASH_THRESHOLD && !txn->read_set_hash) + { + txn->read_set_hash = tidesdb_read_set_hash_create(); + if (txn->read_set_hash) + { + /* we populate hash with all existing reads */ + for (int i = 0; i < txn->read_set_count; i++) + { + tidesdb_read_set_hash_insert((tidesdb_read_set_hash_t *)txn->read_set_hash, txn, i); + } + } + } + else if (txn->read_set_hash) + { + /* we add new read to existing 
hash */ + tidesdb_read_set_hash_insert((tidesdb_read_set_hash_t *)txn->read_set_hash, txn, + txn->read_set_count - 1); + } + + return 0; +} + +/** + * tidesdb_txn_begin + * begins a new transaction with default isolation level (READ_COMMITTED) + * @param db database handle + * @param txn output transaction handle + * @return TDB_SUCCESS or error code + */ +int tidesdb_txn_begin(tidesdb_t *db, tidesdb_txn_t **txn) +{ + return tidesdb_txn_begin_with_isolation(db, TDB_ISOLATION_READ_COMMITTED, txn); +} + +/** + * tidesdb_txn_begin_with_isolation + * begins a new transaction with specified isolation level + * + * isolation levels + * -- READ_UNCOMMITTED sees all versions including uncommitted (dirty reads allowed) + * -- READ_COMMITTED refreshes snapshot on each read (prevents dirty reads) + * -- REPEATABLE_READ consistent snapshot, read-write conflict detection + * -- SNAPSHOT consistent snapshot, write-write conflict detection only + * -- SERIALIZABLE SSI with dangerous structure detection (prevents all anomalies) + * + * @param db database handle + * @param isolation isolation level + * @param txn output transaction handle + * @return TDB_SUCCESS or error code + */ +int tidesdb_txn_begin_with_isolation(tidesdb_t *db, const tidesdb_isolation_level_t isolation, + tidesdb_txn_t **txn) +{ + if (!db || !txn) return TDB_ERR_INVALID_ARGS; + + const int wait_result = wait_for_open(db); + if (wait_result != TDB_SUCCESS) + { + return wait_result; + } + + if (isolation < TDB_ISOLATION_READ_UNCOMMITTED || isolation > TDB_ISOLATION_SERIALIZABLE) + { + return TDB_ERR_INVALID_ARGS; + } + + *txn = calloc(1, sizeof(tidesdb_txn_t)); + if (!*txn) return TDB_ERR_MEMORY; + + (*txn)->db = db; + (*txn)->isolation_level = isolation; + + /* we assign unique transaction id from database counter */ + (*txn)->txn_id = atomic_fetch_add_explicit(&db->next_txn_id, 1, memory_order_relaxed); + + if (isolation == TDB_ISOLATION_READ_UNCOMMITTED) + { + (*txn)->snapshot_seq = UINT64_MAX; /* we see all 
versions */ + } + else if (isolation == TDB_ISOLATION_READ_COMMITTED) + { + /* we snapshot will be refreshed on each read -- initial value doesnt matter */ + (*txn)->snapshot_seq = 0; + } + else + { + /** REPEATABLE_READ, SNAPSHOT, SERIALIZABLE = consistent snapshot + * we capture global_seq -- 1 to see only transactions committed before we started */ + uint64_t current_seq = atomic_load_explicit(&db->global_seq, memory_order_acquire); + (*txn)->snapshot_seq = (current_seq > 0) ? current_seq - 1 : 0; + } + + (*txn)->commit_seq = 0; + + (*txn)->ops_capacity = TDB_INITIAL_TXN_OPS_CAPACITY; + (*txn)->ops = calloc((*txn)->ops_capacity, sizeof(tidesdb_txn_op_t)); + if (!(*txn)->ops) + { + free(*txn); + *txn = NULL; + return TDB_ERR_MEMORY; + } + + /*** we defer read set allocation for isolation levels that dont need read conflict detection + ** only REPEATABLE_READ and SERIALIZABLE need read tracking + * SNAPSHOT uses write-write conflict detection only (no read set needed) */ + if (isolation == TDB_ISOLATION_REPEATABLE_READ || isolation == TDB_ISOLATION_SERIALIZABLE) + { + (*txn)->read_set_capacity = TDB_INITIAL_TXN_READ_SET_CAPACITY; + (*txn)->read_keys = calloc((*txn)->read_set_capacity, sizeof(uint8_t *)); + (*txn)->read_key_sizes = calloc((*txn)->read_set_capacity, sizeof(size_t)); + (*txn)->read_seqs = calloc((*txn)->read_set_capacity, sizeof(uint64_t)); + (*txn)->read_cfs = calloc((*txn)->read_set_capacity, sizeof(tidesdb_column_family_t *)); + + if (!(*txn)->read_keys || !(*txn)->read_key_sizes || !(*txn)->read_seqs || + !(*txn)->read_cfs) + { + free((*txn)->read_keys); + free((*txn)->read_key_sizes); + free((*txn)->read_seqs); + free((*txn)->read_cfs); + free((*txn)->ops); + free(*txn); + *txn = NULL; + return TDB_ERR_MEMORY; + } + } + else + { + /* low isolation levels dont track reads */ + (*txn)->read_set_capacity = 0; + (*txn)->read_keys = NULL; + (*txn)->read_key_sizes = NULL; + (*txn)->read_seqs = NULL; + (*txn)->read_cfs = NULL; + } + + 
(*txn)->write_set_hash = NULL; /* hash table created lazily for large transactions */ + (*txn)->read_set_hash = NULL; /* hash table created lazily for large read sets */ + + (*txn)->cf_capacity = TDB_INITIAL_TXN_CF_CAPACITY; + (*txn)->cfs = calloc((*txn)->cf_capacity, sizeof(tidesdb_column_family_t *)); + + if (!(*txn)->cfs) + { + free((*txn)->read_keys); + free((*txn)->read_key_sizes); + free((*txn)->read_seqs); + free((*txn)->read_cfs); + free((*txn)->ops); + free(*txn); + *txn = NULL; + return TDB_ERR_MEMORY; + } + + (*txn)->savepoints_capacity = TDB_INITIAL_TXN_SAVEPOINT_CAPACITY; + (*txn)->savepoint_op_counts = calloc((*txn)->savepoints_capacity, sizeof(int)); + (*txn)->savepoint_cf_counts = calloc((*txn)->savepoints_capacity, sizeof(int)); + (*txn)->savepoint_names = calloc((*txn)->savepoints_capacity, sizeof(char *)); + + if (!(*txn)->savepoint_op_counts || !(*txn)->savepoint_cf_counts || !(*txn)->savepoint_names) + { + free((*txn)->savepoint_op_counts); + free((*txn)->savepoint_cf_counts); + free((*txn)->savepoint_names); + free((*txn)->cfs); + free((*txn)->read_keys); + free((*txn)->read_key_sizes); + free((*txn)->read_seqs); + free((*txn)->read_cfs); + free((*txn)->ops); + free(*txn); + *txn = NULL; + return TDB_ERR_MEMORY; + } + + (*txn)->num_cfs = 0; + + (*txn)->has_rw_conflict_in = 0; + (*txn)->has_rw_conflict_out = 0; + + /* we register snapshot-fixed transactions in active list so SSI can scan them + * (filtered to SERIALIZABLE) and so compaction can read the min snapshot_seq to + * retain older versions still needed by an active reader */ + if (isolation >= TDB_ISOLATION_REPEATABLE_READ) + { + pthread_rwlock_wrlock(&db->active_txns_lock); + + if (db->num_active_txns < db->active_txns_capacity) + { + db->active_txns[db->num_active_txns++] = *txn; + } + else + { + /*** the capacity exceeded, we log warning but continue. 
+ ** this transaction wont participate in SSI conflict detection or + * in the compaction snapshot floor, but it will still see its own + *** consistent snapshot until any compaction drops a needed version. */ + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Active transaction list full (%d), SSI and snapshot retention may be " + "less effective", + db->active_txns_capacity); + } + + pthread_rwlock_unlock(&db->active_txns_lock); + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_txn_add_cf_internal + * internal helper to add a CF to transaction and take snapshot + * @param txn + * @param cf + * @return error code + */ +static int tidesdb_txn_add_cf_internal(tidesdb_txn_t *txn, tidesdb_column_family_t *cf) +{ + if (!txn || !cf) return -1; + if (txn->is_committed || txn->is_aborted) return -1; + + /* we check last-used CF (covers single-CF workloads in O(1)) */ + if (txn->last_cf == cf) return txn->last_cf_index; + + for (int i = 0; i < txn->num_cfs; i++) + { + if (txn->cfs[i] == cf) + { + txn->last_cf = cf; + txn->last_cf_index = i; + return i; + } + } + + if (txn->num_cfs >= txn->cf_capacity) + { + /* we check if we've hit the maximum column family limit */ + if (txn->cf_capacity >= TDB_MAX_TXN_CFS) + { + return -1; + } + + int new_cap = txn->cf_capacity * 2; + + /* we cap at maximum to prevent overflow */ + if (new_cap > TDB_MAX_TXN_CFS) new_cap = TDB_MAX_TXN_CFS; + + tidesdb_column_family_t **new_cfs = + realloc(txn->cfs, new_cap * sizeof(tidesdb_column_family_t *)); + + if (!new_cfs) return -1; + + for (int i = txn->cf_capacity; i < new_cap; i++) + { + new_cfs[i] = NULL; + } + + txn->cfs = new_cfs; + txn->cf_capacity = new_cap; + } + + const int cf_index = txn->num_cfs; + txn->cfs[cf_index] = cf; + txn->num_cfs++; + + txn->last_cf = cf; + txn->last_cf_index = cf_index; + + return cf_index; +} + +int tidesdb_txn_put(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, const uint8_t *key, + const size_t key_size, const uint8_t *value, const size_t value_size, + const time_t ttl) +{ + 
if (!txn || !cf || !key || key_size == 0 || !value) return TDB_ERR_INVALID_ARGS; + + /* we wait for database to finish opening, or fail if shutting down */ + if (!txn->db) return TDB_ERR_INVALID_ARGS; + + if (atomic_load_explicit(&txn->db->replica_mode, memory_order_relaxed)) return TDB_ERR_READONLY; + + /* we validate key-value size against memory limits */ + const int size_check = tidesdb_validate_kv_size(txn->db, key_size, value_size); + if (size_check != 0) return size_check; + if (txn->is_committed || txn->is_aborted) return TDB_ERR_INVALID_ARGS; + + /* we add CF to transaction if not already added */ + const int cf_index = tidesdb_txn_add_cf_internal(txn, cf); + if (cf_index < 0) return TDB_ERR_MEMORY; + + if (txn->num_ops >= TDB_MAX_TXN_OPS) + { + return TDB_ERR_TOO_LARGE; + } + + if (txn->num_ops >= txn->ops_capacity) + { + int new_capacity = txn->ops_capacity * 2; + + /* we ensure we dont exceed max even with doubling */ + if (new_capacity > TDB_MAX_TXN_OPS) new_capacity = TDB_MAX_TXN_OPS; + + if (new_capacity <= txn->ops_capacity) return TDB_ERR_TOO_LARGE; + + tidesdb_txn_op_t *new_ops = realloc(txn->ops, new_capacity * sizeof(tidesdb_txn_op_t)); + if (!new_ops) return TDB_ERR_MEMORY; + + txn->ops = new_ops; + txn->ops_capacity = new_capacity; + } + + tidesdb_txn_op_t *op = &txn->ops[txn->num_ops]; + memset(op, 0, sizeof(tidesdb_txn_op_t)); + + /*** we coalesce key+value into a single allocation to halve malloc pressure + ** op->value points into the same buffer at offset key_size + * only op->key should be freed (it owns the entire buffer) */ + const size_t kv_alloc_size = key_size + ((value && value_size > 0) ? 
value_size : 0); + op->key = malloc(kv_alloc_size); + if (!op->key) return TDB_ERR_MEMORY; + memcpy(op->key, key, key_size); + op->key_size = key_size; + + if (value && value_size > 0) + { + op->value = op->key + key_size; + memcpy(op->value, value, value_size); + op->value_size = value_size; + } + else + { + op->value = NULL; + op->value_size = 0; + } + + op->ttl = ttl; + op->is_delete = 0; + op->cf = cf; + + txn->num_ops++; + + /* account this op's coalesced key+value buffer (threshold-batched, off the hot path) */ + txn->mem_bytes += (int64_t)(op->key_size + op->value_size); + tidesdb_txn_mem_publish(txn); + + if (txn->num_ops == TDB_TXN_WRITE_HASH_THRESHOLD && !txn->write_set_hash) + { + txn->write_set_hash = tidesdb_write_set_hash_create(); + if (txn->write_set_hash) + { + /* we populate hash with all existing operations */ + for (int i = 0; i < txn->num_ops; i++) + { + tidesdb_write_set_hash_insert((tidesdb_write_set_hash_t *)txn->write_set_hash, txn, + i); + } + } + } + else if (txn->write_set_hash) + { + tidesdb_write_set_hash_insert((tidesdb_write_set_hash_t *)txn->write_set_hash, txn, + txn->num_ops - 1); + } + + return TDB_SUCCESS; +} + +int tidesdb_txn_get(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, const uint8_t *key, + const size_t key_size, uint8_t **value, size_t *value_size) +{ + if (!txn || !cf || !key || key_size == 0 || !value || !value_size) return TDB_ERR_INVALID_ARGS; + + PROFILE_INC(txn->db, total_reads); + + /* we wait for database to finish opening, or fail if shutting down */ + if (!txn->db) return TDB_ERR_INVALID_ARGS; + + /* we add CF to transaction if not already added */ + const int cf_index = tidesdb_txn_add_cf_internal(txn, cf); + if (cf_index < 0) return TDB_ERR_MEMORY; + + /* we check write set first (read your own writes) + * transaction must see its own uncommitted changes before checking cache/memtable + * we use search strategy based on transaction size: + * -- small txns linear scan from end (cache-friendly, low 
overhead) + * -- medium txns linear scan with early termination per CF + * -- large txns O(1) hash table lookup + * + * we search in reverse order (newest first) to find most recent write */ + + /* for large transactions, we use hash table for O(1) lookup */ + if (txn->write_set_hash) + { + const int op_index = tidesdb_write_set_hash_lookup( + (tidesdb_write_set_hash_t *)txn->write_set_hash, txn, cf, key, key_size); + + if (op_index >= 0) + { + tidesdb_txn_op_t *op = &txn->ops[op_index]; + if (op->is_delete) + { + return TDB_ERR_NOT_FOUND; + } + *value = malloc(op->value_size); + if (!*value) return TDB_ERR_MEMORY; + memcpy(*value, op->value, op->value_size); + *value_size = op->value_size; + return TDB_SUCCESS; + } + /* not in write set, fall through to memtable search */ + } + else + { + /** for small transactions, scan last N ops only + * this handles 99% of cases with minimal overhead */ + const int scan_start = txn->num_ops - 1; + const int scan_end = (txn->num_ops > TDB_TXN_SMALL_SCAN_LIMIT) + ? 
(txn->num_ops - TDB_TXN_SMALL_SCAN_LIMIT) + : 0; + + for (int i = scan_start; i >= scan_end; i--) + { + const tidesdb_txn_op_t *op = &txn->ops[i]; + + /* we do a quick CF check first (pointer comparison) */ + if (op->cf != cf) continue; + + /* then size check (cheap integer comparison) */ + if (op->key_size != key_size) continue; + + /* finally memcmp (most expensive) */ + if (memcmp(op->key, key, key_size) == 0) + { + if (op->is_delete) + { + return TDB_ERR_NOT_FOUND; + } + *value = malloc(op->value_size); + if (!*value) return TDB_ERR_MEMORY; + memcpy(*value, op->value, op->value_size); + *value_size = op->value_size; + return TDB_SUCCESS; + } + } + + /* if transaction is large and we didnt find in recent ops, we scan remainder */ + if (scan_end > 0) + { + for (int i = scan_end - 1; i >= 0; i--) + { + tidesdb_txn_op_t *op = &txn->ops[i]; + if (op->cf != cf) continue; + if (op->key_size != key_size) continue; + if (memcmp(op->key, key, key_size) == 0) + { + if (op->is_delete) return TDB_ERR_NOT_FOUND; + *value = malloc(op->value_size); + if (!*value) return TDB_ERR_MEMORY; + memcpy(*value, op->value, op->value_size); + *value_size = op->value_size; + return TDB_SUCCESS; + } + } + } + } + + /* we determine snapshot based on isolation level + * -- READ_UNCOMMITTED UINT64_MAX (see all versions, no visibility + * check) + * -- READ_COMMITTED refresh snapshot on each read (latest committed + * data) + * -- REPEATABLE_READ/SNAPSHOT/SERIALIZABLE we use consistent snapshot from BEGIN */ + uint64_t snapshot_seq; + skip_list_visibility_check_fn visibility_check; + + if (txn->isolation_level == TDB_ISOLATION_READ_UNCOMMITTED) + { + snapshot_seq = UINT64_MAX; + visibility_check = NULL; /* no visibility check -- see everything */ + } + else if (txn->isolation_level == TDB_ISOLATION_READ_COMMITTED) + { + /* we refresh snapshot to see latest committed data + * READ_COMMITTED doesnt need visibility callback because: + * 1. 
it refreshes snapshot on each read to see all data up to current global_seq + * 2. commit status buffer is circular and can have stale entries after recovery + * 3. any data in memtable with seq <= snapshot_seq is considered visible + * + * we use current_seq (not current_seq - 1) because committed transactions have + * seq <= global_seq. After recovery, global_seq is set to max_seq from ssts, + * so we need snapshot_seq = global_seq to see all committed data. */ + uint64_t current_seq = atomic_load_explicit(&txn->db->global_seq, memory_order_acquire); + snapshot_seq = current_seq; + visibility_check = NULL; /* no visibility check needed for READ_COMMITTED */ + } + else + { + /* REPEATABLE_READ, SNAPSHOT, SERIALIZABLE = consistent snapshot */ + snapshot_seq = txn->snapshot_seq; + visibility_check = tidesdb_visibility_check_callback; + } + + /** we cache current time once for consistent TTL checks throughout this read. + * declared here so both the unified goto path and the normal path see it. 
*/ + const int64_t now = (int64_t)atomic_load(&txn->db->cached_current_time); + + /* unified memtable read pat, we search shared skip list with prefixed key */ + if (txn->db->unified_mt.enabled) + { + const size_t pk_total = TDB_UNIFIED_CF_PREFIX_SIZE + key_size; + TDB_PREFIXED_KEY_ALLOC(prefixed_key, pk_total, _pk_stack1); + if (!prefixed_key) return TDB_ERR_MEMORY; + size_t pk_size = tdb_build_prefixed_key(cf->unified_cf_index, key, key_size, prefixed_key); + + int unified_rc = TDB_ERR_NOT_FOUND; + const int64_t now_u = (int64_t)atomic_load(&txn->db->cached_current_time); + const uint8_t *temp_val; + size_t temp_val_size; + int64_t ttl_u; + uint8_t deleted_u; + uint64_t found_seq_u = 0; + + /* we search unified active memtable */ + tidesdb_memtable_t *umt = NULL; + int umt_refed = tidesdb_active_memtable_try_ref(&txn->db->unified_mt.active_mt_readers, + &txn->db->unified_mt.active, &umt); + if (umt_refed) + { + int mr = skip_list_get_with_seq_ref(umt->skip_list, prefixed_key, pk_size, &temp_val, + &temp_val_size, &ttl_u, &deleted_u, &found_seq_u, + snapshot_seq, visibility_check, + visibility_check ? 
txn->db->commit_status : NULL); + if (mr == 0) + { + if (deleted_u) + { + tidesdb_immutable_memtable_unref(umt); + unified_rc = TDB_ERR_NOT_FOUND; + goto unified_memtable_done; + } + if (ttl_u <= 0 || ttl_u > now_u) + { + *value = malloc(temp_val_size); + if (!*value) + { + tidesdb_immutable_memtable_unref(umt); + unified_rc = TDB_ERR_MEMORY; + goto unified_memtable_done; + } + memcpy(*value, temp_val, temp_val_size); + *value_size = temp_val_size; + tidesdb_immutable_memtable_unref(umt); + PROFILE_INC(txn->db, memtable_hits); + tidesdb_txn_add_to_read_set(txn, cf, key, key_size, found_seq_u); + unified_rc = TDB_SUCCESS; + goto unified_memtable_done; + } + tidesdb_immutable_memtable_unref(umt); + unified_rc = TDB_ERR_NOT_FOUND; + goto unified_memtable_done; + } + tidesdb_immutable_memtable_unref(umt); + } + + /**** we search unified immutable memtables (newest first). + *** we snapshot pointers under a single rwlock acquisition and pin each + ** immutable with a refcount so a concurrent flush-worker eviction + * cannot free one out from under the scan. */ + queue_t *uimm_q = txn->db->unified_mt.immutables; + if (uimm_q) + { + const size_t uimm_count = atomic_load_explicit(&uimm_q->size, memory_order_relaxed); + if (uimm_count > 0) + { + tidesdb_memtable_t *uimm_stack[TDB_STACK_IMM_SNAPSHOT]; + tidesdb_memtable_t **uimm_ptrs = uimm_stack; + if (uimm_count > TDB_STACK_IMM_SNAPSHOT) + { + uimm_ptrs = malloc(uimm_count * sizeof(tidesdb_memtable_t *)); + if (!uimm_ptrs) uimm_ptrs = uimm_stack; + } + + /* we pin each immutable under the queue read lock -- queue_remove_if + * holds the matching write lock, so every entry we see is still + * live and try_ref keeps it alive past the unlock */ + size_t snap_count = 0; + pthread_rwlock_rdlock(&uimm_q->read_lock); + { + queue_node_t *cur = uimm_q->head->next; + size_t max = (uimm_ptrs == uimm_stack) ? 
TDB_STACK_IMM_SNAPSHOT : uimm_count; + for (size_t i = 0; i < max && cur != NULL; i++, cur = cur->next) + { + tidesdb_memtable_t *imm_mt = (tidesdb_memtable_t *)cur->data; + uimm_ptrs[snap_count++] = tidesdb_memtable_try_ref(imm_mt) ? imm_mt : NULL; + } + } + pthread_rwlock_unlock(&uimm_q->read_lock); + + /* we search the pinned snapshot (newest first) */ + int found = 0; + for (size_t qi = snap_count; qi > 0 && !found; qi--) + { + tidesdb_memtable_t *imm_mt = uimm_ptrs[qi - 1]; + if (!imm_mt || !imm_mt->skip_list) continue; + if (atomic_load_explicit(&imm_mt->flushed, memory_order_acquire)) continue; + + int mr = skip_list_get_with_seq_ref( + imm_mt->skip_list, prefixed_key, pk_size, &temp_val, &temp_val_size, &ttl_u, + &deleted_u, &found_seq_u, snapshot_seq, visibility_check, + visibility_check ? txn->db->commit_status : NULL); + if (mr != 0) continue; + + found = 1; + if (deleted_u) + { + unified_rc = TDB_ERR_NOT_FOUND; + } + else if (ttl_u <= 0 || ttl_u > now_u) + { + *value = malloc(temp_val_size); + if (!*value) + { + unified_rc = TDB_ERR_MEMORY; + } + else + { + memcpy(*value, temp_val, temp_val_size); + *value_size = temp_val_size; + PROFILE_INC(txn->db, immutable_hits); + tidesdb_txn_add_to_read_set(txn, cf, key, key_size, found_seq_u); + unified_rc = TDB_SUCCESS; + } + } + else + { + unified_rc = TDB_ERR_NOT_FOUND; + } + } + + /* we release every pin, then the snapshot array */ + for (size_t i = 0; i < snap_count; i++) + { + if (uimm_ptrs[i]) tidesdb_immutable_memtable_unref(uimm_ptrs[i]); + } + if (uimm_ptrs != uimm_stack) free(uimm_ptrs); + + if (found) goto unified_memtable_done; + } + } + + /* not in unified memtables, we fall through to per-CF sstable search */ + TDB_PREFIXED_KEY_FREE(prefixed_key, _pk_stack1); + goto unified_sst_search; + + unified_memtable_done: + TDB_PREFIXED_KEY_FREE(prefixed_key, _pk_stack1); + return unified_rc; + } + + /**** we now load active memtable with refcount protection + *** skip_list_get_with_seq_ref returns a 
zero-copy pointer into the arena, + ** so the memtable must stay alive through the memcpy. + * we use CAS-based try_ref to safely handle concurrent rotation+cleanup. + * if try_ref fails the memtable is being freed, we fall through to immutables */ + tidesdb_memtable_t *active_mt_struct = NULL; + int active_mt_refed = tidesdb_active_memtable_try_ref(&cf->active_mt_readers, + &cf->active_memtable, &active_mt_struct); + skip_list_t *active_mt = active_mt_refed ? active_mt_struct->skip_list : NULL; + + atomic_thread_fence(memory_order_acquire); + + const uint8_t *temp_value; + size_t temp_value_size; + int64_t ttl; + uint8_t deleted; + uint64_t found_seq = 0; + + int memtable_result = skip_list_get_with_seq_ref( + active_mt, key, key_size, &temp_value, &temp_value_size, &ttl, &deleted, &found_seq, + snapshot_seq, visibility_check, txn->db->commit_status); + + if (memtable_result == 0) + { + if (deleted) + { + if (active_mt_refed) tidesdb_immutable_memtable_unref(active_mt_struct); + return TDB_ERR_NOT_FOUND; + } + + if (ttl <= 0 || ttl > now) + { + *value = malloc(temp_value_size); + if (*value == NULL) + { + if (active_mt_refed) tidesdb_immutable_memtable_unref(active_mt_struct); + return TDB_ERR_MEMORY; + } + memcpy(*value, temp_value, temp_value_size); + *value_size = temp_value_size; + + if (active_mt_refed) tidesdb_immutable_memtable_unref(active_mt_struct); + + PROFILE_INC(txn->db, memtable_hits); + tidesdb_txn_add_to_read_set(txn, cf, key, key_size, found_seq); + return TDB_SUCCESS; + } + + if (active_mt_refed) tidesdb_immutable_memtable_unref(active_mt_struct); + return TDB_ERR_NOT_FOUND; + } + + /* active memtable ref no longer needed -- value was not found there */ + if (active_mt_refed) tidesdb_immutable_memtable_unref(active_mt_struct); + + /*** we use lock-free snapshot to search immutable memtables + ** acquire holds a reader count on the snapshot slot -- no malloc, no per-item refs + * items are valid while the snapshot slot is held */ + 
tidesdb_imm_snap_t *imm_snap = tidesdb_imm_snap_acquire(cf); + + if (imm_snap) + { + const size_t immutable_count = atomic_load_explicit(&imm_snap->count, memory_order_acquire); + int result = TDB_ERR_UNKNOWN; + + /* we search in reverse order (newest first) to find most recent version */ + for (int i = (int)immutable_count - 1; i >= 0; i--) + { + const tidesdb_immutable_memtable_t *immutable = + (const tidesdb_immutable_memtable_t *)imm_snap->items[i]; + if (immutable && immutable->skip_list) + { + if (skip_list_get_with_seq_ref( + immutable->skip_list, key, key_size, &temp_value, &temp_value_size, &ttl, + &deleted, &found_seq, snapshot_seq, visibility_check, + visibility_check ? txn->db->commit_status : NULL) == 0) + { + if (deleted) + { + result = TDB_ERR_NOT_FOUND; + break; + } + + if (ttl <= 0 || ttl > now) + { + *value = malloc(temp_value_size); + if (*value == NULL) + { + result = TDB_ERR_MEMORY; + break; + } + memcpy(*value, temp_value, temp_value_size); + *value_size = temp_value_size; + PROFILE_INC(txn->db, immutable_hits); + tidesdb_txn_add_to_read_set(txn, cf, key, key_size, found_seq); + result = TDB_SUCCESS; + break; + } + result = TDB_ERR_NOT_FOUND; + break; + } + } + } + + tidesdb_imm_snap_release(imm_snap); + + if (result != TDB_ERR_UNKNOWN) return result; + } + +unified_sst_search:; + int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + for (int level_num = 0; level_num < num_levels; level_num++) + { + int retry_backoff = TDB_SST_RETRY_INITIAL_SPINS; + int level_retries = 0; + retry_level: + PROFILE_INC(txn->db, levels_searched); + tidesdb_level_t *level = cf->levels[level_num]; + + atomic_fetch_add_explicit(&level->array_readers, 1, memory_order_acq_rel); + + tidesdb_sstable_t **sstables = atomic_load_explicit(&level->sstables, memory_order_acquire); + int num_ssts = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + + /** we re-load count to detect concurrent remove that swapped array but 
hasnt updated count + * yet + */ + int num_ssts_recheck = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + if (num_ssts_recheck < num_ssts) + { + num_ssts = num_ssts_recheck; + } + + /* we also verify array hasnt changed (handles add-with-resize race) */ + tidesdb_sstable_t **sstables_check = + atomic_load_explicit(&level->sstables, memory_order_acquire); + if (sstables_check != sstables) + { + /* the array was resized, we reload everything */ + sstables = sstables_check; + num_ssts = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + } + + /* L1+ point reads scan every sstable in the level (bloom-filtered). + * a binary-search "pick one sstable" fast path is unsafe here, it relied + * on level->file_boundaries, which is a compaction scratch field holding + * the NEXT level's min-keys (see tidesdb_level_update_boundaries), not + * this level's own boundaries -- and during a compaction add-then-remove + * window a level transiently holds overlapping sstables, so more than one + * can cover the key. only a full scan keeping the highest seq is correct. 
*/ + + uint64_t best_seq = 0; + uint8_t *best_value = NULL; + size_t best_value_size = 0; + int best_is_dead = 0; + int best_found = 0; + int scan_error = 0; /* set if an sstable could not be opened/read (incomplete scan) */ + + const int scan_start = num_ssts - 1; + const int scan_end = 0; + + for (int j = scan_start; j >= scan_end; j--) + { + tidesdb_sstable_t *sst = sstables[j]; + if (!sst) continue; + + PROFILE_INC(txn->db, sstables_checked); + + /*** we try to take ref for ssts we will check + ** we use try_ref to safely handle concurrent removal -- if refcount is 0, + * the sstable is being freed and we must skip it + ********************************************************************** + *** when try_ref fails, the array may have been swapped with a new one + * containing the merged sstable, so we must retry the entire level */ + if (!tidesdb_sstable_try_ref(sst)) + { + /* we check if array was actually swapped by compaction */ + tidesdb_sstable_t **current_sstables = + atomic_load_explicit(&level->sstables, memory_order_acquire); + + if (current_sstables != sstables) + { + if (level_retries < TDB_SST_RETRY_MAX_LEVEL_RETRIES) + { + /** array was swapped! we retry with fresh array (bounded) + * reset best-match state since old array is gone */ + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + level_retries++; + + if (best_value) + { + free(best_value); + best_value = NULL; + } + best_found = 0; + best_seq = 0; + + for (int b = 0; b < retry_backoff; b++) cpu_pause(); + if (retry_backoff < TDB_SST_RETRY_MAX_SPINS) retry_backoff <<= 1; + + goto retry_level; + } + + /**** retries exhausted but array was swapped. we restart with the + *** current array to avoid using stale sstable pointers. 
we reset + ** retry counter but only allow one restart to prevent infinite + * loops under pathological compactions */ + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + + if (best_value) + { + free(best_value); + best_value = NULL; + } + best_found = 0; + best_seq = 0; + level_retries = TDB_SST_RETRY_MAX_LEVEL_RETRIES - 1; + + goto retry_level; + } + + /* array unchanged but try_ref failed -- the sstable is still live in this level + * (a removal swaps the array first, caught above), so this is a transient reaper + * eviction. it may hold the sole copy of the key; back off retryably, never skip. + */ + scan_error = TDB_ERR_BUSY; + break; + } + + /** we use per-sstable max_seq as upper bound, essentially if the highest seq in this + * sstable cannot beat our current best, skip the expensive lookup */ + if (best_found && sst->max_seq <= best_seq) + { + tidesdb_sstable_unref(cf->db, sst); + continue; + } + + /* reader fd budget -- don't open past the max_open cap; back off with a retryable error + * rather than starving the write path. an already-open sstable is never blocked, and + * the reaper keeps idle sstables below the cap so a point-get normally has headroom + * (see helper). */ + if (!tidesdb_reader_fd_budget_ok(cf->db, sst)) + { + tidesdb_sstable_unref(cf->db, sst); + scan_error = TDB_ERR_BUSY; + break; + } + + tidesdb_kv_pair_t *candidate_kv = NULL; + int get_result = + tidesdb_sstable_get(cf->db, sst, key, key_size, snapshot_seq, &candidate_kv, 0); + + if (get_result == TDB_SUCCESS && candidate_kv) + { + const uint64_t candidate_seq = candidate_kv->entry.seq; + const int accept = + (snapshot_seq == UINT64_MAX) ? 
1 : (candidate_seq <= snapshot_seq); + + if (accept && candidate_seq > best_seq) + { + const int is_tombstone = + (candidate_kv->entry.flags & TDB_KV_FLAG_TOMBSTONE) != 0; + const int ttl_ok = + (candidate_kv->entry.ttl <= 0 || candidate_kv->entry.ttl > now); + + if (best_value) + { + free(best_value); + best_value = NULL; + } + + best_seq = candidate_seq; + best_is_dead = is_tombstone || !ttl_ok; + best_found = 1; + + if (!best_is_dead) + { + best_value = malloc(candidate_kv->entry.value_size); + if (best_value) + { + memcpy(best_value, candidate_kv->value, candidate_kv->entry.value_size); + best_value_size = candidate_kv->entry.value_size; + } + } + } + + tidesdb_kv_pair_free(candidate_kv); + } + + tidesdb_sstable_unref(cf->db, sst); + + /* a non-found, non-success return means this sstable could not be opened or read + * (e.g. EMFILE under fd pressure, or an IO error). the scan is therefore incomplete -- + * a newer version of the key may live in the sstable we just failed on -- so we must + * NOT fall through and treat it as "not present" (which would return a stale version or + * a false not-found). surface the error and let the caller retry once fds free. */ + if (get_result != TDB_SUCCESS && get_result != TDB_ERR_NOT_FOUND) + { + scan_error = get_result; + break; + } + } + + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + + if (scan_error) + { + if (best_value) free(best_value); + return scan_error; + } + + if (best_found) + { + PROFILE_INC(txn->db, sstable_hits); + + if (!best_is_dead && best_value) + { + *value = best_value; + *value_size = best_value_size; + tidesdb_txn_add_to_read_set(txn, cf, key, key_size, best_seq); + return TDB_SUCCESS; + } + + if (best_value) free(best_value); + return (!best_is_dead) ? TDB_ERR_MEMORY : TDB_ERR_NOT_FOUND; + } + } + + return TDB_ERR_NOT_FOUND; +} + +/** + * tidesdb_txn_delete_internal + * shared implementation for tidesdb_txn_delete and tidesdb_txn_single_delete. 
+ * @param txn transaction handle + * @param cf column family to delete from + * @param key key to delete + * @param key_size size of key + * @param is_single_delete 1 for single-delete semantics, 0 for a regular delete + * @return 0 on success, -n on failure + */ +static int tidesdb_txn_delete_internal(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, + const uint8_t *key, const size_t key_size, + const int is_single_delete) +{ + if (!txn || !cf || !key || key_size == 0) return TDB_ERR_INVALID_ARGS; + + /* we wait for database to finish opening, or fail if shutting down */ + if (!txn->db) return TDB_ERR_INVALID_ARGS; + + if (atomic_load_explicit(&txn->db->replica_mode, memory_order_relaxed)) return TDB_ERR_READONLY; + + if (txn->is_committed || txn->is_aborted) return TDB_ERR_INVALID_ARGS; + + /* we add CF to transaction if not already added */ + const int cf_index = tidesdb_txn_add_cf_internal(txn, cf); + if (cf_index < 0) return TDB_ERR_MEMORY; + + if (txn->num_ops >= TDB_MAX_TXN_OPS) + { + return TDB_ERR_TOO_LARGE; + } + + /* we expand ops array if needed */ + if (txn->num_ops >= txn->ops_capacity) + { + int new_capacity = txn->ops_capacity * 2; + + if (new_capacity > TDB_MAX_TXN_OPS) new_capacity = TDB_MAX_TXN_OPS; + + if (new_capacity <= txn->ops_capacity) return TDB_ERR_TOO_LARGE; + + tidesdb_txn_op_t *new_ops = realloc(txn->ops, new_capacity * sizeof(tidesdb_txn_op_t)); + if (!new_ops) return TDB_ERR_MEMORY; + + txn->ops = new_ops; + txn->ops_capacity = new_capacity; + } + + tidesdb_txn_op_t *op = &txn->ops[txn->num_ops]; + memset(op, 0, sizeof(tidesdb_txn_op_t)); + + op->key = malloc(key_size); + if (!op->key) return TDB_ERR_MEMORY; + memcpy(op->key, key, key_size); + op->key_size = key_size; + + op->value = NULL; + op->value_size = 0; + op->ttl = 0; + op->is_delete = 1; + op->is_single_delete = is_single_delete; + op->cf = cf; + + txn->num_ops++; + + /* account this op's key buffer (value_size is 0 for deletes) */ + txn->mem_bytes += (int64_t)(op->key_size 
+ op->value_size); + tidesdb_txn_mem_publish(txn); + + /* we create hash table when we cross threshold for O(1) lookups */ + if (txn->num_ops == TDB_TXN_WRITE_HASH_THRESHOLD && !txn->write_set_hash) + { + txn->write_set_hash = tidesdb_write_set_hash_create(); + if (txn->write_set_hash) + { + /* we populate hash with all existing operations */ + for (int i = 0; i < txn->num_ops; i++) + { + tidesdb_write_set_hash_insert((tidesdb_write_set_hash_t *)txn->write_set_hash, txn, + i); + } + } + } + else if (txn->write_set_hash) + { + /* we add new operation to existing hash */ + tidesdb_write_set_hash_insert((tidesdb_write_set_hash_t *)txn->write_set_hash, txn, + txn->num_ops - 1); + } + + return TDB_SUCCESS; +} + +int tidesdb_txn_delete(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, const uint8_t *key, + const size_t key_size) +{ + return tidesdb_txn_delete_internal(txn, cf, key, key_size, 0); +} + +int tidesdb_txn_single_delete(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, const uint8_t *key, + const size_t key_size) +{ + return tidesdb_txn_delete_internal(txn, cf, key, key_size, 1); +} + +int tidesdb_txn_rollback(tidesdb_txn_t *txn) +{ + if (!txn || txn->is_committed) return TDB_ERR_INVALID_ARGS; + + /* we remove from active list if SERIALIZABLE */ + tidesdb_txn_remove_from_active_list(txn); + + /* we mark as aborted; operations never applied */ + txn->is_aborted = 1; + return TDB_SUCCESS; +} + +void tidesdb_txn_free(tidesdb_txn_t *txn) +{ + if (!txn) return; + + /* defensive remove in case the caller frees without committing or rolling back. 
+ * leaving a freed pointer in active_txns lets compaction or SSI dereference it */ + tidesdb_txn_remove_from_active_list(txn); + + /* return whatever this txn published to the global counter so it nets to baseline */ + if (txn->db && txn->mem_published) + atomic_fetch_sub_explicit(&txn->db->txn_memory_bytes, txn->mem_published, + memory_order_relaxed); + + for (int i = 0; i < txn->num_ops; i++) + { + free(txn->ops[i].key); /* coalesced buffer owns key+value */ + } + free(txn->ops); + for (int i = 0; i < txn->read_key_arena_count; i++) + { + free(txn->read_key_arenas[i]); + } + free(txn->read_key_arenas); + free(txn->read_keys); + free(txn->read_key_sizes); + free(txn->read_seqs); + free(txn->read_cfs); + + if (txn->write_set_hash) + { + tidesdb_write_set_hash_free((tidesdb_write_set_hash_t *)txn->write_set_hash); + } + if (txn->read_set_hash) + { + tidesdb_read_set_hash_free((tidesdb_read_set_hash_t *)txn->read_set_hash); + } + + for (int i = 0; i < txn->num_savepoints; i++) + { + free(txn->savepoint_names[i]); + } + free(txn->savepoint_op_counts); + free(txn->savepoint_cf_counts); + free(txn->savepoint_names); + + free(txn->cfs); + free(txn); +} + +int tidesdb_txn_reset(tidesdb_txn_t *txn, const tidesdb_isolation_level_t isolation) +{ + if (!txn || !txn->db) return TDB_ERR_INVALID_ARGS; + if (!txn->is_committed && !txn->is_aborted) return TDB_ERR_INVALID_ARGS; + + if (isolation < TDB_ISOLATION_READ_UNCOMMITTED || isolation > TDB_ISOLATION_SERIALIZABLE) + { + return TDB_ERR_INVALID_ARGS; + } + + const int wait_result = wait_for_open(txn->db); + if (wait_result != TDB_SUCCESS) + { + return wait_result; + } + + /* remove from the active list if the OLD isolation had registered it. registration + * happens for any isolation >= REPEATABLE_READ (see txn create / re-register), so the + * removal condition must match -- a == SERIALIZABLE guard here leaves a stale entry + * for an RR/SNAPSHOT txn that re-registration then duplicates (later a dangling ptr). 
+ * tidesdb_txn_remove_from_active_list self-guards on < REPEATABLE_READ. */ + if (txn->isolation_level >= TDB_ISOLATION_REPEATABLE_READ) + { + tidesdb_txn_remove_from_active_list(txn); + } + + /* we free op key/value data but keep the ops array itself */ + for (int i = 0; i < txn->num_ops; i++) + { + free(txn->ops[i].key); /* coalesced buffer owns key+value */ + txn->ops[i].key = NULL; + txn->ops[i].value = NULL; + } + txn->num_ops = 0; + + /* we reset read set but keep arrays allocated, we also free arena buffers to avoid leaks */ + txn->read_set_count = 0; + + /* we free individual arena buffers but keep the pointer array for reuse */ + for (int i = 0; i < txn->read_key_arena_count; i++) + { + free(txn->read_key_arenas[i]); + txn->read_key_arenas[i] = NULL; + } + txn->read_key_arena_count = 0; + txn->read_key_arena_used = 0; + + /* return this txn's published memory to the global counter and reset the accumulator */ + if (txn->mem_published) + atomic_fetch_sub_explicit(&txn->db->txn_memory_bytes, txn->mem_published, + memory_order_relaxed); + txn->mem_bytes = 0; + txn->mem_published = 0; + + /* we allocate read set arrays if switching to isolation that needs read tracking */ + if ((isolation == TDB_ISOLATION_REPEATABLE_READ || isolation == TDB_ISOLATION_SERIALIZABLE) && + !txn->read_keys) + { + txn->read_set_capacity = TDB_INITIAL_TXN_READ_SET_CAPACITY; + txn->read_keys = calloc(txn->read_set_capacity, sizeof(uint8_t *)); + txn->read_key_sizes = calloc(txn->read_set_capacity, sizeof(size_t)); + txn->read_seqs = calloc(txn->read_set_capacity, sizeof(uint64_t)); + txn->read_cfs = calloc(txn->read_set_capacity, sizeof(tidesdb_column_family_t *)); + + if (!txn->read_keys || !txn->read_key_sizes || !txn->read_seqs || !txn->read_cfs) + { + return TDB_ERR_MEMORY; + } + } + + /* we free hash tables; they contain stale indices. 
will be rebuilt lazily */ + if (txn->write_set_hash) + { + tidesdb_write_set_hash_free((tidesdb_write_set_hash_t *)txn->write_set_hash); + txn->write_set_hash = NULL; + } + if (txn->read_set_hash) + { + tidesdb_read_set_hash_free((tidesdb_read_set_hash_t *)txn->read_set_hash); + txn->read_set_hash = NULL; + } + + /* we free any savepoints */ + for (int i = 0; i < txn->num_savepoints; i++) + { + free(txn->savepoint_names[i]); + } + txn->num_savepoints = 0; + + /* we reset cf tracking */ + txn->num_cfs = 0; + txn->last_cf = NULL; + txn->last_cf_index = 0; + + /* we assign fresh transaction identity */ + txn->isolation_level = isolation; + txn->txn_id = atomic_fetch_add_explicit(&txn->db->next_txn_id, 1, memory_order_relaxed); + + if (isolation == TDB_ISOLATION_READ_UNCOMMITTED) + { + txn->snapshot_seq = UINT64_MAX; + } + else if (isolation == TDB_ISOLATION_READ_COMMITTED) + { + txn->snapshot_seq = 0; + } + else + { + uint64_t current_seq = atomic_load_explicit(&txn->db->global_seq, memory_order_acquire); + txn->snapshot_seq = (current_seq > 0) ? 
current_seq - 1 : 0; + } + + txn->commit_seq = 0; + txn->is_committed = 0; + txn->is_aborted = 0; + txn->has_rw_conflict_in = 0; + txn->has_rw_conflict_out = 0; + + /* we re-register in active list if the new isolation fixes a snapshot */ + if (isolation >= TDB_ISOLATION_REPEATABLE_READ) + { + pthread_rwlock_wrlock(&txn->db->active_txns_lock); + + if (txn->db->num_active_txns < txn->db->active_txns_capacity) + { + txn->db->active_txns[txn->db->num_active_txns++] = txn; + } + else + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Active transaction list full (%d), SSI and snapshot retention may be " + "less effective", + txn->db->active_txns_capacity); + } + + pthread_rwlock_unlock(&txn->db->active_txns_lock); + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_txn_check_seq_conflict + * check sequence conflicts in memtable/immutable + * @param sl skip list to check + * @param key key to check + * @param key_size key size + * @param threshold_seq threshold sequence + * @return 1 if conflict, 0 if no conflict + */ +static int tidesdb_txn_check_seq_conflict(skip_list_t *sl, const uint8_t *key, + const size_t key_size, const uint64_t threshold_seq) +{ + if (!sl) return 0; + + uint64_t found_seq = 0; + if (skip_list_get_max_seq(sl, key, key_size, &found_seq) == 0) + { + return (found_seq > threshold_seq) ? 
1 : 0; + } + return 0; +} + +/** + * tidesdb_txn_get_imm_snapshot + * get immutable memtable snapshot with refcounting + * @param cf column family to get snapshot for + * @param out_count output parameter for number of immutable memtables + * @return immutable memtable references + */ +static tidesdb_immutable_memtable_t **tidesdb_txn_get_imm_snapshot( + const tidesdb_column_family_t *cf, size_t *out_count) +{ + return tidesdb_snapshot_immutable_memtables((tidesdb_column_family_t *)cf, out_count); +} + +/** + * tidesdb_txn_cleanup_imm_snapshot + * cleanup immutable memtable snapshot + * @param imm_refs immutable memtable references + * @param imm_count number of immutable memtables + */ +static void tidesdb_txn_cleanup_imm_snapshot(tidesdb_immutable_memtable_t **imm_refs, + const size_t imm_count) +{ + if (!imm_refs) return; + for (size_t i = 0; i < imm_count; i++) + { + if (imm_refs[i]) tidesdb_immutable_memtable_unref(imm_refs[i]); + } + free(imm_refs); +} + +/** + * tidesdb_txn_check_sstable_conflict + * check if any sstable in the column family has a newer version of the key + * @param db database handle + * @param cf column family to check + * @param key key to check + * @param key_size key size + * @param threshold_seq threshold sequence + * @return 1 if conflict, 0 if no conflict + */ +static int tidesdb_txn_check_sstable_conflict(tidesdb_t *db, tidesdb_column_family_t *cf, + const uint8_t *key, const size_t key_size, + const uint64_t threshold_seq) +{ + if (!db || !cf) return 0; + + /*** we track highest sequence found across all ssts + ** in L1 (levels[0]), ssts can overlap and newer ones are appended at the end + * we must check all ssts to find the true highest sequence for this key */ + uint64_t max_found_seq = 0; + int found_any = 0; + + int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + for (int level_idx = 0; level_idx < num_levels; level_idx++) + { + tidesdb_level_t *level = cf->levels[level_idx]; + if (!level) 
continue; + + /** we load array pointer and count with careful ordering to handle concurrent modifications + * re-load count to detect concurrent remove, use minimum to avoid OOB */ + atomic_fetch_add_explicit(&level->array_readers, 1, memory_order_acq_rel); + + tidesdb_sstable_t **sstables = atomic_load_explicit(&level->sstables, memory_order_acquire); + int num_sstables = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + + /* we re-load count to detect concurrent remove */ + int num_sstables_recheck = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + if (num_sstables_recheck < num_sstables) num_sstables = num_sstables_recheck; + + /* we verify array hasnt changed (handles add-with-resize race) */ + tidesdb_sstable_t **sstables_check = + atomic_load_explicit(&level->sstables, memory_order_acquire); + if (sstables_check != sstables) + { + sstables = sstables_check; + num_sstables = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + } + + const int start = (level_idx == 0) ? num_sstables - 1 : 0; + const int end = (level_idx == 0) ? -1 : num_sstables; + const int step = (level_idx == 0) ? 
-1 : 1; + + for (int sst_idx = start; sst_idx != end; sst_idx += step) + { + tidesdb_sstable_t *sst = sstables[sst_idx]; + if (!sst) continue; + + /* if the highest sequence in this sstable predates our snapshot, + * no entry in it can conflict -- skip without ref, bloom, or I/O */ + if (sst->max_seq <= threshold_seq) continue; + + /* we try to take ref to safely handle concurrent removal */ + if (!tidesdb_sstable_try_ref(sst)) + { + continue; /* sstable is being freed, skip it */ + } + + uint64_t found_seq = 0; + if (tidesdb_sstable_get_seq(db, sst, key, key_size, &found_seq) == TDB_SUCCESS) + { + found_any = 1; + if (found_seq > max_found_seq) + { + max_found_seq = found_seq; + } + if (found_seq > threshold_seq) + { + tidesdb_sstable_unref(db, sst); + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + return 1; + } + } + + tidesdb_sstable_unref(db, sst); + } + + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + } + + /** conflict if we found any version with seq > threshold.. */ + return (found_any && max_found_seq > threshold_seq) ? 
1 : 0; +} + +/** + * tidesdb_txn_check_key_conflict + * unified conflict check for a single key against memtable, immutables, and sstables + * @param txn transaction + * @param cf column family + * @param key key to check + * @param key_size key size + * @param threshold_seq sequence threshold for conflict detection + * @param imm_refs cached immutable refs (will be refreshed if cf changes) + * @param imm_count count of immutable refs + * @param last_cf pointer to last CF checked (for caching) + * @return TDB_SUCCESS if no conflict, TDB_ERR_CONFLICT if conflict detected + */ +static int tidesdb_txn_check_key_conflict(const tidesdb_txn_t *txn, tidesdb_column_family_t *cf, + const uint8_t *key, const size_t key_size, + const uint64_t threshold_seq, + tidesdb_immutable_memtable_t ***imm_refs, + size_t *imm_count, tidesdb_column_family_t **last_cf) +{ + /* we refresh imm snapshot only when CF changes */ + if (cf != *last_cf) + { + if (*imm_refs) tidesdb_txn_cleanup_imm_snapshot(*imm_refs, *imm_count); + *imm_refs = tidesdb_txn_get_imm_snapshot(cf, imm_count); + *last_cf = cf; + } + + /* we check per-CF active memtable */ + tidesdb_memtable_t *mt = NULL; + int mt_refed = + tidesdb_active_memtable_try_ref(&cf->active_mt_readers, &cf->active_memtable, &mt); + + if (mt_refed && tidesdb_txn_check_seq_conflict(mt->skip_list, key, key_size, threshold_seq)) + { + tidesdb_immutable_memtable_unref(mt); + return TDB_ERR_CONFLICT; + } + if (mt_refed) tidesdb_immutable_memtable_unref(mt); + + /* we check unified memtable if enabled (data lives there, not in per-CF memtable) */ + if (txn->db->unified_mt.enabled) + { + tidesdb_memtable_t *umt = NULL; + const int umt_refed = tidesdb_active_memtable_try_ref( + &txn->db->unified_mt.active_mt_readers, &txn->db->unified_mt.active, &umt); + if (umt_refed) + { + /* we build prefixed key for unified skip list lookup */ + uint8_t pk_stack[TDB_PREFIXED_KEY_STACK_MAX]; + const size_t pk_total = TDB_UNIFIED_CF_PREFIX_SIZE + key_size; + uint8_t 
*pk = pk_total <= sizeof(pk_stack) ? pk_stack : malloc(pk_total); + if (pk) + { + const size_t pk_size = + tdb_build_prefixed_key(cf->unified_cf_index, key, key_size, pk); + if (tidesdb_txn_check_seq_conflict(umt->skip_list, pk, pk_size, threshold_seq)) + { + if (pk != pk_stack) free(pk); + tidesdb_immutable_memtable_unref(umt); + return TDB_ERR_CONFLICT; + } + if (pk != pk_stack) free(pk); + } + tidesdb_immutable_memtable_unref(umt); + } + } + + for (size_t i = 0; i < *imm_count; i++) + { + if (tidesdb_txn_check_seq_conflict((*imm_refs)[i]->skip_list, key, key_size, threshold_seq)) + { + return TDB_ERR_CONFLICT; + } + } + + if (tidesdb_txn_check_sstable_conflict(txn->db, cf, key, key_size, threshold_seq)) + { + return TDB_ERR_CONFLICT; + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_txn_check_read_conflicts + * check read-set for conflicts (repeatable read and higher) + * @param txn transaction to check + * @return TDB_SUCCESS if no conflicts, TDB_ERR_CONFLICT otherwise + */ +static int tidesdb_txn_check_read_conflicts(const tidesdb_txn_t *txn) +{ + if ((txn->isolation_level != TDB_ISOLATION_REPEATABLE_READ && + txn->isolation_level != TDB_ISOLATION_SERIALIZABLE) || + txn->read_set_count == 0) + { + return TDB_SUCCESS; + } + + tidesdb_column_family_t *last_cf = NULL; + tidesdb_immutable_memtable_t **imm_refs = NULL; + size_t imm_count = 0; + + for (int r = 0; r < txn->read_set_count; r++) + { + const int result = tidesdb_txn_check_key_conflict(txn, txn->read_cfs[r], txn->read_keys[r], + txn->read_key_sizes[r], txn->read_seqs[r], + &imm_refs, &imm_count, &last_cf); + + if (result != TDB_SUCCESS) + { + if (imm_refs) tidesdb_txn_cleanup_imm_snapshot(imm_refs, imm_count); + return result; + } + } + + if (imm_refs) tidesdb_txn_cleanup_imm_snapshot(imm_refs, imm_count); + return TDB_SUCCESS; +} + +/** + * tidesdb_txn_check_write_conflicts + * check write-set for conflicts (snapshot isolation and higher) + * @param txn transaction to check + * @return TDB_SUCCESS if no 
conflicts, TDB_ERR_CONFLICT otherwise + */ +static int tidesdb_txn_check_write_conflicts(const tidesdb_txn_t *txn) +{ + if (txn->isolation_level < TDB_ISOLATION_SNAPSHOT || txn->num_ops == 0) + { + return TDB_SUCCESS; + } + + tidesdb_column_family_t *last_cf = NULL; + tidesdb_immutable_memtable_t **imm_refs = NULL; + size_t imm_count = 0; + + for (int w = 0; w < txn->num_ops; w++) + { + const tidesdb_txn_op_t *op = &txn->ops[w]; + + const int result = tidesdb_txn_check_key_conflict( + txn, op->cf, op->key, op->key_size, txn->snapshot_seq, &imm_refs, &imm_count, &last_cf); + + if (result != TDB_SUCCESS) + { + if (imm_refs) tidesdb_txn_cleanup_imm_snapshot(imm_refs, imm_count); + return result; + } + } + + if (imm_refs) tidesdb_txn_cleanup_imm_snapshot(imm_refs, imm_count); + return TDB_SUCCESS; +} + +/** + * tidesdb_txn_check_ssi_conflicts + * check serializable snapshot isolation conflicts + * @param txn transaction to check + * @return TDB_SUCCESS if no conflicts, TDB_ERR_CONFLICT otherwise + */ +static int tidesdb_txn_check_ssi_conflicts(tidesdb_txn_t *txn) +{ + if (txn->isolation_level != TDB_ISOLATION_SERIALIZABLE) + { + return TDB_SUCCESS; + } + + /**** we hold rdlock for the entire iteration to prevent other threads from + *** removing and freeing their transactions while we dereference them. + ** removal from active list requires wrlock, so all pointers in the + * array remain valid while we hold rdlock. */ + pthread_rwlock_rdlock(&txn->db->active_txns_lock); + const int count = txn->db->num_active_txns; + tidesdb_txn_t **active = txn->db->active_txns; + + /* we detect rw-conflicts. 
the active list now also holds REPEATABLE_READ and SNAPSHOT + * txns for the compaction snapshot floor, but SSI conflicts only involve other + * SERIALIZABLE peers so we skip the rest */ + for (int i = 0; i < count; i++) + { + tidesdb_txn_t *other = active[i]; + if (other == txn || other->is_committed || other->is_aborted) continue; + if (other->isolation_level != TDB_ISOLATION_SERIALIZABLE) continue; + + if (txn->read_set_hash && txn->read_set_count >= TDB_TXN_READ_HASH_THRESHOLD) + { + for (int w = 0; w < other->num_ops && !txn->has_rw_conflict_out; w++) + { + const tidesdb_txn_op_t *op = &other->ops[w]; + if (tidesdb_read_set_hash_check_conflict( + (tidesdb_read_set_hash_t *)txn->read_set_hash, txn, op->cf, op->key, + op->key_size)) + { + txn->has_rw_conflict_out = 1; + other->has_rw_conflict_in = 1; + break; + } + } + } + else + { + for (int r = 0; r < txn->read_set_count && !txn->has_rw_conflict_out; r++) + { + for (int w = 0; w < other->num_ops; w++) + { + const tidesdb_txn_op_t *op = &other->ops[w]; + if (txn->read_key_sizes[r] == op->key_size && txn->read_cfs[r] == op->cf && + memcmp(txn->read_keys[r], op->key, op->key_size) == 0) + { + txn->has_rw_conflict_out = 1; + other->has_rw_conflict_in = 1; + break; + } + } + } + } + } + + /* we check for dangerous structures */ + int conflict = (txn->has_rw_conflict_in && txn->has_rw_conflict_out); + + if (!conflict && txn->num_ops > 0) + { + for (int i = 0; i < count && !conflict; i++) + { + const tidesdb_txn_t *other = active[i]; + if (other == txn || other->is_committed || other->is_aborted || + !other->has_rw_conflict_in || !other->has_rw_conflict_out) + { + continue; + } + + for (int w = 0; w < txn->num_ops && !conflict; w++) + { + const tidesdb_txn_op_t *op = &txn->ops[w]; + for (int r = 0; r < other->read_set_count; r++) + { + if (op->key_size == other->read_key_sizes[r] && op->cf == other->read_cfs[r] && + memcmp(op->key, other->read_keys[r], op->key_size) == 0) + { + conflict = 1; + break; + } + } + } + } 
+ } + + /* we release rdlock before taking wrlock in remove_from_active_list */ + pthread_rwlock_unlock(&txn->db->active_txns_lock); + + if (conflict) + { + tidesdb_txn_remove_from_active_list(txn); + return TDB_ERR_CONFLICT; + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_txn_apply_ops_to_memtable + * apply transaction operations to a memtable with deduplication + * @param txn transaction + * @param cf column family + * @param memtable skip list to apply to + * @return TDB_SUCCESS on success, error code on failure + */ +static int tidesdb_txn_apply_ops_to_memtable(const tidesdb_txn_t *txn, + const tidesdb_column_family_t *cf, + skip_list_t *memtable) +{ + /* we count ops for this CF */ + int cf_op_count = 0; + for (int i = 0; i < txn->num_ops; i++) + { + if (txn->ops[i].cf == cf) cf_op_count++; + } + + if (cf_op_count == 0) return TDB_SUCCESS; + + if (cf_op_count == 1) + { + /* single-op we skip dedup and batch overhead entirely */ + for (int i = txn->num_ops - 1; i >= 0; i--) + { + if (txn->ops[i].cf == cf) + { + const tidesdb_txn_op_t *op = &txn->ops[i]; + return skip_list_put_with_seq(memtable, op->key, op->key_size, op->value, + op->value_size, op->ttl, txn->commit_seq, + op->is_delete) == 0 + ? 
TDB_SUCCESS + : TDB_ERR_MEMORY; + } + } + return TDB_SUCCESS; + } + + if (cf_op_count < TDB_TXN_DEDUP_SKIP_THRESHOLD) + { + /* we build a small batch on the stack after dedup filtering + * skip_list_put_batch benefits from sorted-key hints and batched atomic updates */ + skip_list_batch_entry_t stack_batch[TDB_TXN_DEDUP_SKIP_THRESHOLD]; + int batch_idx = 0; + + for (int i = txn->num_ops - 1; i >= 0; i--) + { + const tidesdb_txn_op_t *op = &txn->ops[i]; + if (op->cf != cf) continue; + + /* we check if this key appears later (newer version exists) */ + int is_superseded = 0; + for (int j = i + 1; j < txn->num_ops; j++) + { + const tidesdb_txn_op_t *later_op = &txn->ops[j]; + if (later_op->cf == cf && later_op->key_size == op->key_size && + memcmp(later_op->key, op->key, op->key_size) == 0) + { + is_superseded = 1; + break; + } + } + if (is_superseded) continue; + + stack_batch[batch_idx].key = op->key; + stack_batch[batch_idx].key_size = op->key_size; + stack_batch[batch_idx].value = op->value; + stack_batch[batch_idx].value_size = op->value_size; + stack_batch[batch_idx].ttl = op->ttl; + stack_batch[batch_idx].seq = txn->commit_seq; + stack_batch[batch_idx].flags = tidesdb_txn_op_sl_flags(op); + batch_idx++; + } + + if (batch_idx > 0) + { + if (skip_list_put_batch(memtable, stack_batch, batch_idx) < 0) + { + return TDB_ERR_MEMORY; + } + } + return TDB_SUCCESS; + } + + int dedup_hash_size = cf_op_count * TDB_TXN_DEDUP_HASH_MULTIPLIER; + if (dedup_hash_size < TDB_TXN_DEDUP_MIN_HASH_SIZE) + dedup_hash_size = TDB_TXN_DEDUP_MIN_HASH_SIZE; + + /** + * dedup_entry_t + * hash table entry for transaction operation deduplication (last-write-wins) + * @param key pointer to the key data (borrowed from txn op, not owned) + * @param key_size size of key in bytes + * @param op_idx index into txn->ops of the newest operation for this key + */ + typedef struct + { + uint8_t *key; + size_t key_size; + int op_idx; + } dedup_entry_t; + + dedup_entry_t *dedup_hash = 
calloc(dedup_hash_size, sizeof(dedup_entry_t)); + + int *used_slots = NULL; + const int used_slots_capacity = cf_op_count < TDB_TXN_DEDUP_MAX_TRACKED ? cf_op_count : 0; + if (used_slots_capacity > 0) + { + used_slots = malloc(used_slots_capacity * sizeof(int)); + } + + if (!dedup_hash) + { + /* the fallback is to write all ops without dedup */ + free(used_slots); + for (int i = 0; i < txn->num_ops; i++) + { + const tidesdb_txn_op_t *op = &txn->ops[i]; + if (op->cf != cf) continue; + if (skip_list_put_with_seq(memtable, op->key, op->key_size, op->value, op->value_size, + op->ttl, txn->commit_seq, tidesdb_txn_op_sl_flags(op)) != 0) + { + return TDB_ERR_MEMORY; + } + } + return TDB_SUCCESS; + } + + int used_slot_count = 0; + /* we build hash table from newest to oldest (reverse order) */ + for (int i = txn->num_ops - 1; i >= 0; i--) + { + const tidesdb_txn_op_t *op = &txn->ops[i]; + if (op->cf != cf) continue; + + const uint32_t hash = XXH32(op->key, op->key_size, TDB_TXN_HASH_SEED); + int slot = (int)(hash % (uint32_t)dedup_hash_size); + + /* we utilize linear probing to find empty slot or matching key */ + int inserted = 0; + int is_duplicate = 0; + for (int probe = 0; probe < dedup_hash_size; probe++) + { + if (dedup_hash[slot].key == NULL) + { + dedup_hash[slot].key = op->key; + dedup_hash[slot].key_size = op->key_size; + dedup_hash[slot].op_idx = i; + inserted = 1; + if (used_slots && used_slot_count < used_slots_capacity) + { + used_slots[used_slot_count++] = slot; + } + break; + } + if (dedup_hash[slot].key_size == op->key_size && + memcmp(dedup_hash[slot].key, op->key, op->key_size) == 0) + { + is_duplicate = 1; + break; + } + slot = (slot + 1) % dedup_hash_size; + } + + if (!inserted && !is_duplicate) + { + if (skip_list_put_with_seq(memtable, op->key, op->key_size, op->value, op->value_size, + op->ttl, txn->commit_seq, tidesdb_txn_op_sl_flags(op)) != 0) + { + free(dedup_hash); + free(used_slots); + return TDB_ERR_MEMORY; + } + } + } + + int result = 
TDB_SUCCESS; + const int dedup_count = used_slots ? used_slot_count : cf_op_count; + + if (dedup_count >= TDB_MAX_TXN_OPS_BEFORE_BATCH) + { + skip_list_batch_entry_t *batch_entries = + malloc(dedup_count * sizeof(skip_list_batch_entry_t)); + if (!batch_entries) + { + free(dedup_hash); + free(used_slots); + return TDB_ERR_MEMORY; + } + + int batch_idx = 0; + if (used_slots && used_slot_count > 0) + { + for (int i = 0; i < used_slot_count; i++) + { + const int slot = used_slots[i]; + const tidesdb_txn_op_t *op = &txn->ops[dedup_hash[slot].op_idx]; + batch_entries[batch_idx].key = op->key; + batch_entries[batch_idx].key_size = op->key_size; + batch_entries[batch_idx].value = op->value; + batch_entries[batch_idx].value_size = op->value_size; + batch_entries[batch_idx].ttl = op->ttl; + batch_entries[batch_idx].seq = txn->commit_seq; + batch_entries[batch_idx].flags = tidesdb_txn_op_sl_flags(op); + batch_idx++; + } + } + else + { + for (int slot = 0; slot < dedup_hash_size; slot++) + { + if (dedup_hash[slot].key != NULL) + { + const tidesdb_txn_op_t *op = &txn->ops[dedup_hash[slot].op_idx]; + batch_entries[batch_idx].key = op->key; + batch_entries[batch_idx].key_size = op->key_size; + batch_entries[batch_idx].value = op->value; + batch_entries[batch_idx].value_size = op->value_size; + batch_entries[batch_idx].ttl = op->ttl; + batch_entries[batch_idx].seq = txn->commit_seq; + batch_entries[batch_idx].flags = tidesdb_txn_op_sl_flags(op); + batch_idx++; + } + } + } + + if (skip_list_put_batch(memtable, batch_entries, batch_idx) < 0) + { + result = TDB_ERR_MEMORY; + } + free(batch_entries); + } + else if (used_slots && used_slot_count > 0) + { + for (int i = 0; i < used_slot_count; i++) + { + const int slot = used_slots[i]; + const tidesdb_txn_op_t *op = &txn->ops[dedup_hash[slot].op_idx]; + if (skip_list_put_with_seq(memtable, op->key, op->key_size, op->value, op->value_size, + op->ttl, txn->commit_seq, tidesdb_txn_op_sl_flags(op)) != 0) + { + result = TDB_ERR_MEMORY; + 
break; + } + } + } + else + { + /* we scan full table (only for very large txns) */ + for (int slot = 0; slot < dedup_hash_size; slot++) + { + if (dedup_hash[slot].key != NULL) + { + const tidesdb_txn_op_t *op = &txn->ops[dedup_hash[slot].op_idx]; + if (skip_list_put_with_seq(memtable, op->key, op->key_size, op->value, + op->value_size, op->ttl, txn->commit_seq, + op->is_delete) != 0) + { + result = TDB_ERR_MEMORY; + break; + } + } + } + } + + free(dedup_hash); + free(used_slots); + return result; +} + +/** + * tidesdb_txn_serialize_wal + * serialize a transaction's WAL batch for a column family + * @param txn transaction to serialize + * @param cf column family to serialize for + * @param out_size output parameter for serialized size + * @param stack_buf caller-provided stack buffer for small payloads (may be NULL) + * @param stack_buf_size size of the caller-provided stack buffer + * @return serialized WAL batch (may point to stack_buf or heap-allocated memory) + */ +static uint8_t *tidesdb_txn_serialize_wal(const tidesdb_txn_t *txn, + const tidesdb_column_family_t *cf, size_t *out_size, + uint8_t *stack_buf, const size_t stack_buf_size) +{ + /*** single-pass serialization with pre-sized buffer + ** we estimate size based on average entry overhead + actual key/value sizes + ** overhead per entry -- flags(1) + varints(~15 max) + ttl(8 optional) = ~24 bytes max */ + size_t estimated_size = 0; + int cf_op_count = 0; + + /* we do a quick scan to count ops and estimate size */ + for (int i = 0; i < txn->num_ops; i++) + { + const tidesdb_txn_op_t *op = &txn->ops[i]; + if (op->cf == cf) + { + cf_op_count++; + const size_t entry_size = 24 + (size_t)op->key_size + (size_t)op->value_size; + if (estimated_size + entry_size < estimated_size) /* overflow check */ + { + *out_size = 0; + return NULL; + } + estimated_size += entry_size; + } + } + + if (cf_op_count == 0) + { + *out_size = 0; + return NULL; + } + + /* we use caller-provided stack buffer for small payloads to avoid 
malloc/free per txn */
+    uint8_t *wal_batch;
+    if (stack_buf != NULL && estimated_size <= stack_buf_size)
+    {
+        wal_batch = stack_buf;
+    }
+    else
+    {
+        wal_batch = malloc(estimated_size);
+        if (!wal_batch)
+        {
+            *out_size = estimated_size; /* signal alloc failure */
+            return NULL;
+        }
+    }
+
+    uint8_t *wal_ptr = wal_batch;
+
+    /* we write operations directly */
+    for (int i = 0; i < txn->num_ops; i++)
+    {
+        tidesdb_txn_op_t *op = &txn->ops[i];
+        if (op->cf != cf) continue;
+
+        uint8_t flags = op->is_delete ? TDB_KV_FLAG_TOMBSTONE : 0;
+        if (op->is_single_delete) flags |= TDB_KV_FLAG_SINGLE_DELETE;
+        if (op->ttl != 0) flags |= TDB_KV_FLAG_HAS_TTL;
+        *wal_ptr++ = flags;
+
+        wal_ptr += encode_varint(wal_ptr, op->key_size);
+        wal_ptr += encode_varint(wal_ptr, op->value_size);
+        wal_ptr += encode_varint(wal_ptr, txn->commit_seq);
+
+        if (op->ttl != 0)
+        {
+            encode_int64_le_compat(wal_ptr, op->ttl);
+            wal_ptr += sizeof(int64_t);
+        }
+
+        memcpy(wal_ptr, op->key, op->key_size);
+        wal_ptr += op->key_size;
+
+        if (op->value_size > 0 && op->value)
+        {
+            memcpy(wal_ptr, op->value, op->value_size);
+            wal_ptr += op->value_size;
+        }
+    }
+
+    *out_size = (size_t)(wal_ptr - wal_batch);
+    return wal_batch;
+}
+
+/**
+ * tidesdb_txn_serialize_wal_unified
+ * serialize all transaction ops into a single unified WAL batch
+ * format per entry -- cf_index(4 BE) + flags(1) + varint(key_size) + varint(value_size)
+ *                     + varint(seq) + [ttl(8)] + key + value
+ * the batch is prefixed with a 2-byte magic (TDB_UNIFIED_WAL_MAGIC) for identification
+ * @param txn transaction to serialize
+ * @param out_size output parameter for serialized size. set to 0 when the transaction has no
+ *                 ops or the size estimate overflows, and to the estimated size when the heap
+ *                 allocation fails (NULL is returned in all three cases)
+ * @param stack_buf caller-provided stack buffer for small payloads
+ * @param stack_buf_size size of the caller-provided stack buffer
+ * @return serialized WAL batch (may point to stack_buf or heap-allocated memory)
+ */
+static uint8_t *tidesdb_txn_serialize_wal_unified(const tidesdb_txn_t *txn, size_t *out_size,
+                                                  uint8_t *stack_buf, const size_t stack_buf_size)
+{
+    if (txn->num_ops == 0)
+    {
+        *out_size = 0;
+        return NULL;
+    }
+
+    /* we estimate size magic + per-entry overhead. the magic term uses the same constant the
+     * writer advances by below so the estimate and the write cannot drift apart.
+     * NOTE(review): the 24-byte per-entry allowance has to cover flags(1) + three varints +
+     * ttl(8) -- confirm it against the maximum width encode_varint can emit */
+    size_t estimated_size = TDB_UNIFIED_WAL_MAGIC_SIZE;
+    for (int i = 0; i < txn->num_ops; i++)
+    {
+        const tidesdb_txn_op_t *op = &txn->ops[i];
+        const size_t entry_size =
+            TDB_UNIFIED_CF_PREFIX_SIZE + 24 + (size_t)op->key_size + (size_t)op->value_size;
+        if (estimated_size + entry_size < estimated_size) /* overflow check */
+        {
+            *out_size = 0;
+            return NULL;
+        }
+        estimated_size += entry_size;
+    }
+
+    uint8_t *wal_batch;
+    if (stack_buf != NULL && estimated_size <= stack_buf_size)
+    {
+        wal_batch = stack_buf;
+    }
+    else
+    {
+        wal_batch = malloc(estimated_size);
+        if (!wal_batch)
+        {
+            *out_size = estimated_size; /* signal alloc failure */
+            return NULL;
+        }
+    }
+
+    uint8_t *wal_ptr = wal_batch;
+
+    /* we write magic, high byte first */
+    wal_ptr[0] = (uint8_t)(TDB_UNIFIED_WAL_MAGIC >> 8);
+    wal_ptr[1] = (uint8_t)(TDB_UNIFIED_WAL_MAGIC & 0xFF);
+    wal_ptr += TDB_UNIFIED_WAL_MAGIC_SIZE;
+
+    for (int i = 0; i < txn->num_ops; i++)
+    {
+        const tidesdb_txn_op_t *op = &txn->ops[i];
+
+        /* we write CF index */
+        tdb_encode_be32(op->cf->unified_cf_index, wal_ptr);
+        wal_ptr += TDB_UNIFIED_CF_PREFIX_SIZE;
+
+        uint8_t flags = op->is_delete ? TDB_KV_FLAG_TOMBSTONE : 0;
+        if (op->is_single_delete) flags |= TDB_KV_FLAG_SINGLE_DELETE;
+        if (op->ttl != 0) flags |= TDB_KV_FLAG_HAS_TTL;
+        *wal_ptr++ = flags;
+
+        wal_ptr += encode_varint(wal_ptr, op->key_size);
+        wal_ptr += encode_varint(wal_ptr, op->value_size);
+        wal_ptr += encode_varint(wal_ptr, txn->commit_seq);
+
+        /* ttl is only present when TDB_KV_FLAG_HAS_TTL was set above */
+        if (op->ttl != 0)
+        {
+            encode_int64_le_compat(wal_ptr, op->ttl);
+            wal_ptr += sizeof(int64_t);
+        }
+
+        memcpy(wal_ptr, op->key, op->key_size);
+        wal_ptr += op->key_size;
+
+        if (op->value_size > 0 && op->value)
+        {
+            memcpy(wal_ptr, op->value, op->value_size);
+            wal_ptr += op->value_size;
+        }
+    }
+
+    *out_size = (size_t)(wal_ptr - wal_batch);
+    return wal_batch;
+}
+
+/**
+ * tidesdb_txn_put_unified_op
+ * put one transaction op into the unified skip list under its cf-prefixed key
+ * @param txn transaction that owns the op (supplies commit_seq)
+ * @param op operation to apply
+ * @param memtable unified skip list
+ * @return TDB_SUCCESS on success, TDB_ERR_MEMORY if the prefixed key could not be allocated
+ *         or the skip list put failed
+ */
+static int tidesdb_txn_put_unified_op(const tidesdb_txn_t *txn, const tidesdb_txn_op_t *op,
+                                      skip_list_t *memtable)
+{
+    const size_t pk_total = TDB_UNIFIED_CF_PREFIX_SIZE + op->key_size;
+    TDB_PREFIXED_KEY_ALLOC(prefixed, pk_total, _pk_stack_op);
+    if (!prefixed) return TDB_ERR_MEMORY;
+    const size_t pk_size =
+        tdb_build_prefixed_key(op->cf->unified_cf_index, op->key, op->key_size, prefixed);
+    const int rc = skip_list_put_with_seq(memtable, prefixed, pk_size, op->value, op->value_size,
+                                          op->ttl, txn->commit_seq, tidesdb_txn_op_sl_flags(op));
+    TDB_PREFIXED_KEY_FREE(prefixed, _pk_stack_op);
+    return rc == 0 ? TDB_SUCCESS : TDB_ERR_MEMORY;
+}
+
+/**
+ * tidesdb_txn_apply_ops_to_unified_memtable
+ * apply all transaction operations to the unified skip list with prefixed keys
+ * keys are prefixed with 4-byte BE CF index for isolation
+ * three paths -- a single op is put directly; fewer than TDB_TXN_DEDUP_SKIP_THRESHOLD ops use a
+ * quadratic last-write-wins scan with stack storage; larger transactions use an open-addressing
+ * hash (same as non-unified path) + skip_list_put_batch
+ * every allocation or put failure is reported to the caller; ops applied before the failure
+ * stay in the memtable
+ * @param txn transaction
+ * @param memtable unified skip list
+ * @return TDB_SUCCESS on success, error code on failure
+ */
+static int tidesdb_txn_apply_ops_to_unified_memtable(const tidesdb_txn_t *txn,
+                                                     skip_list_t *memtable)
+{
+    if (txn->num_ops == 0) return TDB_SUCCESS;
+
+    /* single-op fast path, we skip dedup and batch overhead entirely */
+    if (txn->num_ops == 1) return tidesdb_txn_put_unified_op(txn, &txn->ops[0], memtable);
+
+    const int num_ops = txn->num_ops;
+
+    /* small-txn path -- O(n²) dedup is acceptable for tiny batches, we use stack batch + put_batch
+     */
+    if (num_ops < TDB_TXN_DEDUP_SKIP_THRESHOLD)
+    {
+        skip_list_batch_entry_t stack_batch[TDB_TXN_DEDUP_SKIP_THRESHOLD];
+        /* prefixed key storage on the stack for small txns */
+        uint8_t pk_buf[TDB_TXN_DEDUP_SKIP_THRESHOLD *
+                       (TDB_UNIFIED_CF_PREFIX_SIZE + TDB_PREFIXED_KEY_STACK_MAX)];
+        size_t pk_buf_used = 0;
+        int batch_idx = 0;
+
+        /* newest to oldest, an op is dropped when a later op hits the same (cf, key) */
+        for (int i = num_ops - 1; i >= 0; i--)
+        {
+            const tidesdb_txn_op_t *op = &txn->ops[i];
+
+            int is_superseded = 0;
+            for (int j = i + 1; j < num_ops; j++)
+            {
+                const tidesdb_txn_op_t *later = &txn->ops[j];
+                if (later->cf == op->cf && later->key_size == op->key_size &&
+                    memcmp(later->key, op->key, op->key_size) == 0)
+                {
+                    is_superseded = 1;
+                    break;
+                }
+            }
+            if (is_superseded) continue;
+
+            const size_t pk_total = TDB_UNIFIED_CF_PREFIX_SIZE + op->key_size;
+            if (pk_buf_used + pk_total > sizeof(pk_buf))
+            {
+                /* too large for stack, we use individual puts */
+                if (tidesdb_txn_put_unified_op(txn, op, memtable) != TDB_SUCCESS)
+                    return TDB_ERR_MEMORY;
+                continue;
+            }
+
+            uint8_t *pk_dest = pk_buf + pk_buf_used;
+            tdb_build_prefixed_key(op->cf->unified_cf_index, op->key, op->key_size, pk_dest);
+            pk_buf_used += pk_total;
+
+            stack_batch[batch_idx].key = pk_dest;
+            stack_batch[batch_idx].key_size = pk_total;
+            stack_batch[batch_idx].value = op->value;
+            stack_batch[batch_idx].value_size = op->value_size;
+            stack_batch[batch_idx].ttl = op->ttl;
+            stack_batch[batch_idx].seq = txn->commit_seq;
+            stack_batch[batch_idx].flags = tidesdb_txn_op_sl_flags(op);
+            batch_idx++;
+        }
+
+        if (batch_idx > 0 && skip_list_put_batch(memtable, stack_batch, batch_idx) < 0)
+            return TDB_ERR_MEMORY;
+        return TDB_SUCCESS;
+    }
+
+    /*** large-txn path O(n) hash-based dedup + skip_list_put_batch with prefixed keys
+     ** mirrors the non-unified tidesdb_txn_apply_ops_to_memtable hash path
+     * we use power-of-2 hash size so slot = hash & mask (avoids expensive div) */
+    int dedup_hash_size = num_ops * TDB_TXN_DEDUP_HASH_MULTIPLIER;
+    if (dedup_hash_size < TDB_TXN_DEDUP_MIN_HASH_SIZE)
+        dedup_hash_size = TDB_TXN_DEDUP_MIN_HASH_SIZE;
+    /* we round up to next power of 2 */
+    {
+        int v = dedup_hash_size - 1;
+        v |= v >> 1;
+        v |= v >> 2;
+        v |= v >> 4;
+        v |= v >> 8;
+        v |= v >> 16;
+        dedup_hash_size = v + 1;
+    }
+    const uint32_t dedup_hash_mask = (uint32_t)(dedup_hash_size - 1);
+
+    typedef struct
+    {
+        const uint8_t *key;
+        size_t key_size;
+        const tidesdb_column_family_t *cf;
+        int op_idx;
+    } unified_dedup_entry_t;
+
+    unified_dedup_entry_t *dedup_hash = calloc((size_t)dedup_hash_size, sizeof(*dedup_hash));
+
+    /* occupied slots are tracked only below TDB_TXN_DEDUP_MAX_TRACKED ops; otherwise (or when
+     * this allocation fails) the collection passes scan the whole table instead */
+    int *used_slots = NULL;
+    const int used_slots_capacity = num_ops < TDB_TXN_DEDUP_MAX_TRACKED ? num_ops : 0;
+    if (used_slots_capacity > 0)
+        used_slots = malloc((size_t)used_slots_capacity * sizeof(*used_slots));
+
+    if (!dedup_hash)
+    {
+        /* we write all ops without dedup */
+        free(used_slots);
+        for (int i = 0; i < num_ops; i++)
+        {
+            if (tidesdb_txn_put_unified_op(txn, &txn->ops[i], memtable) != TDB_SUCCESS)
+                return TDB_ERR_MEMORY;
+        }
+        return TDB_SUCCESS;
+    }
+
+    int result = TDB_SUCCESS;
+    int used_slot_count = 0;
+    skip_list_batch_entry_t *batch_entries = NULL;
+    uint8_t *pk_arena = NULL;
+
+    /* we build hash from newest to oldest (last write wins) */
+    for (int i = num_ops - 1; i >= 0; i--)
+    {
+        const tidesdb_txn_op_t *op = &txn->ops[i];
+
+        /* the hash includes CF index to distinguish same-key across different CFs */
+        uint8_t hash_buf[TDB_UNIFIED_CF_PREFIX_SIZE + TDB_PREFIXED_KEY_STACK_MAX];
+        uint8_t *hash_key = hash_buf;
+        const size_t hash_key_size = TDB_UNIFIED_CF_PREFIX_SIZE + op->key_size;
+        if (hash_key_size > sizeof(hash_buf))
+        {
+            hash_key = malloc(hash_key_size);
+            if (!hash_key)
+            {
+                /* skipping the op here would silently lose a committed write */
+                result = TDB_ERR_MEMORY;
+                goto cleanup;
+            }
+        }
+        tdb_build_prefixed_key(op->cf->unified_cf_index, op->key, op->key_size, hash_key);
+
+        const uint32_t hash = XXH32(hash_key, hash_key_size, TDB_TXN_HASH_SEED);
+        /* only the hash value is needed from here on; slots compare (cf, key) directly */
+        if (hash_key != hash_buf) free(hash_key);
+
+        int slot = (int)(hash & dedup_hash_mask);
+        int inserted = 0;
+        int is_duplicate = 0;
+        for (int probe = 0; probe < TDB_TXN_MAX_PROBE_LENGTH; probe++)
+        {
+            if (dedup_hash[slot].key == NULL)
+            {
+                dedup_hash[slot].key = op->key;
+                dedup_hash[slot].key_size = op->key_size;
+                dedup_hash[slot].cf = op->cf;
+                dedup_hash[slot].op_idx = i;
+                inserted = 1;
+                if (used_slots && used_slot_count < used_slots_capacity)
+                    used_slots[used_slot_count++] = slot;
+                break;
+            }
+            if (dedup_hash[slot].cf == op->cf && dedup_hash[slot].key_size == op->key_size &&
+                memcmp(dedup_hash[slot].key, op->key, op->key_size) == 0)
+            {
+                is_duplicate = 1;
+                break;
+            }
+            slot = (slot + 1) & (int)dedup_hash_mask;
+        }
+
+        if (!inserted && !is_duplicate)
+        {
+            /* probe chain exhausted, we insert without dedup */
+            if (tidesdb_txn_put_unified_op(txn, op, memtable) != TDB_SUCCESS)
+            {
+                result = TDB_ERR_MEMORY;
+                goto cleanup;
+            }
+        }
+    }
+
+    /* we collect deduplicated ops and apply via skip_list_put_batch. scan_count is how many
+     * slots the collection passes visit -- the tracked list when we have one, otherwise the
+     * whole table; dedup_count is the upper bound on surviving ops */
+    const int scan_count = used_slots ? used_slot_count : dedup_hash_size;
+    const int dedup_count = used_slots ? used_slot_count : num_ops;
+
+    /* we size the prefixed key arena */
+    size_t pk_arena_size = 0;
+    for (int k = 0; k < scan_count; k++)
+    {
+        const int slot = used_slots ? used_slots[k] : k;
+        if (!used_slots && dedup_hash[slot].key == NULL) continue;
+        pk_arena_size += TDB_UNIFIED_CF_PREFIX_SIZE + txn->ops[dedup_hash[slot].op_idx].key_size;
+    }
+
+    batch_entries = malloc((size_t)dedup_count * sizeof(*batch_entries));
+    if (batch_entries) pk_arena = malloc(pk_arena_size);
+
+    if (!batch_entries || !pk_arena)
+    {
+        /* individual puts. this covers the untracked case too, so a failed batch allocation
+         * can never return success without having written the surviving ops */
+        for (int k = 0; k < scan_count; k++)
+        {
+            const int slot = used_slots ? used_slots[k] : k;
+            if (!used_slots && dedup_hash[slot].key == NULL) continue;
+            if (tidesdb_txn_put_unified_op(txn, &txn->ops[dedup_hash[slot].op_idx], memtable) !=
+                TDB_SUCCESS)
+            {
+                result = TDB_ERR_MEMORY;
+                break;
+            }
+        }
+        goto cleanup;
+    }
+
+    int batch_idx = 0;
+    size_t pk_arena_used = 0;
+    for (int k = 0; k < scan_count; k++)
+    {
+        const int slot = used_slots ? used_slots[k] : k;
+        if (!used_slots && dedup_hash[slot].key == NULL) continue;
+
+        const tidesdb_txn_op_t *op = &txn->ops[dedup_hash[slot].op_idx];
+        const size_t pk_total = TDB_UNIFIED_CF_PREFIX_SIZE + op->key_size;
+
+        uint8_t *pk_dest = pk_arena + pk_arena_used;
+        tdb_build_prefixed_key(op->cf->unified_cf_index, op->key, op->key_size, pk_dest);
+        pk_arena_used += pk_total;
+
+        batch_entries[batch_idx].key = pk_dest;
+        batch_entries[batch_idx].key_size = pk_total;
+        batch_entries[batch_idx].value = op->value;
+        batch_entries[batch_idx].value_size = op->value_size;
+        batch_entries[batch_idx].ttl = op->ttl;
+        batch_entries[batch_idx].seq = txn->commit_seq;
+        batch_entries[batch_idx].flags = tidesdb_txn_op_sl_flags(op);
+        batch_idx++;
+    }
+
+    if (batch_idx > 0 && skip_list_put_batch(memtable, batch_entries, batch_idx) < 0)
+        result = TDB_ERR_MEMORY;
+
+cleanup:
+    free(batch_entries);
+    free(pk_arena);
+    free(dedup_hash);
+    free(used_slots);
+    return result;
+}
+
+/**
+ * tidesdb_find_cf_by_unified_index
+ * find a column family by its unified_cf_index
+ * caller must hold db->cf_list_lock (read or write)
+ * @param db database instance
+ * @param cf_index unified CF index to find
+ * @return column family pointer, or NULL if not found
+ */
+static tidesdb_column_family_t *tidesdb_find_cf_by_unified_index(tidesdb_t *db, uint32_t cf_index)
+{
+    /* linear scan; NULL entries in the array are skipped */
+    for (int i = 0; i < db->num_column_families; i++)
+    {
+        if (db->column_families[i] && db->column_families[i]->unified_cf_index == cf_index)
+        {
+            return db->column_families[i];
+        }
+    }
+    return NULL;
+}
+
+/**
+ * tidesdb_unified_split_t
+ * one cf's run located during unified flush phase 1 -- its column family, the cf_index prefix that
+ * bounds its run in the shared unified skip list, and the run's node count for sstable sizing. the
+ * per-cf flush task writes that run straight from the unified skip list, so there is no temp copy.
+ */
+typedef struct
+{
+    tidesdb_column_family_t *cf;
+    uint32_t cf_index;
+    int entry_count;
+} tidesdb_unified_split_t;
+
+/**
+ * tidesdb_unified_close_wal
+ * close, optionally upload, and unlink the unified wal backing umt_imm. respects
+ * the object store replicate_wal and wal_upload_sync config.
+ * @param db database instance (supplies the object store config and db_path)
+ * @param umt_imm unified immutable memtable whose wal is closed; its wal pointer is cleared
+ * @param persisted non-zero when the flush recorded everything durably; zero keeps the wal
+ *                  file on disk
+ */
+static void tidesdb_unified_close_wal(tidesdb_t *db, tidesdb_memtable_t *umt_imm, int persisted)
+{
+    block_manager_t *wal = umt_imm->wal;
+    if (wal == NULL) return;
+
+    /* the path and generation are captured before the block manager goes away */
+    char *wal_path = tdb_strdup(wal->file_path);
+    const uint64_t imm_gen = umt_imm->generation;
+    block_manager_close(wal);
+    umt_imm->wal = NULL;
+    if (wal_path == NULL) return;
+
+    /* a per-cf sstable write or manifest commit in this flush failed, so some cf's data
+     * is not durably recorded; retain the shared wal (fd already closed) so recovery can
+     * replay it instead of losing those entries. a later flush re-persists and cleans it. */
+    if (persisted)
+    {
+        const int replicate = db->object_store && db->config.object_store_config &&
+                              db->config.object_store_config->replicate_wal;
+
+        if (replicate && !db->config.object_store_config->wal_upload_sync)
+        {
+            /** async upload with the wal generation for fence tracking. the reaper
+             * cleans up the local file after the upload confirms. */
+            tdb_objstore_enqueue_upload(db, wal_path, imm_gen);
+        }
+        else
+        {
+            /* synchronous replication uploads first; both remaining cases then remove the
+             * local file and sync the directory */
+            if (replicate) tdb_objstore_upload_file_sync(db, wal_path);
+            tdb_unlink(wal_path);
+            tdb_sync_directory(db->db_path);
+        }
+    }
+    free(wal_path);
+}
+
+/**
+ * tidesdb_unified_write_cf_sstable
+ * write cf's cf_index prefix segment of the shared unified skip list as a fresh l1 sstable, commit
+ * the manifest, and trigger compaction if thresholds are met. unified_sl is borrowed (the immutable
+ * owns it); entry_count sizes the sstable bloom/index for the segment.
+ * @param db database instance
+ * @param cf column family that receives the sstable
+ * @param unified_sl shared unified skip list to read the segment from
+ * @param cf_index unified cf index whose big-endian encoding prefixes the segment's keys
+ * @param entry_count number of nodes in the segment
+ * @return TDB_SUCCESS on success (including when the cf is being dropped and the sstable is
+ *         discarded), TDB_ERR_INVALID_ARGS on NULL arguments, the writer's error code if the
+ *         sstable write failed, TDB_ERR_IO if sstable creation or the manifest commit failed
+ */
+static int tidesdb_unified_write_cf_sstable(tidesdb_t *db, tidesdb_column_family_t *cf,
+                                            skip_list_t *unified_sl, uint32_t cf_index,
+                                            int entry_count)
+{
+    if (!db || !cf || !unified_sl) return TDB_ERR_INVALID_ARGS;
+
+    const uint64_t sst_id = atomic_fetch_add(&cf->next_sstable_id, 1);
+    char sst_path[MAX_FILE_PATH_LENGTH];
+    snprintf(sst_path, sizeof(sst_path), "%s" PATH_SEPARATOR TDB_LEVEL_PREFIX "1", cf->directory);
+
+    tidesdb_sstable_t *sst = tidesdb_sstable_create(db, sst_path, sst_id, &cf->config);
+    if (!sst)
+    {
+        TDB_DEBUG_LOG(TDB_LOG_ERROR, "Unified flush for CF '%s' SSTable creation failed", cf->name);
+        return TDB_ERR_IO;
+    }
+
+    uint8_t seg_prefix[TDB_UNIFIED_CF_PREFIX_SIZE];
+    tdb_encode_be32(cf_index, seg_prefix);
+
+    int wr;
+    if (cf->config.use_btree)
+        wr = tidesdb_sstable_write_from_memtable_btree_ex(db, cf, sst, unified_sl, seg_prefix,
+                                                          TDB_UNIFIED_CF_PREFIX_SIZE, entry_count);
+    else
+        wr = tidesdb_sstable_write_from_memtable_ex(db, cf, sst, unified_sl, seg_prefix,
+                                                    TDB_UNIFIED_CF_PREFIX_SIZE, entry_count);
+
+    if (wr != TDB_SUCCESS)
+    {
+        TDB_DEBUG_LOG(TDB_LOG_ERROR, "Unified flush for CF '%s' SSTable write failed (error %d)",
+                      cf->name, wr);
+        tidesdb_sstable_unref(db, sst);
+        return wr;
+    }
+
+    /* the write may have returned success after aborting mid-loop; do not publish a partial
+     * sstable to the level or manifest, and do not enqueue a fresh compaction for a CF the
+     * caller is about to free. remove_directory will sweep the on-disk klog/vlog. */
+    if (tidesdb_cf_abort_requested(cf))
+    {
+        TDB_DEBUG_LOG(TDB_LOG_INFO,
+                      "Unified flush for CF '%s' marked for deletion, discarding SSTable %" PRIu64,
+                      cf->name, sst_id);
+        tidesdb_sstable_unref(db, sst);
+        return TDB_SUCCESS;
+    }
+
+    tidesdb_block_managers_t bms;
+    if (tidesdb_sstable_get_block_managers(db, sst, &bms) == TDB_SUCCESS)
+    {
+        if (bms.klog_bm) block_manager_escalate_fsync(bms.klog_bm);
+        if (bms.vlog_bm) block_manager_escalate_fsync(bms.vlog_bm);
+    }
+    /* the write opened the klog via tidesdb_sstable_ensure_open, which counted it in
+     * num_open_sstables (the count is keyed on the klog). closing it here must drop that count or
+     * num_open leaks one per flush -- the published sstable carries klog_bm == NULL, so the reaper,
+     * which only reclaims klog-open in-level sstables, can never bring the count back down and it
+     * climbs until it pegs max_open_sstables and reads start backing off with TDB_ERR_BUSY */
+    const int had_open_klog = (sst->klog_bm != NULL);
+    if (sst->klog_bm)
+    {
+        block_manager_close(sst->klog_bm);
+        sst->klog_bm = NULL;
+    }
+    if (sst->vlog_bm)
+    {
+        block_manager_close(sst->vlog_bm);
+        sst->vlog_bm = NULL;
+    }
+    if (had_open_klog) atomic_fetch_sub(&db->num_open_sstables, 1);
+
+    /* drop may have fired during the fsync/close above; check once more before publishing
+     * to the level so we do not leave a fresh sstable behind for remove_directory to race */
+    if (tidesdb_cf_abort_requested(cf))
+    {
+        TDB_DEBUG_LOG(TDB_LOG_INFO,
+                      "Unified flush for CF '%s' marked for deletion, discarding SSTable %" PRIu64,
+                      cf->name, sst_id);
+        tidesdb_sstable_unref(db, sst);
+        return TDB_SUCCESS;
+    }
+
+    tidesdb_level_add_sstable(cf->levels[0], sst);
+    tidesdb_bump_sstable_layout_version(cf);
+
+    tidesdb_manifest_add_sstable(cf->manifest, 1, sst_id, sst->num_entries,
+                                 sst->klog_size + sst->vlog_size);
+    atomic_store(&cf->manifest->sequence, atomic_load(&cf->next_sstable_id));
+    const int manifest_result = tidesdb_manifest_commit(cf->manifest, cf->manifest->path);
+    if (manifest_result != 0)
+        TDB_DEBUG_LOG(TDB_LOG_ERROR,
+                      "Unified flush CF '%s' failed to commit manifest for SSTable %" PRIu64
+                      " (error: %d)",
+                      cf->name, sst_id, manifest_result);
+    else
+        tdb_objstore_upload_manifest(db, cf);
+
+    TDB_DEBUG_LOG(TDB_LOG_INFO, "Unified flush for CF '%s' SSTable %" PRIu64 " written", cf->name,
+                  sst_id);
+
+    int num_l1 = atomic_load_explicit(&cf->levels[0]->num_sstables, memory_order_acquire);
+    int density_hit = 0;
+    int density_witness_level = 0;
+    uint8_t *density_min_key = NULL, *density_max_key = NULL;
+    size_t density_min_key_size = 0, density_max_key_size = 0;
+    if (cf->config.tombstone_density_trigger > 0.0)
+    {
+        const uint64_t min_entries = cf->config.tombstone_density_min_entries
+                                         ? cf->config.tombstone_density_min_entries
+                                         : TDB_DEFAULT_TOMBSTONE_DENSITY_MIN_ENTRIES;
+        density_hit = tidesdb_cf_dense_tombstone_witness(
+            cf, cf->config.tombstone_density_trigger, min_entries, &density_witness_level, NULL,
+            &density_min_key, &density_min_key_size, &density_max_key, &density_max_key_size);
+    }
+
+    const int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire);
+    if (density_hit && density_witness_level > 0 && density_witness_level < num_levels &&
+        density_min_key && density_max_key)
+    {
+        /* steer the dense sstable's range down to the largest level so its
+         * regular tombstones reach where they can drop; ownership of the key
+         * copies passes to the steer helper */
+        tidesdb_compact_steer_to_bottom(cf, density_min_key, density_min_key_size, density_max_key,
+                                        density_max_key_size);
+        density_min_key = NULL;
+        density_max_key = NULL;
+    }
+    else if (num_l1 >= tdb_cf_effective_l1_trigger(cf) || density_hit)
+    {
+        /* auto-compaction trigger -- geometry-driven, not a full merge */
+        tidesdb_enqueue_compaction(cf, 0);
+    }
+
+    /* free the witness key copies if the steer path did not take ownership */
+    free(density_min_key);
+    free(density_max_key);
+
+    tidesdb_sstable_unref(db, sst);
+    /* propagate a failed manifest commit so the barrier retains the shared wal -- the
+     * sstable is in-memory only and recovery would otherwise orphan-delete it */
+    return manifest_result == 0 ? TDB_SUCCESS : TDB_ERR_IO;
+}
+
+/**
+ * tidesdb_unified_flush_barrier_finish
+ * decrement the per-cf task barrier. the task that brings remaining to zero owns
+ * the unified wal cleanup, the flushed flag transition, and the barrier free.
+ * earlier finishers just decrement and return.
+ */
+static void tidesdb_unified_flush_barrier_finish(tidesdb_unified_flush_barrier_t *barrier)
+{
+    if (!barrier) return;
+    if (atomic_fetch_sub_explicit(&barrier->remaining, 1, memory_order_acq_rel) != 1) return;
+
+    tidesdb_unified_close_wal(
+        barrier->db, barrier->umt_imm,
+        atomic_load_explicit(&barrier->overall_result, memory_order_acquire) == TDB_SUCCESS);
+    atomic_store_explicit(&barrier->umt_imm->flushed, 1, memory_order_release);
+    free(barrier);
+}
+
+/**
+ * tidesdb_unified_splits_append
+ * append one located cf run to the phase-1 split array, doubling the array when it is full
+ * @param splits in/out split array (may be NULL while cap is 0)
+ * @param split_count in/out number of recorded splits
+ * @param split_cap in/out allocated capacity of the array
+ * @param cf column family of the run
+ * @param cf_index unified cf index prefix of the run
+ * @param entry_count number of nodes counted in the run
+ * @return TDB_SUCCESS, or TDB_ERR_MEMORY if the array could not grow (array left untouched)
+ */
+static int tidesdb_unified_splits_append(tidesdb_unified_split_t **splits, int *split_count,
+                                         int *split_cap, tidesdb_column_family_t *cf,
+                                         uint32_t cf_index, int entry_count)
+{
+    if (*split_count == *split_cap)
+    {
+        const int new_cap = *split_cap == 0 ? TDB_UNIFIED_SPLITS_INITIAL_CAP : *split_cap * 2;
+        tidesdb_unified_split_t *grown = realloc(*splits, (size_t)new_cap * sizeof(*grown));
+        if (!grown) return TDB_ERR_MEMORY;
+        *splits = grown;
+        *split_cap = new_cap;
+    }
+
+    tidesdb_unified_split_t *slot = &(*splits)[*split_count];
+    slot->cf = cf;
+    slot->cf_index = cf_index;
+    slot->entry_count = entry_count;
+    (*split_count)++;
+    return TDB_SUCCESS;
+}
+
+/**
+ * tidesdb_unified_flush_split_inline
+ * write one cf run on the calling thread when it could not be handed to a flush worker, record
+ * the first failure on the barrier, and release this run's barrier slot
+ * @param db database instance
+ * @param unified_sl shared unified skip list
+ * @param split run to write
+ * @param barrier per-flush barrier the run was counted in
+ */
+static void tidesdb_unified_flush_split_inline(tidesdb_t *db, skip_list_t *unified_sl,
+                                               const tidesdb_unified_split_t *split,
+                                               tidesdb_unified_flush_barrier_t *barrier)
+{
+    const int wr = tidesdb_unified_write_cf_sstable(db, split->cf, unified_sl, split->cf_index,
+                                                    split->entry_count);
+    if (wr != TDB_SUCCESS)
+    {
+        /* only the first error is kept; the CAS fails once a result is already recorded */
+        int expected = TDB_SUCCESS;
+        atomic_compare_exchange_strong_explicit(&barrier->overall_result, &expected, wr,
+                                                memory_order_acq_rel, memory_order_relaxed);
+    }
+    tidesdb_unified_flush_barrier_finish(barrier);
+}
+
+/**
+ * tidesdb_unified_flush_immutable
+ * flush a unified immutable memtable by demuxing entries into per-cf sstables.
+ * entries are sorted by a four byte big-endian cf_index followed by the user
+ * key, so consecutive entries with the same prefix belong to the same cf.
+ * phase one walks the cursor once to locate each cf's contiguous run and count
+ * its nodes; it does not copy any data.
+ * phase two enqueues a per-cf flush task for each non-empty cf onto the shared
+ * flush queue so workers write the per-cf sstables in parallel rather than
+ * sequentially within one worker. the last task to finish closes the unified
+ * wal and marks the memtable flushed. per-cf io errors are recorded on the
+ * barrier and logged by the workers.
+ * the unified wal is only unlinked when phase one located every run and every
+ * per-cf write succeeded; any failure retains the wal file for recovery.
+ * @param db database instance
+ * @param umt_imm unified immutable memtable to flush
+ * @return TDB_SUCCESS when phase one completed (per-cf results are reported through the
+ *         barrier), TDB_ERR_INVALID_ARGS on NULL arguments, TDB_ERR_MEMORY if the cursor or
+ *         the split array could not be allocated, or the first write error on the inline path
+ */
+static int tidesdb_unified_flush_immutable(tidesdb_t *db, tidesdb_memtable_t *umt_imm)
+{
+    if (!db || !umt_imm || !umt_imm->skip_list) return TDB_ERR_INVALID_ARGS;
+
+    /* we wait for all in-flight writers to finish before reading from memtable.
+     * writers bump umt_imm->writers while they mutate the WAL and skip list, so
+     * once this drains to zero no thread is touching either and closing the WAL
+     * at the end of the flush is safe. we deliberately drain writers and not
+     * refcount -- concurrent readers and iterators pin the immutable through
+     * refcount, and waiting on refcount would let sustained read load stall the
+     * flush forever while the immutable queue grows unbounded. readers only read
+     * the skip list, which is safe to do alongside the flush. */
+    int drain_iterations = 0;
+    while (atomic_load_explicit(&umt_imm->writers, memory_order_acquire) > 0)
+    {
+        drain_iterations++;
+        if (drain_iterations < TDB_REFCOUNT_DRAIN_SPIN_THRESHOLD)
+        {
+            cpu_pause();
+        }
+        else if (drain_iterations < TDB_REFCOUNT_DRAIN_YIELD_THRESHOLD)
+        {
+            cpu_yield();
+        }
+        else
+        {
+            usleep(TDB_REFCOUNT_DRAIN_SLEEP_US);
+        }
+        if ((drain_iterations & TDB_REFCOUNT_DRAIN_LOG_INTERVAL) == 0)
+        {
+            TDB_DEBUG_LOG(
+                TDB_LOG_WARN,
+                "Unified flush worker waiting for in-flight writers to drain (current=%d)",
+                atomic_load_explicit(&umt_imm->writers, memory_order_acquire));
+        }
+    }
+    atomic_thread_fence(memory_order_acquire);
+
+    /* snapshot floor -- versions strictly above this seq are still needed by some
+     * active reader and must survive the flush. once we emit a version <= floor for
+     * a given key, no older version on that key is needed by any current snapshot */
+    const uint64_t min_snapshot_seq = tidesdb_min_active_snapshot_seq(db);
+
+    skip_list_cursor_t *cursor = NULL;
+    if (skip_list_cursor_init(&cursor, umt_imm->skip_list) != 0) return TDB_ERR_MEMORY;
+
+    if (skip_list_cursor_goto_first(cursor) != 0)
+    {
+        /* empty memtable -- nothing to persist, so the wal can go */
+        skip_list_cursor_free(cursor);
+        tidesdb_unified_close_wal(db, umt_imm, 1);
+        atomic_store_explicit(&umt_imm->flushed, 1, memory_order_release);
+        return TDB_SUCCESS;
+    }
+
+    (void)min_snapshot_seq; /* phase 2's writer applies the snapshot floor while streaming */
+
+    tidesdb_unified_split_t *splits = NULL;
+    int split_count = 0;
+    int split_cap = 0;
+    int phase1_result = TDB_SUCCESS;
+
+    /* phase 1 is a light scan -- it walks the unified skip list once just to locate each cf's
+     * contiguous cf_index run and count its nodes. it does NOT rebuild the data; phase 2's per-cf
+     * task streams each run straight from the unified skip list (entries within a run are already
+     * in memcmp order, which is the cf order since unified mode forbids custom comparators). */
+    uint32_t current_cf_index = UINT32_MAX;
+    tidesdb_column_family_t *current_cf = NULL;
+    int current_count = 0;
+
+    do
+    {
+    reprocess_current_entry:;
+        uint8_t *raw_key, *value;
+        size_t raw_key_size, value_size;
+        int64_t ttl;
+        uint8_t deleted;
+        uint64_t seq;
+
+        if (skip_list_cursor_get_with_seq(cursor, &raw_key, &raw_key_size, &value, &value_size,
+                                          &ttl, &deleted, &seq) != 0)
+            continue;
+
+        if (raw_key_size < TDB_UNIFIED_CF_PREFIX_SIZE) continue;
+
+        const uint32_t cf_index = tdb_decode_be32(raw_key);
+
+        /* drop marked the CF mid-segment -- abandon its run and fast-forward past the rest so we
+         * do not pay the per-entry decode + branch cost for every remaining entry of a dropping CF
+         */
+        if (current_cf && cf_index == current_cf_index && tidesdb_cf_abort_requested(current_cf))
+        {
+            current_cf = NULL;
+            current_count = 0;
+            if (tdb_unified_dispatch_skip_segment(cursor, cf_index)) goto reprocess_current_entry;
+            break;
+        }
+
+        if (cf_index != current_cf_index)
+        {
+            /* a new cf_index starts a new run -- record the run that just ended as a split */
+            if (current_cf &&
+                tidesdb_unified_splits_append(&splits, &split_count, &split_cap, current_cf,
+                                              current_cf_index, current_count) != TDB_SUCCESS)
+            {
+                phase1_result = TDB_ERR_MEMORY;
+                break;
+            }
+
+            current_count = 0;
+            pthread_rwlock_rdlock(&db->cf_list_lock);
+            current_cf = tidesdb_find_cf_by_unified_index(db, cf_index);
+            pthread_rwlock_unlock(&db->cf_list_lock);
+            current_cf_index = cf_index;
+
+            /* a CF marked for deletion still resolves until the drop list-shift completes;
+             * treat it like an unresolved CF so the dispatcher skips its slice rather than
+             * writing an sstable we are about to unlink */
+            if (current_cf && tidesdb_cf_abort_requested(current_cf))
+            {
+                current_cf = NULL;
+            }
+
+            if (!current_cf)
+            {
+                TDB_DEBUG_LOG(TDB_LOG_WARN,
+                              "Unified flush for CF index %u not found, skipping entries",
+                              cf_index);
+                if (tdb_unified_dispatch_skip_segment(cursor, cf_index))
+                    goto reprocess_current_entry;
+                break;
+            }
+        }
+
+        if (current_cf) current_count++;
+    } while (skip_list_cursor_next(cursor) == 0);
+
+    /* record the final run (the loop ends without a cf_index change to flush it) */
+    if (current_cf &&
+        tidesdb_unified_splits_append(&splits, &split_count, &split_cap, current_cf,
+                                      current_cf_index, current_count) != TDB_SUCCESS)
+    {
+        phase1_result = TDB_ERR_MEMORY;
+    }
+
+    skip_list_cursor_free(cursor);
+
+    if (split_count == 0)
+    {
+        /* nothing will be written. if that is because phase 1 failed, the entries exist only in
+         * the wal, so it must be retained rather than unlinked */
+        free(splits);
+        tidesdb_unified_close_wal(db, umt_imm, phase1_result == TDB_SUCCESS);
+        atomic_store_explicit(&umt_imm->flushed, 1, memory_order_release);
+        return phase1_result;
+    }
+
+    skip_list_t *unified_sl = umt_imm->skip_list;
+
+    tidesdb_unified_flush_barrier_t *barrier = malloc(sizeof(*barrier));
+    if (!barrier)
+    {
+        /* on barrier alloc failure write inline so we do not lose data */
+        int rc = phase1_result;
+        for (int i = 0; i < split_count; i++)
+        {
+            const int wr = tidesdb_unified_write_cf_sstable(
+                db, splits[i].cf, unified_sl, splits[i].cf_index, splits[i].entry_count);
+            if (wr != TDB_SUCCESS) rc = wr;
+        }
+        free(splits);
+        tidesdb_unified_close_wal(db, umt_imm, rc == TDB_SUCCESS);
+        atomic_store_explicit(&umt_imm->flushed, 1, memory_order_release);
+        return rc;
+    }
+
+    atomic_init(&barrier->remaining, split_count);
+    /* seed with the phase-1 outcome -- runs that phase 1 failed to record are never written, so
+     * the last finisher must not treat the flush as fully persisted and unlink the wal */
+    atomic_init(&barrier->overall_result, phase1_result);
+    barrier->umt_imm = umt_imm;
+    barrier->db = db;
+
+    for (int i = 0; i < split_count; i++)
+    {
+        tidesdb_flush_work_t *work = malloc(sizeof(*work));
+        if (!work)
+        {
+            tidesdb_unified_flush_split_inline(db, unified_sl, &splits[i], barrier);
+            continue;
+        }
+
+        work->cf = splits[i].cf;
+        work->imm = NULL;
+        work->sst_id = 0;
+        work->unified_sl = unified_sl;
+        work->unified_cf_index = splits[i].cf_index;
+        work->unified_entry_count = splits[i].entry_count;
+        work->unified_barrier = barrier;
+
+        atomic_fetch_add_explicit(&db->flush_pending_count, 1, memory_order_release);
+        atomic_fetch_add_explicit(&splits[i].cf->flush_pending_count, 1, memory_order_release);
+        if (queue_enqueue(db->flush_queue, work) != 0)
+        {
+            /* the worker never saw this item, so undo the pending counts and do the write here */
+            atomic_fetch_sub_explicit(&db->flush_pending_count, 1, memory_order_release);
+            atomic_fetch_sub_explicit(&splits[i].cf->flush_pending_count, 1, memory_order_release);
+            free(work);
+            tidesdb_unified_flush_split_inline(db, unified_sl, &splits[i], barrier);
+        }
+    }
+
+    free(splits);
+    return phase1_result;
+}
+
+/**
+ * tidesdb_unified_wal_group_sync
+ * group commit -- coalesce the fdatasync of concurrent committers on the unified WAL. one
+ * committer (the leader) fdatasyncs the WAL once, making every committer whose bytes were
+ * already written durable; the rest wait for it instead of each issuing their own fsync.
+ * durability is preserved -- a commit returns only once the WAL is fdatasync'd past its end
+ * offset. durable progress is tracked per WAL (on the block manager), so a rotation that
+ * swaps the active WAL cannot make a new-WAL committer see old-WAL durability.
+ * @param db database instance (owns the wal_group_sync_lock / wal_group_sync_cond pair, which
+ *           lives on db->unified_mt and not on the individual WAL)
+ * @param wal the committer's pinned unified WAL block manager (carries the group_sync_active
+ *            flag and the group_durable_size / current_file_size counters)
+ * @param my_end the WAL offset that must be durable before this commit returns
+ * @return 0 on success, -1 if the fdatasync failed. -1 is returned only to the thread that
+ *         acted as leader for the failing fsync; waiters woken by its broadcast re-check the
+ *         loop condition and may take over as leader themselves
+ */
+static int tidesdb_unified_wal_group_sync(tidesdb_t *db, block_manager_t *wal, uint64_t my_end)
+{
+    /* fast path -- a recent leader already flushed past us. checked without taking the lock */
+    if (atomic_load_explicit(&wal->group_durable_size, memory_order_acquire) >= my_end) return 0;
+
+    pthread_mutex_lock(&db->unified_mt.wal_group_sync_lock);
+    while (atomic_load_explicit(&wal->group_durable_size, memory_order_relaxed) < my_end)
+    {
+        if (wal->group_sync_active)
+        {
+            /* follower -- wait for the in-flight leader's fsync to publish. the loop condition
+             * is re-evaluated after every wakeup */
+            pthread_cond_wait(&db->unified_mt.wal_group_sync_cond,
+                              &db->unified_mt.wal_group_sync_lock);
+            continue;
+        }
+
+        /* leader -- capture the high-water, fsync once, publish. flush_to is read before the
+         * fsync starts, and the lock is released for the duration of the fsync so other
+         * committers can enter and wait as followers */
+        wal->group_sync_active = 1;
+        const uint64_t flush_to =
+            atomic_load_explicit(&wal->current_file_size, memory_order_acquire);
+        pthread_mutex_unlock(&db->unified_mt.wal_group_sync_lock);
+
+        const int rc = block_manager_escalate_fsync(wal);
+
+        pthread_mutex_lock(&db->unified_mt.wal_group_sync_lock);
+        if (rc == 0 &&
+            flush_to > atomic_load_explicit(&wal->group_durable_size, memory_order_relaxed))
+            atomic_store_explicit(&wal->group_durable_size, flush_to, memory_order_release);
+        wal->group_sync_active = 0;
+        pthread_cond_broadcast(&db->unified_mt.wal_group_sync_cond);
+        if (rc != 0)
+        {
+            pthread_mutex_unlock(&db->unified_mt.wal_group_sync_lock);
+            return -1;
+        }
+    }
+    pthread_mutex_unlock(&db->unified_mt.wal_group_sync_lock);
+    return 0;
+}
+
+/**
+ * tidesdb_unified_memtable_rotate
+ * rotate the unified active memtable -- push current to immutable queue, create new active
+ * caller must hold db->unified_mt.is_flushing CAS admission (set to 1)
+ * @param db database instance
+ * @return TDB_SUCCESS on success, error code on failure
+ */
+static int
tidesdb_unified_memtable_rotate(tidesdb_t *db) +{ + tidesdb_memtable_t *old_mt = atomic_load_explicit(&db->unified_mt.active, memory_order_acquire); + if (!old_mt) return TDB_ERR_UNKNOWN; + + const uint64_t new_gen = + atomic_fetch_add_explicit(&db->unified_mt.wal_generation, 1, memory_order_relaxed) + 1; + + /* we resolve skip list config with defaults */ + const int umt_max_level = db->config.unified_memtable_skip_list_max_level > 0 + ? db->config.unified_memtable_skip_list_max_level + : TDB_SKIP_LIST_MAX_LEVEL; + const float umt_probability = db->config.unified_memtable_skip_list_probability > 0.0f + ? db->config.unified_memtable_skip_list_probability + : TDB_SKIP_LIST_PROBABILITY; + /* the unified WAL is opened without block-manager self-sync; durability is owned by + * the commit-path group fsync (FULL) or the sync worker (INTERVAL) */ + const int umt_sync_mode = BLOCK_MANAGER_SYNC_NONE; + + skip_list_t *new_sl = NULL; + if (skip_list_new_with_arena(&new_sl, umt_max_level, umt_probability, + skip_list_comparator_memcmp, NULL, &db->cached_current_time, + db->unified_mt.write_buffer_size * 2) != 0) + { + return TDB_ERR_MEMORY; + } + + char uwal_path[TDB_MAX_PATH_LEN]; + snprintf(uwal_path, sizeof(uwal_path), + "%s" PATH_SEPARATOR TDB_UNIFIED_WAL_PREFIX TDB_U64_FMT TDB_WAL_EXT, db->db_path, + TDB_U64_CAST(new_gen)); + + block_manager_t *new_wal = NULL; + if (block_manager_open(&new_wal, uwal_path, umt_sync_mode) != 0 || + block_manager_truncate(new_wal) != 0) + { + if (new_wal) block_manager_close(new_wal); + skip_list_free(new_sl); + return TDB_ERR_IO; + } + + /* we sync db directory to persist new unified WAL file entry */ + tdb_sync_directory(db->db_path); + + tidesdb_memtable_t *new_mt = malloc(sizeof(tidesdb_memtable_t)); + if (!new_mt) + { + block_manager_close(new_wal); + skip_list_free(new_sl); + return TDB_ERR_MEMORY; + } + new_mt->skip_list = new_sl; + new_mt->wal = new_wal; + new_mt->id = 0; + new_mt->generation = new_gen; + 
atomic_init(&new_mt->refcount, 1); + atomic_init(&new_mt->writers, 0); + atomic_init(&new_mt->flushed, 0); + + /* we swap active, now old becomes immutable */ + atomic_store_explicit(&db->unified_mt.active, new_mt, memory_order_release); + + /* we enqueue old to immutable queue (for read path scanning) */ + queue_enqueue(db->unified_mt.immutables, old_mt); + + /* we enqueue flush work item with cf=NULL to signal unified flush */ + tidesdb_flush_work_t *uwork = malloc(sizeof(tidesdb_flush_work_t)); + if (uwork) + { + uwork->cf = NULL; /* NULL cf signals unified flush */ + uwork->imm = old_mt; + uwork->sst_id = new_gen; + uwork->unified_sl = NULL; + uwork->unified_barrier = NULL; + atomic_fetch_add_explicit(&db->flush_pending_count, 1, memory_order_release); + if (queue_enqueue(db->flush_queue, uwork) != 0) + { + free(uwork); + atomic_fetch_sub_explicit(&db->flush_pending_count, 1, memory_order_release); + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to enqueue unified flush work"); + } + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Unified memtable rotated (gen=%" PRIu64 ", WAL=%s)", new_gen, + uwal_path); + + /* we reset WAL sync tracker since the new WAL starts empty */ + db->last_wal_sync_size = 0; + + return TDB_SUCCESS; +} + +int tidesdb_txn_commit(tidesdb_txn_t *txn) +{ + if (!txn || txn->is_committed || txn->is_aborted) return TDB_ERR_INVALID_ARGS; + + /* validate */ + if (txn->num_ops > 0) + { + if (txn->num_cfs <= 0 || txn->num_ops > TDB_MAX_TXN_OPS) return TDB_ERR_INVALID_ARGS; + } + + /* read-only fast path */ + if (txn->num_ops == 0 && txn->isolation_level < TDB_ISOLATION_REPEATABLE_READ) + { + txn->is_committed = 1; + return TDB_SUCCESS; + } + + /*** we skip all conflict checks for READ_UNCOMMITTED and READ_COMMITTED + ** read conflicts require REPEATABLE_READ+, write conflicts require SNAPSHOT+, + * SSI conflicts require SERIALIZABLE -- none apply at lower isolation levels */ + int result; + if (txn->isolation_level > TDB_ISOLATION_READ_COMMITTED) + { + result = 
tidesdb_txn_check_read_conflicts(txn); + if (result != TDB_SUCCESS) return result; + + result = tidesdb_txn_check_write_conflicts(txn); + if (result != TDB_SUCCESS) return result; + + result = tidesdb_txn_check_ssi_conflicts(txn); + if (result != TDB_SUCCESS) return result; + } + + txn->commit_seq = atomic_fetch_add_explicit(&txn->db->global_seq, 1, memory_order_relaxed); + tidesdb_commit_status_mark(txn->db->commit_status, txn->commit_seq, + TDB_COMMIT_STATUS_IN_PROGRESS); + + /* with the unified path, we do single WAL + single skip list */ + if (txn->db->unified_mt.enabled) + { + for (int cf_idx = 0; cf_idx < txn->num_cfs; cf_idx++) + { + result = tidesdb_apply_backpressure(txn->cfs[cf_idx]); + if (result != TDB_SUCCESS) return result; + } + + /* we load + try_ref + revalidate active so a rotation that fires between our load + * and try_ref cannot leave us holding a retired memtable. without the revalidate + * the flush worker can race ahead and close umt->wal under our feet */ + tidesdb_memtable_t *umt = NULL; + int umt_attempts = 0; + for (;;) + { + if (!tidesdb_active_memtable_try_ref(&txn->db->unified_mt.active_mt_readers, + &txn->db->unified_mt.active, &umt)) + { + if (++umt_attempts >= TDB_ACTIVE_REF_MAX_ATTEMPTS) return TDB_ERR_UNKNOWN; + continue; + } + /* mark this writer in-flight before the revalidate, mirroring the + * try_ref order, so a flush worker that drains writers cannot miss a + * writer that has already committed to mutating this memtable. 
the + * flush worker drains writers rather than refcount so readers cannot + * stall it -- see tidesdb_unified_flush_immutable */ + atomic_fetch_add_explicit(&umt->writers, 1, memory_order_acq_rel); + if (umt == atomic_load_explicit(&txn->db->unified_mt.active, memory_order_acquire)) + break; + atomic_fetch_sub_explicit(&umt->writers, 1, memory_order_release); + atomic_fetch_sub_explicit(&umt->refcount, 1, memory_order_release); + if (++umt_attempts >= TDB_ACTIVE_REF_MAX_ATTEMPTS) return TDB_ERR_UNKNOWN; + } + + /* we serialize unified WAL batch */ + uint8_t uwal_stack_buf[TDB_WAL_STACK_BUFFER_SIZE]; + size_t uwal_size = 0; + uint8_t *uwal_batch = tidesdb_txn_serialize_wal_unified(txn, &uwal_size, uwal_stack_buf, + sizeof(uwal_stack_buf)); + if (!uwal_batch && uwal_size > 0) + { + atomic_fetch_sub_explicit(&umt->writers, 1, memory_order_release); + atomic_fetch_sub_explicit(&umt->refcount, 1, memory_order_release); + return TDB_ERR_MEMORY; + } + + /** we write to unified WAL using raw write to avoid malloc/memcpy/free + * per commit. the wal_batch buffer (stack or heap) is written directly. */ + if (uwal_batch && umt->wal) + { + int64_t wal_result = block_manager_write_raw(umt->wal, uwal_batch, (uint32_t)uwal_size); + if (wal_result < 0) + { + if (uwal_batch != uwal_stack_buf) free(uwal_batch); + atomic_fetch_sub_explicit(&umt->writers, 1, memory_order_release); + atomic_fetch_sub_explicit(&umt->refcount, 1, memory_order_release); + return TDB_ERR_IO; + } + } + + if (uwal_batch && uwal_batch != uwal_stack_buf) free(uwal_batch); + + /* group-commit durability -- one fdatasync per batch of concurrent committers. + * runs while writers is still held so a rotation cannot swap this WAL out from under + * us. only when configured FULL; INTERVAL is handled by the sync worker, NONE skips. 
*/ + if (txn->db->config.unified_memtable_sync_mode == TDB_SYNC_FULL && umt->wal) + { + const uint64_t my_end = + atomic_load_explicit(&umt->wal->current_file_size, memory_order_acquire); + if (tidesdb_unified_wal_group_sync(txn->db, umt->wal, my_end) != 0) + { + atomic_fetch_sub_explicit(&umt->writers, 1, memory_order_release); + atomic_fetch_sub_explicit(&umt->refcount, 1, memory_order_release); + return TDB_ERR_IO; + } + } + + /* sync-on-commit WAL upload for RPO=0 replication */ + if (txn->db->object_store && txn->db->config.object_store_config && + txn->db->config.object_store_config->wal_sync_on_commit && umt->wal) + { + tdb_objstore_upload_file_sync(txn->db, umt->wal->file_path); + } + + /* we apply ops to unified skip list with prefixed keys */ + result = tidesdb_txn_apply_ops_to_unified_memtable(txn, umt->skip_list); + if (result != TDB_SUCCESS) + { + atomic_fetch_sub_explicit(&umt->writers, 1, memory_order_release); + atomic_fetch_sub_explicit(&umt->refcount, 1, memory_order_release); + return result; + } + + /* we check if unified memtable needs rotation */ + const size_t umt_size = (size_t)skip_list_get_size(umt->skip_list); + atomic_fetch_sub_explicit(&umt->writers, 1, memory_order_release); + atomic_fetch_sub_explicit(&umt->refcount, 1, memory_order_release); + + if (umt_size >= txn->db->unified_mt.write_buffer_size) + { + /** CAS-based admission, only one thread enters rotation at a time + * same lock-free pattern as per-CF flush in tidesdb_flush_memtable_internal */ + int expected = 0; + if (atomic_compare_exchange_strong_explicit(&txn->db->unified_mt.is_flushing, &expected, + 1, memory_order_acquire, + memory_order_relaxed)) + { + /* we re-check under CAS (another thread may have rotated before us) */ + tidesdb_memtable_t *cur = + atomic_load_explicit(&txn->db->unified_mt.active, memory_order_acquire); + if (cur == umt) + { + int rot_rc = tidesdb_unified_memtable_rotate(txn->db); + if (rot_rc != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, 
"Unified memtable rotation failed (error %d)", + rot_rc); + } + } + atomic_store_explicit(&txn->db->unified_mt.is_flushing, 0, memory_order_release); + } + } + + txn->is_committed = 1; + atomic_thread_fence(memory_order_seq_cst); + tidesdb_commit_status_mark(txn->db->commit_status, txn->commit_seq, + TDB_COMMIT_STATUS_COMMITTED); + tidesdb_txn_remove_from_active_list(txn); + + /* we invoke commit hooks */ + for (int cf_idx = 0; cf_idx < txn->num_cfs; cf_idx++) + { + tidesdb_column_family_t *cf = txn->cfs[cf_idx]; + if (!cf || !cf->config.commit_hook_fn) continue; + + int hook_op_count = 0; + for (int i = 0; i < txn->num_ops; i++) + { + if (txn->ops[i].cf == cf) hook_op_count++; + } + if (hook_op_count == 0) continue; + + tidesdb_commit_op_t stack_hook_ops[TDB_STACK_COMMIT_HOOK_OPS]; + tidesdb_commit_op_t *hook_ops = + hook_op_count <= TDB_STACK_COMMIT_HOOK_OPS + ? stack_hook_ops + : malloc(hook_op_count * sizeof(tidesdb_commit_op_t)); + if (!hook_ops) continue; + + int idx = 0; + for (int i = 0; i < txn->num_ops; i++) + { + const tidesdb_txn_op_t *op = &txn->ops[i]; + if (op->cf != cf) continue; + hook_ops[idx].key = op->key; + hook_ops[idx].key_size = op->key_size; + hook_ops[idx].value = op->value; + hook_ops[idx].value_size = op->value_size; + hook_ops[idx].ttl = op->ttl; + hook_ops[idx].is_delete = op->is_delete; + idx++; + } + cf->config.commit_hook_fn(hook_ops, hook_op_count, txn->commit_seq, + cf->config.commit_hook_ctx); + if (hook_ops != stack_hook_ops) free(hook_ops); + } + + return TDB_SUCCESS; + } + + /* stack-allocate for common case (≤N CFs) to avoid malloc/free per transaction */ +#define TDB_TXN_COMMIT_STACK_CFS 4 + tidesdb_memtable_t *stack_memtables[TDB_TXN_COMMIT_STACK_CFS]; + skip_list_t *stack_skiplists[TDB_TXN_COMMIT_STACK_CFS]; + const size_t alloc_size = txn->num_cfs > 0 ? 
txn->num_cfs : 1; + const int use_stack_cf = ((int)alloc_size <= TDB_TXN_COMMIT_STACK_CFS); + tidesdb_memtable_t **cf_memtables; + skip_list_t **cf_skiplists; + + if (use_stack_cf) + { + cf_memtables = stack_memtables; + cf_skiplists = stack_skiplists; + memset(cf_memtables, 0, alloc_size * sizeof(tidesdb_memtable_t *)); + memset(cf_skiplists, 0, alloc_size * sizeof(skip_list_t *)); + } + else + { + cf_memtables = calloc(alloc_size, sizeof(tidesdb_memtable_t *)); + cf_skiplists = calloc(alloc_size, sizeof(skip_list_t *)); + if (!cf_memtables || !cf_skiplists) + { + free(cf_memtables); + free(cf_skiplists); + return TDB_ERR_MEMORY; + } + } + + /* we apply backpressure before acquiring any memtable reference. a writer that + * stalls in apply_backpressure must not hold a memtable writers/refcount -- + * the flush worker drains an immutable's writers before flushing it, so a + * stalled writer holding a rotated memtable would block the flush, the flush + * would never drain the immutable queue, and the stall would never clear */ + for (int cf_idx = 0; cf_idx < txn->num_cfs; cf_idx++) + { + result = tidesdb_apply_backpressure(txn->cfs[cf_idx]); + if (result != TDB_SUCCESS) + { + if (!use_stack_cf) + { + free(cf_memtables); + free(cf_skiplists); + } + return result; + } + } + + /****** we use a single loop for WAL write + memtable apply to close the race window + ***** where another thread could flush the memtable between WAL write and op apply. + **** previously two separate loops meant ops for CF[1] could be applied to an + *** immutable memtable whose flush worker already finished reading the skip list, + ** causing committed data loss. ref release and flush trigger are deferred to a + * second pass to avoid triggering flushes while holding refs to other CFs. */ + for (int cf_idx = 0; cf_idx < txn->num_cfs; cf_idx++) + { + tidesdb_column_family_t *cf = txn->cfs[cf_idx]; + + /*** we load + try_ref + writers-bump + revalidate the active memtable. 
+ ** try_ref (CAS) refuses a memtable already claimed for cleanup. the + * writers bump marks this commit in-flight before the revalidate so the + * flush worker, which drains writers, cannot miss a writer that has + * committed to mutating this memtable. the seq_cst fence pairs with the + * one in tidesdb_flush_memtable_internal after it publishes the new + * active, so a memtable rotated under us is abandoned -- we retry on the + * new active rather than mutating a skip list the flush worker has + * already started reading. */ + tidesdb_memtable_t *mt = NULL; + int acquire_attempts = 0; + for (;;) + { + if (tidesdb_active_memtable_try_ref(&cf->active_mt_readers, &cf->active_memtable, &mt)) + { + atomic_fetch_add_explicit(&mt->writers, 1, memory_order_acq_rel); + atomic_thread_fence(memory_order_seq_cst); + /* a rename or drop that set marked_for_deletion drains writers + * before closing this cf's WAL. the seq_cst fence above pairs + * with the one the ddl runs before its drain, so if we miss the + * flag the ddl is guaranteed to see our writers bump and wait. 
+ * backing off here keeps the drain bounded and stops us writing + * through a WAL handle the ddl is about to close */ + if (atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire)) + { + atomic_fetch_sub_explicit(&mt->writers, 1, memory_order_release); + atomic_fetch_sub_explicit(&mt->refcount, 1, memory_order_release); + result = TDB_ERR_NOT_FOUND; + goto cleanup; + } + if (mt == atomic_load_explicit(&cf->active_memtable, memory_order_acquire)) break; + atomic_fetch_sub_explicit(&mt->writers, 1, memory_order_release); + atomic_fetch_sub_explicit(&mt->refcount, 1, memory_order_release); + } + if (++acquire_attempts >= TDB_ACTIVE_REF_MAX_ATTEMPTS) + { + /* active is rotating faster than we can latch it -- fail the + * commit; cleanup releases the memtables latched for earlier CFs */ + result = TDB_ERR_UNKNOWN; + goto cleanup; + } + } + cf_memtables[cf_idx] = mt; + cf_skiplists[cf_idx] = mt->skip_list; + + /* stack buffer for small WAL payloads; this essentially avoids malloc/free per txn */ + uint8_t wal_stack_buf[TDB_WAL_STACK_BUFFER_SIZE]; + size_t wal_size = 0; + uint8_t *wal_batch = + tidesdb_txn_serialize_wal(txn, cf, &wal_size, wal_stack_buf, sizeof(wal_stack_buf)); + + if (!wal_batch) + { + if (wal_size > 0) + { + goto cleanup_error_memory; + } + continue; + } + + const int wal_is_heap = (wal_batch != wal_stack_buf); + + block_manager_t *wal = mt ? mt->wal : NULL; + if (wal) + { + int64_t wal_result = block_manager_write_raw(wal, wal_batch, (uint32_t)wal_size); + if (wal_result < 0) + { + if (wal_is_heap) free(wal_batch); + goto cleanup_error_io; + } + } + + if (wal_is_heap) free(wal_batch); + + /****** we apply ops to memtable immediately after WAL write to ensure entries + ***** are visible in the skip list before any concurrent flush can read it. + **** this closes the race where another thread flushes this CF's memtable + *** between our WAL write and op apply, causing the flush worker to + ** serialize the skip list without our entries. 
*/ + if (mt) + { + result = tidesdb_txn_apply_ops_to_memtable(txn, cf, cf_skiplists[cf_idx]); + if (result != TDB_SUCCESS) + { + goto cleanup_error_result; + } + } + } + + /**** second pass is we release refs and trigger flushes. deferred from the first loop + *** because flush can block on backpressure and we don't want to hold refs + ** to other CFs' memtables while waiting. */ + for (int cf_idx = 0; cf_idx < txn->num_cfs; cf_idx++) + { + tidesdb_memtable_t *mt = cf_memtables[cf_idx]; + if (!mt) continue; + + tidesdb_column_family_t *cf = txn->cfs[cf_idx]; + skip_list_t *memtable = cf_skiplists[cf_idx]; + + const size_t memtable_size = (size_t)skip_list_get_size(memtable); + + /****** we use adaptive flush headroom based on L0 queue pressure and global memory pressure + ***** idle (queue empty) 50% headroom for max batching + **** moderate (1-2 pending) 25% headroom (proven baseline) + *** high (>=50% stall threshold) 0% headroom, flush immediately + ** global elevated+ 0% headroom, flush at exact write_buffer_size + * half_stall uses the multi-CF scaled effective stall so the tier boundary + * matches the threshold apply_backpressure enforces */ + const size_t l0_depth = queue_size(cf->immutable_memtables); + const size_t effective_stall = tdb_cf_effective_stall(cf); + const size_t half_stall = effective_stall / 2; + const int global_pressure = + cf->db ? 
atomic_load_explicit(&cf->db->memory_pressure_level, memory_order_relaxed) + : TDB_MEMORY_PRESSURE_NORMAL; + size_t flush_threshold; + if (global_pressure >= TDB_MEMORY_PRESSURE_ELEVATED || + (half_stall > 0 && l0_depth >= half_stall)) + { + flush_threshold = cf->config.write_buffer_size; + } + else if (l0_depth == 0) + { + flush_threshold = cf->config.write_buffer_size + (cf->config.write_buffer_size / 2); + } + else + { + flush_threshold = cf->config.write_buffer_size + (cf->config.write_buffer_size / 4); + } + const int needs_flush = (memtable_size >= flush_threshold); + + atomic_fetch_sub_explicit(&mt->writers, 1, memory_order_release); + atomic_fetch_sub_explicit(&mt->refcount, 1, memory_order_release); + cf_memtables[cf_idx] = NULL; /* mark as released */ + + if (needs_flush) + { + tidesdb_flush_memtable(cf); + } + } + + if (!use_stack_cf) + { + free(cf_memtables); + free(cf_skiplists); + } + + txn->is_committed = 1; + atomic_thread_fence(memory_order_seq_cst); + tidesdb_commit_status_mark(txn->db->commit_status, txn->commit_seq, + TDB_COMMIT_STATUS_COMMITTED); + tidesdb_txn_remove_from_active_list(txn); + + /*** we invoke commit hooks for each CF that has one registered + ** hooks fire after commit is fully durable (WAL + memtable + commit status) + * hook failure is logged but does not affect the commit result */ + for (int cf_idx = 0; cf_idx < txn->num_cfs; cf_idx++) + { + tidesdb_column_family_t *cf = txn->cfs[cf_idx]; + if (!cf || !cf->config.commit_hook_fn) continue; + + /* we count ops for this CF */ + int hook_op_count = 0; + for (int i = 0; i < txn->num_ops; i++) + { + if (txn->ops[i].cf == cf) hook_op_count++; + } + if (hook_op_count == 0) continue; + + /* we use stack allocation for common case (small txns) */ +#define TDB_COMMIT_HOOK_STACK_OPS 16 + tidesdb_commit_op_t stack_hook_ops[TDB_COMMIT_HOOK_STACK_OPS]; + tidesdb_commit_op_t *hook_ops; + const int hook_use_stack = (hook_op_count <= TDB_COMMIT_HOOK_STACK_OPS); + + if (hook_use_stack) + { + 
hook_ops = stack_hook_ops; + } + else + { + hook_ops = malloc(hook_op_count * sizeof(tidesdb_commit_op_t)); + if (!hook_ops) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Failed to allocate commit hook ops for CF '%s' (count=%d)", cf->name, + hook_op_count); + continue; + } + } + + int idx = 0; + for (int i = 0; i < txn->num_ops; i++) + { + const tidesdb_txn_op_t *op = &txn->ops[i]; + if (op->cf != cf) continue; + + hook_ops[idx].key = op->key; + hook_ops[idx].key_size = op->key_size; + hook_ops[idx].value = op->value; + hook_ops[idx].value_size = op->value_size; + hook_ops[idx].ttl = op->ttl; + hook_ops[idx].is_delete = op->is_delete; + idx++; + } + + const int hook_result = cf->config.commit_hook_fn(hook_ops, hook_op_count, txn->commit_seq, + cf->config.commit_hook_ctx); + if (hook_result != 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Commit hook for CF '%s' returned error %d (seq=%" PRIu64 ")", cf->name, + hook_result, txn->commit_seq); + } + + if (!hook_use_stack) free(hook_ops); + } + + return TDB_SUCCESS; + +cleanup_error_memory: + result = TDB_ERR_MEMORY; + goto cleanup; + +cleanup_error_io: + result = TDB_ERR_IO; + goto cleanup; + +cleanup_error_result: + /* result already set */ + goto cleanup; + +cleanup: + for (int i = 0; i < txn->num_cfs; i++) + { + if (cf_memtables[i]) + { + atomic_fetch_sub_explicit(&cf_memtables[i]->writers, 1, memory_order_release); + atomic_fetch_sub_explicit(&cf_memtables[i]->refcount, 1, memory_order_release); + } + } + if (!use_stack_cf) + { + free(cf_memtables); + free(cf_skiplists); + } + return result; +} + +int tidesdb_txn_savepoint(tidesdb_txn_t *txn, const char *name) +{ + if (!txn || !name || txn->is_committed || txn->is_aborted) return TDB_ERR_INVALID_ARGS; + + /* we check if savepoint with this name already exists */ + for (int i = 0; i < txn->num_savepoints; i++) + { + if (strcmp(txn->savepoint_names[i], name) == 0) + { + /** we update existing savepoint -- just record current counts + * ops array is append-only so this is all we 
need */ + txn->savepoint_op_counts[i] = txn->num_ops; + txn->savepoint_cf_counts[i] = txn->num_cfs; + return TDB_SUCCESS; + } + } + + if (txn->num_savepoints >= txn->savepoints_capacity) + { + const int new_capacity = txn->savepoints_capacity == 0 ? 4 : txn->savepoints_capacity * 2; + int *new_op_counts = realloc(txn->savepoint_op_counts, new_capacity * sizeof(int)); + int *new_cf_counts = realloc(txn->savepoint_cf_counts, new_capacity * sizeof(int)); + char **new_names = realloc(txn->savepoint_names, new_capacity * sizeof(char *)); + if (!new_op_counts || !new_cf_counts || !new_names) + { + /* we only update pointers that succeeded */ + if (new_op_counts) txn->savepoint_op_counts = new_op_counts; + if (new_cf_counts) txn->savepoint_cf_counts = new_cf_counts; + if (new_names) txn->savepoint_names = new_names; + return TDB_ERR_MEMORY; + } + txn->savepoint_op_counts = new_op_counts; + txn->savepoint_cf_counts = new_cf_counts; + txn->savepoint_names = new_names; + txn->savepoints_capacity = new_capacity; + } + + /** we record current op/cf counts as the savepoint checkpoint + * since ops are append-only, rollback just truncates back to this point */ + txn->savepoint_op_counts[txn->num_savepoints] = txn->num_ops; + txn->savepoint_cf_counts[txn->num_savepoints] = txn->num_cfs; + txn->savepoint_names[txn->num_savepoints] = tdb_strdup(name); + if (!txn->savepoint_names[txn->num_savepoints]) + { + return TDB_ERR_MEMORY; + } + txn->num_savepoints++; + + return TDB_SUCCESS; +} + +int tidesdb_txn_rollback_to_savepoint(tidesdb_txn_t *txn, const char *name) +{ + if (!txn || !name || txn->num_savepoints == 0 || txn->is_committed || txn->is_aborted) + return TDB_ERR_INVALID_ARGS; + + int savepoint_idx = -1; + for (int i = 0; i < txn->num_savepoints; i++) + { + if (strcmp(txn->savepoint_names[i], name) == 0) + { + savepoint_idx = i; + break; + } + } + + if (savepoint_idx == -1) return TDB_ERR_NOT_FOUND; + + const int saved_num_ops = txn->savepoint_op_counts[savepoint_idx]; + const 
int saved_num_cfs = txn->savepoint_cf_counts[savepoint_idx]; + + /* we free ops appended after the savepoint */ + int64_t freed_bytes = 0; + for (int i = saved_num_ops; i < txn->num_ops; i++) + { + freed_bytes += (int64_t)(txn->ops[i].key_size + txn->ops[i].value_size); + free(txn->ops[i].key); /* coalesced buffer owns key+value */ + } + txn->mem_bytes -= freed_bytes; + tidesdb_txn_mem_publish(txn); + + /* we truncate back to savepoint */ + txn->num_ops = saved_num_ops; + txn->num_cfs = saved_num_cfs; + + /* the last-cf cache may point at a cf that the truncation just dropped from + * cfs[0..num_cfs); clearing it forces add_cf_internal to rescan and re-register + * that cf on the next op instead of fast-pathing to an out-of-range index whose + * ops commit never iterates */ + txn->last_cf = NULL; + txn->last_cf_index = 0; + + /* we invalidate the write set hash since indices may now be stale */ + if (txn->write_set_hash) + { + tidesdb_write_set_hash_free((tidesdb_write_set_hash_t *)txn->write_set_hash); + txn->write_set_hash = NULL; + } + + /* we remove all savepoints from savepoint_idx onwards (invalidate later savepoints) */ + for (int i = savepoint_idx; i < txn->num_savepoints; i++) + { + free(txn->savepoint_names[i]); + } + txn->num_savepoints = savepoint_idx; + + return TDB_SUCCESS; +} + +int tidesdb_txn_release_savepoint(tidesdb_txn_t *txn, const char *name) +{ + if (!txn || !name || txn->num_savepoints == 0 || txn->is_committed || txn->is_aborted) + return TDB_ERR_INVALID_ARGS; + + /* we find savepoint by name */ + int savepoint_idx = -1; + for (int i = 0; i < txn->num_savepoints; i++) + { + if (strcmp(txn->savepoint_names[i], name) == 0) + { + savepoint_idx = i; + break; + } + } + + if (savepoint_idx == -1) return TDB_ERR_NOT_FOUND; + + /* we free the savepoint name without rolling back */ + free(txn->savepoint_names[savepoint_idx]); + + /* we shift remaining savepoints down */ + for (int i = savepoint_idx; i < txn->num_savepoints - 1; i++) + { + 
txn->savepoint_op_counts[i] = txn->savepoint_op_counts[i + 1]; + txn->savepoint_cf_counts[i] = txn->savepoint_cf_counts[i + 1]; + txn->savepoint_names[i] = txn->savepoint_names[i + 1]; + } + txn->num_savepoints--; + + return TDB_SUCCESS; +} + +/** + * tidesdb_iter_kv_visible + * check if a KV pair should be visible to the iterator based on: + * isolation level + * TTL expiration + * tombstone flag + * @param iter iterator + * @param kv KV pair + * @return 1 if visible, 0 if should be skipped, -1 if tombstone (skip all versions of this key) + */ +static int tidesdb_iter_kv_visible(tidesdb_iter_t *iter, tidesdb_kv_pair_t *kv) +{ + if (!iter || !kv) return 0; + + /*** we check sequence visibility first (before tombstone check) + ** entries from our own transaction write buffer use seq=UINT64_MAX + * these are always visible to the owning transaction (read-your-own-writes) */ + const int seq_visible = (kv->entry.seq == UINT64_MAX) || (kv->entry.seq <= iter->cf_snapshot); + + if (!seq_visible) + { + return 0; /* not visible due to isolation level */ + } + + /** we now check if it's a tombstone -- if visible tombstone, return -1 to signal + * that all versions of this key should be skipped */ + if (kv->entry.flags & TDB_KV_FLAG_TOMBSTONE) + { + return -1; /* tombstone -- we skip all versions of this key */ + } + + if (kv->entry.ttl > 0 && kv->entry.ttl < iter->snapshot_time) + { + return 0; + } + + return 1; +} + +int tidesdb_iter_new(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, tidesdb_iter_t **iter) +{ + if (!txn || !cf || !iter) return TDB_ERR_INVALID_ARGS; + + const int cf_index = tidesdb_txn_add_cf_internal(txn, cf); + if (cf_index < 0) return TDB_ERR_MEMORY; + + *iter = calloc(1, sizeof(tidesdb_iter_t)); + if (!*iter) return TDB_ERR_MEMORY; + + (*iter)->cf = cf; + (*iter)->txn = txn; + (*iter)->valid = 0; + (*iter)->direction = 0; + (*iter)->snapshot_time = atomic_load(&txn->db->cached_current_time); + (*iter)->cached_sources = NULL; + 
(*iter)->num_cached_sources = 0; + (*iter)->cached_sources_capacity = 0; + + /* we create merge heap for this CF */ + skip_list_comparator_fn comparator_fn = NULL; + void *comparator_ctx = NULL; + tidesdb_resolve_comparator(cf->db, &cf->config, &comparator_fn, &comparator_ctx); + + (*iter)->heap = tidesdb_merge_heap_create(comparator_fn, comparator_ctx); + if (!(*iter)->heap) + { + free(*iter); + return TDB_ERR_MEMORY; + } + + /* we enable double-buffered pop arena to avoid malloc during borrowed KV + * materialization in merge_heap_pop. each buffer holds one materialized + * result; the iterator toggles between them so prev and current never + * share the same slot. */ + (*iter)->heap->pop_buf[0] = malloc(TDB_MERGE_POP_BUF_INITIAL_CAP); + (*iter)->heap->pop_buf[1] = malloc(TDB_MERGE_POP_BUF_INITIAL_CAP); + (*iter)->heap->pop_buf_cap[0] = (*iter)->heap->pop_buf[0] ? TDB_MERGE_POP_BUF_INITIAL_CAP : 0; + (*iter)->heap->pop_buf_cap[1] = (*iter)->heap->pop_buf[1] ? TDB_MERGE_POP_BUF_INITIAL_CAP : 0; + (*iter)->heap->pop_buf_slot = 0; + + size_t imm_count = 0; + tidesdb_immutable_memtable_t **imm_snapshot = + tidesdb_snapshot_immutable_memtables(cf, &imm_count); + + /*** we pin the active memtable to prevent use-after-free if rotation + + * flush races between our load and merge_source_from_memtable's ref. + ** the helper bumps active_mt_readers across the load + try_ref so the + *** cleanup loop cannot free the struct between them. */ + tidesdb_memtable_t *active_mt_struct = NULL; + if (!tidesdb_active_memtable_try_ref(&cf->active_mt_readers, &cf->active_memtable, + &active_mt_struct)) + { + /* rotation raced with our load, we retry once */ + (void)tidesdb_active_memtable_try_ref(&cf->active_mt_readers, &cf->active_memtable, + &active_mt_struct); + } + skip_list_t *active_mt = + (active_mt_struct && active_mt_struct->skip_list) ? 
active_mt_struct->skip_list : NULL; + + /* we ensure consistent view */ + atomic_thread_fence(memory_order_acquire); + + if (txn->isolation_level == TDB_ISOLATION_READ_COMMITTED) + { + uint64_t current_seq = atomic_load_explicit(&cf->db->global_seq, memory_order_acquire); + (*iter)->cf_snapshot = (current_seq > 0) ? current_seq - 1 : 0; + } + else + { + (*iter)->cf_snapshot = txn->snapshot_seq; + } + + const int has_unified = txn->db->unified_mt.enabled ? 1 : 0; + + /* snapshot unified_mt.immutables under its rdlock so a rotation that fires between + * our size-read and our walk cannot leave the newest immutable invisible. the size + * is stable while we hold the lock; allocating inside the lock is brief */ + tidesdb_memtable_t *unified_imm_stack[TDB_STACK_IMM_SNAPSHOT]; + tidesdb_memtable_t **unified_imm_snap = unified_imm_stack; + size_t unified_imm_snap_count = 0; + if (has_unified && txn->db->unified_mt.immutables) + { + queue_t *uimm_q = txn->db->unified_mt.immutables; + pthread_rwlock_rdlock(&uimm_q->read_lock); + const size_t actual = atomic_load_explicit(&uimm_q->size, memory_order_relaxed); + if (actual > 0) + { + if (actual > TDB_STACK_IMM_SNAPSHOT) + { + tidesdb_memtable_t **heap_arr = malloc(actual * sizeof(tidesdb_memtable_t *)); + if (heap_arr) unified_imm_snap = heap_arr; + } + const size_t cap = + (unified_imm_snap == unified_imm_stack) ? TDB_STACK_IMM_SNAPSHOT : actual; + queue_node_t *cur = uimm_q->head->next; + for (; cur != NULL && unified_imm_snap_count < cap; cur = cur->next) + { + /* we pin each immutable so a concurrent flush-worker eviction + * cannot free it before the merge source takes its own ref */ + tidesdb_memtable_t *uimm = (tidesdb_memtable_t *)cur->data; + unified_imm_snap[unified_imm_snap_count++] = + tidesdb_memtable_try_ref(uimm) ? uimm : NULL; + } + } + pthread_rwlock_unlock(&uimm_q->read_lock); + } + + const int mt_capacity = + 2 + (int)imm_count + (txn->num_ops > 0 ? 
1 : 0) + has_unified + (int)unified_imm_snap_count; + (*iter)->cached_mt_sources = malloc(mt_capacity * sizeof(tidesdb_merge_source_t *)); + (*iter)->num_cached_mt_sources = 0; + + if ((*iter)->cached_mt_sources) + { + tidesdb_merge_source_t *memtable_source = NULL; + if (active_mt_struct && active_mt) + { + memtable_source = + tidesdb_merge_source_from_memtable(active_mt, &cf->config, active_mt_struct); + } + /* release our try_ref pin -- merge_source_from_memtable took its own ref */ + if (active_mt_struct) tidesdb_immutable_memtable_unref(active_mt_struct); + + if (memtable_source) + { + memtable_source->is_cached = 1; + ((tidesdb_merge_source_t **)(*iter) + ->cached_mt_sources)[(*iter)->num_cached_mt_sources++] = memtable_source; + + if (memtable_source->current_kv != NULL) + { + tidesdb_merge_heap_add_source((*iter)->heap, memtable_source); + } + } + + /***** in unified memtable mode, we add the shared skip list as a merge source + **** with CF-prefix filtering so iterator only sees this CF's entries. + *** we use try_ref to safely pin the unified memtable before creating the + ** cursor, preventing use-after-free if the memtable rotates between our + * atomic_load and the source creation's internal ref call. 
*/ + if (txn->db->unified_mt.enabled) + { + tidesdb_memtable_t *umt = NULL; + if (!tidesdb_active_memtable_try_ref(&txn->db->unified_mt.active_mt_readers, + &txn->db->unified_mt.active, &umt)) + { + /* we retry once if rotation raced with our load */ + (void)tidesdb_active_memtable_try_ref(&txn->db->unified_mt.active_mt_readers, + &txn->db->unified_mt.active, &umt); + } + if (umt && umt->skip_list) + { + tidesdb_merge_source_t *unified_source = tidesdb_merge_source_from_unified_memtable( + umt->skip_list, &cf->config, umt, cf->unified_cf_index); + /* source creation adds its own ref via imm, we release our try_ref */ + atomic_fetch_sub_explicit(&umt->refcount, 1, memory_order_release); + if (unified_source) + { + unified_source->is_cached = 1; + ((tidesdb_merge_source_t **)(*iter) + ->cached_mt_sources)[(*iter)->num_cached_mt_sources++] = unified_source; + + if (unified_source->current_kv != NULL) + { + tidesdb_merge_heap_add_source((*iter)->heap, unified_source); + } + } + } + else if (umt) + { + /* try_ref succeeded but no skip_list, thus we release */ + atomic_fetch_sub_explicit(&umt->refcount, 1, memory_order_release); + } + + /* we add unified immutables (rotated but not yet fully flushed to per-cf + * sstables) so scans see the same data tidesdb_txn_get sees. 
without this + * keys committed shortly before a scan are invisible until the per-cf + * flush completes */ + for (size_t qi = 0; qi < unified_imm_snap_count; qi++) + { + tidesdb_memtable_t *uimm = unified_imm_snap[qi]; + if (!uimm) continue; + if (!uimm->skip_list || atomic_load_explicit(&uimm->flushed, memory_order_acquire)) + { + tidesdb_immutable_memtable_unref(uimm); + continue; + } + + tidesdb_merge_source_t *uimm_source = tidesdb_merge_source_from_unified_memtable( + uimm->skip_list, &cf->config, uimm, cf->unified_cf_index); + /* the merge source took its own ref on uimm; release our pin */ + tidesdb_immutable_memtable_unref(uimm); + if (!uimm_source) continue; + uimm_source->is_cached = 1; + ((tidesdb_merge_source_t **)(*iter) + ->cached_mt_sources)[(*iter)->num_cached_mt_sources++] = uimm_source; + + if (uimm_source->current_kv != NULL) + { + tidesdb_merge_heap_add_source((*iter)->heap, uimm_source); + } + } + } + + /** we add transaction write buffer as a merge source for read-your-own-ops + * this allows iterators to see uncommitted puts/deletes from the owning txn */ + if (txn->num_ops > 0) + { + tidesdb_merge_source_t *txn_ops_source = + tidesdb_merge_source_from_txn_ops(txn, cf, &cf->config); + if (txn_ops_source) + { + txn_ops_source->is_cached = 1; + ((tidesdb_merge_source_t **)(*iter) + ->cached_mt_sources)[(*iter)->num_cached_mt_sources++] = txn_ops_source; + + if (txn_ops_source->current_kv != NULL) + { + tidesdb_merge_heap_add_source((*iter)->heap, txn_ops_source); + } + } + } + + /* we add immutables from our snapshot */ + if (imm_snapshot) + { + for (size_t i = 0; i < imm_count; i++) + { + tidesdb_immutable_memtable_t *imm = imm_snapshot[i]; + if (imm && imm->skip_list) + { + tidesdb_merge_source_t *source = + tidesdb_merge_source_from_memtable(imm->skip_list, &cf->config, imm); + if (source) + { + source->is_cached = 1; + ((tidesdb_merge_source_t **)(*iter) + ->cached_mt_sources)[(*iter)->num_cached_mt_sources++] = source; + + if 
(source->current_kv != NULL) + { + tidesdb_merge_heap_add_source((*iter)->heap, source); + } + } + + tidesdb_immutable_memtable_unref(imm); + } + } + free(imm_snapshot); + } + + if (unified_imm_snap != unified_imm_stack) free(unified_imm_snap); + } + else + { + /* the fallback is to add directly to heap if mt cache alloc failed */ + tidesdb_merge_source_t *memtable_source = + tidesdb_merge_source_from_memtable(active_mt, &cf->config, active_mt_struct); + if (memtable_source && memtable_source->current_kv != NULL) + { + if (tidesdb_merge_heap_add_source((*iter)->heap, memtable_source) != TDB_SUCCESS) + tidesdb_merge_source_free(memtable_source); + } + else if (memtable_source) + tidesdb_merge_source_free(memtable_source); + + if (imm_snapshot) + { + for (size_t i = 0; i < imm_count; i++) + { + tidesdb_immutable_memtable_t *imm = imm_snapshot[i]; + if (imm && imm->skip_list) + { + tidesdb_merge_source_t *source = + tidesdb_merge_source_from_memtable(imm->skip_list, &cf->config, imm); + if (source && source->current_kv != NULL) + { + if (tidesdb_merge_heap_add_source((*iter)->heap, source) != TDB_SUCCESS) + tidesdb_merge_source_free(source); + } + else if (source) + tidesdb_merge_source_free(source); + tidesdb_immutable_memtable_unref(imm); + } + } + free(imm_snapshot); + } + + /* the cache-alloc fallback consumes no unified immutables -- release the + * pins taken during the snapshot */ + for (size_t qi = 0; qi < unified_imm_snap_count; qi++) + { + if (unified_imm_snap[qi]) tidesdb_immutable_memtable_unref(unified_imm_snap[qi]); + } + if (unified_imm_snap != unified_imm_stack) free(unified_imm_snap); + } + + int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + int ssts_capacity = TDB_STACK_SSTS; + tidesdb_sstable_t **ssts_array = malloc(ssts_capacity * sizeof(tidesdb_sstable_t *)); + int sst_count = 0; + + if (ssts_array) + { + /* we iterate through levels and take refs immediately to minimize race */ + for (int i = 0; i < 
num_levels; i++) + { + tidesdb_level_t *level = cf->levels[i]; + int level_retries = 0; + + retry_level:; + /** we load array pointer and count with careful ordering to handle concurrent + * modifications re-load count to detect concurrent remove, we use minimum to avoid OOB + */ + atomic_fetch_add_explicit(&level->array_readers, 1, memory_order_acq_rel); + + tidesdb_sstable_t **sstables = + atomic_load_explicit(&level->sstables, memory_order_acquire); + int num_ssts = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + + /* we re-load count to detect concurrent remove */ + int num_ssts_recheck = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + if (num_ssts_recheck < num_ssts) num_ssts = num_ssts_recheck; + + /* we verify array hasnt changed */ + tidesdb_sstable_t **sstables_check = + atomic_load_explicit(&level->sstables, memory_order_acquire); + if (sstables_check != sstables) + { + sstables = sstables_check; + num_ssts = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + } + + /* we track how many refs we had before this level to allow rollback on retry */ + const int sst_count_before_level = sst_count; + + /* we take refs on all sstables in this level immediately in tight loop + * this minimizes window where compaction could free the array */ + int need_retry = 0; + for (int j = 0; j < num_ssts; j++) + { + /*** we check if array changed before accessing, if so, our sstables pointer is + ** stale + */ + tidesdb_sstable_t **current_arr = + atomic_load_explicit(&level->sstables, memory_order_acquire); + if (current_arr != sstables) + { + /* the array was swapped, we release refs and retry with new array (bounded) */ + for (int k = sst_count_before_level; k < sst_count; k++) + { + tidesdb_sstable_unref(cf->db, ssts_array[k]); + } + sst_count = sst_count_before_level; + need_retry = 1; + break; + } + + tidesdb_sstable_t *sst = sstables[j]; + if (!sst) continue; + + if (sst_count >= ssts_capacity) + { + int 
new_capacity = ssts_capacity * 2; + tidesdb_sstable_t **new_array = + realloc(ssts_array, new_capacity * sizeof(tidesdb_sstable_t *)); + if (!new_array) + { + /* we cleanup refs taken so far */ + for (int k = 0; k < sst_count; k++) + { + tidesdb_sstable_unref(cf->db, ssts_array[k]); + } + free(ssts_array); + ssts_array = NULL; + break; + } + ssts_array = new_array; + ssts_capacity = new_capacity; + } + + /** we try to acquire reference to protect against concurrent deletion + * if try_ref fails, we check if array was swapped before deciding to retry */ + if (!tidesdb_sstable_try_ref(sst)) + { + tidesdb_sstable_t **current_ssts = + atomic_load_explicit(&level->sstables, memory_order_acquire); + + if (current_ssts != sstables) + { + /* array was swapped, we release refs and retry */ + for (int k = sst_count_before_level; k < sst_count; k++) + { + tidesdb_sstable_unref(cf->db, ssts_array[k]); + } + sst_count = sst_count_before_level; + need_retry = 1; + break; + } + + /* array unchanged, we skip dead sstable */ + continue; + } + ssts_array[sst_count++] = sst; + } + + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + + if (!ssts_array) break; /* allocation failed */ + if (need_retry) + { + if (level_retries < TDB_SST_RETRY_MAX_LEVEL_RETRIES) + { + level_retries++; + goto retry_level; + } + + /*** retries exhausted due to heavy concurrent compaction. we must take one + ** final pass that collects whatever ssts we can ref from the + * current array snapshot, ignoring further array swaps. a ref'd + * sst is always safe to read even after removal from the level. + ** skipping the level entirely would lose data that may not yet + *** appear in a lower level. 
*/ + atomic_fetch_add_explicit(&level->array_readers, 1, memory_order_acq_rel); + + sstables = atomic_load_explicit(&level->sstables, memory_order_acquire); + num_ssts = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + + for (int j = 0; j < num_ssts; j++) + { + tidesdb_sstable_t *sst = sstables[j]; + if (sst && tidesdb_sstable_try_ref(sst)) + { + tidesdb_sstable_t **new_arr = + realloc(ssts_array, (sst_count + 1) * sizeof(tidesdb_sstable_t *)); + if (!new_arr) + { + tidesdb_sstable_unref(cf->db, sst); + break; + } + ssts_array = new_arr; + ssts_array[sst_count++] = sst; + } + } + + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + } + } + } + + /* we cache sst sources for reuse across seeks */ + if (ssts_array) + { + (*iter)->cached_sources_capacity = sst_count; + (*iter)->cached_sources = malloc(sst_count * sizeof(tidesdb_merge_source_t *)); + if (!(*iter)->cached_sources) + { + for (int i = 0; i < sst_count; i++) + { + tidesdb_sstable_unref(cf->db, ssts_array[i]); + } + free(ssts_array); + tidesdb_merge_heap_free((*iter)->heap); + free(*iter); + return TDB_ERR_MEMORY; + } + + /*** we prefetch non-local sstable files in parallel before creating sources. + ** this downloads all frozen sstables concurrently so that the lazy source + * creation below finds files locally and avoids serial download stalls. */ + if (cf->db->object_store) + { + tdb_objstore_prefetch_sstables(cf->db, ssts_array, sst_count); + } + + /**** lazy sources defer first-block reads to seek time, which avoids + *** O(N) eager deserialize cost at iterator creation. this matters for + ** workloads that recreate iterators frequently (e.g. MariaDB index_read_map). 
*/ + for (int i = 0; i < sst_count; i++) + { + tidesdb_sstable_t *sst = ssts_array[i]; + + /* reader fd budget -- a full-scan iterator opens its entire source set at once, bounded + * by the max_open cap (clamp keeps it descriptor-safe); only a source set larger than + * max_open fails (a real fd limit). */ + if (!tidesdb_reader_fd_budget_ok(cf->db, sst)) + { + for (int k = i; k < sst_count; k++) tidesdb_sstable_unref(cf->db, ssts_array[k]); + free(ssts_array); + tidesdb_iter_free(*iter); + *iter = NULL; + return TDB_ERR_BUSY; + } + + tidesdb_merge_source_t *sst_source = + tidesdb_merge_source_from_sstable_lazy(cf->db, sst); + if (!sst_source) + { + /* could not open/build a source for this sstable (e.g. EMFILE under fd pressure). + * an iterator that silently omits an sstable returns wrong/incomplete results, so + * fail creation and let the caller retry once descriptors free. */ + for (int k = i; k < sst_count; k++) tidesdb_sstable_unref(cf->db, ssts_array[k]); + free(ssts_array); + tidesdb_iter_free(*iter); + *iter = NULL; + return TDB_ERR_IO; + } + + /* we mark as cached so it wont be freed when popped from heap */ + sst_source->is_cached = 1; + + /* we cache the source for reuse */ + (*iter)->cached_sources[(*iter)->num_cached_sources++] = sst_source; + + /* we add to heap if it has initial data */ + if (sst_source->current_kv != NULL) + { + if (tidesdb_merge_heap_add_source((*iter)->heap, sst_source) != TDB_SUCCESS) + { + /* source is still cached, just not in heap initially */ + } + } + + tidesdb_sstable_unref(cf->db, sst); + } + + free(ssts_array); + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_iter_rebuild_sst_cache + * rebuild cached sstable sources when sstable layout has changed. + * the merge heap is emptied and every previously cached sstable source is freed + * before the levels are walked again, so a failure return leaves the heap empty + * and the sst cache holding only the sources rebuilt up to that point. + * nothing is re-added to the heap here. + * @param iter the iterator + * @return TDB_SUCCESS on success (also when no sstable could be collected), + * TDB_ERR_MEMORY, TDB_ERR_BUSY or TDB_ERR_IO on failure + */ +static int tidesdb_iter_rebuild_sst_cache(tidesdb_iter_t *iter) +{ + tidesdb_column_family_t *cf = iter->cf; + + /* we clear heap first to remove references to cached sources */ + for
(int i = 0; i < iter->heap->num_sources; i++) + { + if (!iter->heap->sources[i]->is_cached) + { + tidesdb_merge_source_free(iter->heap->sources[i]); + } + } + iter->heap->num_sources = 0; + + /* we invalidate cached sources */ + for (int i = 0; i < iter->num_cached_sources; i++) + { + tidesdb_merge_source_free(iter->cached_sources[i]); + } + iter->num_cached_sources = 0; + + /* we collect all sstables with references */ + tidesdb_sstable_t **ssts_array = NULL; + int sst_count = 0; + const int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + for (int lvl = 0; lvl < num_levels; lvl++) + { + tidesdb_level_t *level = cf->levels[lvl]; + if (!level) continue; + int level_retries = 0; + + retry_level:; + atomic_fetch_add_explicit(&level->array_readers, 1, memory_order_acq_rel); /* reader count stays raised for as long as level->sstables is walked */ + + tidesdb_sstable_t **sstables = atomic_load_explicit(&level->sstables, memory_order_acquire); + int num_ssts = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + + const int num_ssts_recheck = + atomic_load_explicit(&level->num_sstables, memory_order_acquire); + if (num_ssts_recheck < num_ssts) num_ssts = num_ssts_recheck; /* the smaller of the two counts is the one that gets walked */ + + tidesdb_sstable_t **sstables_check = + atomic_load_explicit(&level->sstables, memory_order_acquire); + if (sstables_check != sstables) + { + sstables = sstables_check; + num_ssts = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + } + if (num_ssts == 0) + { + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + continue; + } + + const int sst_count_before_level = sst_count; + int need_retry = 0; + + for (int j = 0; j < num_ssts; j++) + { + tidesdb_sstable_t **current_arr = + atomic_load_explicit(&level->sstables, memory_order_acquire); + if (current_arr != sstables) + { + for (int k = sst_count_before_level; k < sst_count; k++) + tidesdb_sstable_unref(cf->db, ssts_array[k]); + sst_count = sst_count_before_level; + need_retry = 1; /* array pointer changed mid-walk: this level's refs were dropped, start it over */ + break; + } + + tidesdb_sstable_t *sst =
sstables[j]; + if (sst) + { + if (!tidesdb_sstable_try_ref(sst)) + { + tidesdb_sstable_t **current_ssts = + atomic_load_explicit(&level->sstables, memory_order_acquire); + + if (current_ssts != sstables) + { + for (int k = sst_count_before_level; k < sst_count; k++) + tidesdb_sstable_unref(cf->db, ssts_array[k]); + sst_count = sst_count_before_level; + need_retry = 1; + break; + } + + /* the array unchanged -- we skip dead sstable */ + continue; + } + + tidesdb_sstable_t **new_array = + realloc(ssts_array, (sst_count + 1) * sizeof(tidesdb_sstable_t *)); + if (!new_array) + { + tidesdb_sstable_unref(cf->db, sst); + for (int k = 0; k < sst_count; k++) + tidesdb_sstable_unref(cf->db, ssts_array[k]); + free(ssts_array); + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + return TDB_ERR_MEMORY; + } + ssts_array = new_array; + ssts_array[sst_count++] = sst; + } + } + + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + + if (need_retry) + { + if (level_retries < TDB_SST_RETRY_MAX_LEVEL_RETRIES) + { + level_retries++; + goto retry_level; + } /* retries exhausted: one final pass over the current array that keeps every sstable it can ref and no longer reacts to array swaps */ + + atomic_fetch_add_explicit(&level->array_readers, 1, memory_order_acq_rel); + + sstables = atomic_load_explicit(&level->sstables, memory_order_acquire); + num_ssts = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + + for (int j = 0; j < num_ssts; j++) + { + tidesdb_sstable_t *sst = sstables[j]; + if (sst && tidesdb_sstable_try_ref(sst)) + { + tidesdb_sstable_t **new_array = + realloc(ssts_array, (sst_count + 1) * sizeof(tidesdb_sstable_t *)); + if (!new_array) + { + tidesdb_sstable_unref(cf->db, sst); + break; /* NOTE(review): allocation failure is silent here -- the rest of this level is left out, unlike the TDB_ERR_MEMORY return in the main walk */ + } + ssts_array = new_array; + ssts_array[sst_count++] = sst; + } + } + + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + } + } + + if (!ssts_array) return TDB_SUCCESS; /* nothing was collected, the sst cache stays empty */ + + /* we create cached sources from collected sstables */ + if (!iter->cached_sources || iter->cached_sources_capacity < sst_count) + { + void **new_cached =
realloc(iter->cached_sources, sst_count * sizeof(void *)); + if (!new_cached) + { + for (int k = 0; k < sst_count; k++) tidesdb_sstable_unref(cf->db, ssts_array[k]); + free(ssts_array); + return TDB_ERR_MEMORY; + } + iter->cached_sources = new_cached; + iter->cached_sources_capacity = sst_count; + } + + /* we prefetch non-local sstable files in parallel */ + if (cf->db->object_store) + { + tdb_objstore_prefetch_sstables(cf->db, ssts_array, sst_count); + } + + for (int i = 0; i < sst_count; i++) + { + tidesdb_sstable_t *sst = ssts_array[i]; + + /* reader fd budget -- iterator source-cache rebuild also opens the whole set at once, + * bounded by the max_open cap, same as iter_new. + * refs for indexes below i were dropped at the end of their own iteration, so + * only ssts_array[i..sst_count) is released on this path */ + if (!tidesdb_reader_fd_budget_ok(cf->db, sst)) + { + for (int k = i; k < sst_count; k++) tidesdb_sstable_unref(cf->db, ssts_array[k]); + free(ssts_array); + return TDB_ERR_BUSY; + } + + tidesdb_merge_source_t *sst_source = tidesdb_merge_source_from_sstable_lazy(cf->db, sst); + if (!sst_source) + { + /* could not open/build a source (e.g. EMFILE) -- a rebuilt cache that omits an sstable + * would silently drop data from the scan. surface the failure; the caller retries.
*/ + for (int k = i; k < sst_count; k++) tidesdb_sstable_unref(cf->db, ssts_array[k]); + free(ssts_array); + return TDB_ERR_IO; + } + sst_source->is_cached = 1; + iter->cached_sources[iter->num_cached_sources++] = sst_source; + tidesdb_sstable_unref(cf->db, sst); + } + free(ssts_array); + + return TDB_SUCCESS; +} + +/** + * tidesdb_iter_seek_memtable_source + * seek a memtable source to the target key. + * when the cursor seek or the follow-up cursor read fails this function does + * not touch the source at all. + * @param source the memtable source + * @param key the target key + * @param key_size the size of the key + * @param direction 1 for forward (>=), -1 for backward (<=); the code treats any + * value > 0 as forward and every other value as backward + */ +static void tidesdb_iter_seek_memtable_source(tidesdb_merge_source_t *source, const uint8_t *key, + const size_t key_size, const int direction) +{ + skip_list_cursor_t *cursor = source->source.memtable.cursor; + + if (direction > 0) + { + /** forward seek -- first entry >= key. seek_ge folds the advance in and is + * robust to a concurrent put splicing a sub-target node into forward[0], + * which a seek+next pair would return as a key below target */ + if (skip_list_cursor_seek_ge(cursor, (uint8_t *)key, key_size) == 0) + { + uint8_t *k, *v; + size_t k_size, v_size; + int64_t ttl; + uint8_t deleted; + uint64_t seq; + + if (skip_list_cursor_get_with_seq(cursor, &k, &k_size, &v, &v_size, &ttl, &deleted, + &seq) == 0) + { + tidesdb_memtable_source_set_inline_borrowed(source, k, k_size, v, v_size, ttl, seq, + deleted); + } + } + } + else + { + /** backward seek, we find first entry <= key + * skip_list_cursor_seek_for_prev positions directly at target */ + if (skip_list_cursor_seek_for_prev(cursor, (uint8_t *)key, key_size) == 0) + { + uint8_t *k, *v; + size_t k_size, v_size; + int64_t ttl; + uint8_t deleted; + uint64_t seq; + + if (skip_list_cursor_get_with_seq(cursor, &k, &k_size, &v, &v_size, &ttl, &deleted, + &seq) == 0) + { + tidesdb_memtable_source_set_inline_borrowed(source, k, k_size, v, v_size, ttl, seq, + deleted); + } + } + } +} + +/** + * tidesdb_iter_clear_block_stash + * free all
entries in the 2-slot deserialized block stash + */ +static void tidesdb_iter_clear_block_stash(tidesdb_merge_source_t *source) +{ + for (int i = 0; i < 2; i++) /* each slot: free the block, release its pin, null both fields */ + { + if (source->source.sstable.block_stash[i].block) + { + tidesdb_klog_block_free(source->source.sstable.block_stash[i].block); + source->source.sstable.block_stash[i].block = NULL; + } + if (source->source.sstable.block_stash[i].pin) + { + clock_cache_release(source->source.sstable.block_stash[i].pin); + source->source.sstable.block_stash[i].pin = NULL; + } + } +} + +/** + * tidesdb_iter_clear_lazy + * release lazy block state (pinned raw cache data, the decompressed buffer and + * the raw block, whichever of them are set) and zero the whole lazy struct + * @param source the sstable merge source + */ +static void tidesdb_iter_clear_lazy(tidesdb_merge_source_t *source) +{ + if (source->source.sstable.lazy.pin) + { + clock_cache_release(source->source.sstable.lazy.pin); + } + if (source->source.sstable.lazy.decompressed) + { + free(source->source.sstable.lazy.decompressed); + } + if (source->source.sstable.lazy.bmblock) + { + block_manager_block_release(source->source.sstable.lazy.bmblock); + } + memset(&source->source.sstable.lazy, 0, sizeof(source->source.sstable.lazy)); +} + +/** + * tidesdb_iter_stash_block + * stash a cache-origin block into the 2-slot round-robin stash. + * evicts the oldest entry if both slots are full. + * a slot counts as empty when its block pointer is NULL. + * @param source the sstable merge source that owns the stash + * @param block the deserialized block to store in the chosen slot + * @param pin the cache pin stored next to the block; on eviction a stored pin + * is released only when it is non-NULL + * @param position value stored in the slot's position field + */ +static void tidesdb_iter_stash_block(tidesdb_merge_source_t *source, tidesdb_klog_block_t *block, + clock_cache_entry_t *pin, const uint64_t position) +{ + /* we find an empty slot, or evict slot 0 (shift slot 1 down) */ + int slot = -1; + for (int i = 0; i < 2; i++) + { + if (!source->source.sstable.block_stash[i].block) + { + slot = i; + break; + } + } + + if (slot < 0) + { + /* both full!
we evict slot 0, shift slot 1 to slot 0 */ + tidesdb_klog_block_free(source->source.sstable.block_stash[0].block); + if (source->source.sstable.block_stash[0].pin) + clock_cache_release(source->source.sstable.block_stash[0].pin); + source->source.sstable.block_stash[0] = source->source.sstable.block_stash[1]; + slot = 1; + } + + source->source.sstable.block_stash[slot].block = block; + source->source.sstable.block_stash[slot].pin = pin; + source->source.sstable.block_stash[slot].position = position; +} +/** + * tidesdb_iter_release_sst_source_block + * release everything the source holds for its current block: the block itself + * (through tidesdb_block_release when current_rc_block is set, otherwise through + * tidesdb_klog_block_free), the cache pin, the lazy state, the decompressed + * buffer and the raw block data. every released field is set to NULL and + * current_entry_idx is reset to 0. + * @param source the sstable merge source + */ +static void tidesdb_iter_release_sst_source_block(tidesdb_merge_source_t *source) +{ + if (source->source.sstable.current_rc_block) + { + tidesdb_block_release(source->source.sstable.current_rc_block); + source->source.sstable.current_rc_block = NULL; + } + else if (source->source.sstable.current_block) + { + tidesdb_klog_block_free(source->source.sstable.current_block); + } + source->source.sstable.current_block = NULL; + + if (source->source.sstable.cache_pin) + { + clock_cache_release(source->source.sstable.cache_pin); + source->source.sstable.cache_pin = NULL; + } + + tidesdb_iter_clear_lazy(source); + + if (source->source.sstable.decompressed_data) + { + free(source->source.sstable.decompressed_data); + source->source.sstable.decompressed_data = NULL; + } + if (source->source.sstable.current_block_data) + { + block_manager_block_release(source->source.sstable.current_block_data); + source->source.sstable.current_block_data = NULL; + } + source->source.sstable.current_entry_idx = 0; +} + +/** + * tidesdb_iter_read_klog_block + * read a klog block from cache or disk + * @param sst the sstable + * @param cursor the block manager cursor + * @param cf_name the column family name for cache + * @param has_cf_name whether cf_name is valid + * @param kb_out output klog block + * @param bmblock_out output raw block (if from disk) + * @param decompressed_out output decompressed data (if decompression was needed) + * @param cache_pin_out output cache pin when the block was served from the cache; + * may be NULL, in which case no pin is handed out + * @return TDB_SUCCESS on success, error code on failure
+ */ +static int tidesdb_iter_read_klog_block(const tidesdb_sstable_t *sst, + block_manager_cursor_t *cursor, const char *cf_name, + const int has_cf_name, tidesdb_klog_block_t **kb_out, + block_manager_block_t **bmblock_out, + uint8_t **decompressed_out, + clock_cache_entry_t **cache_pin_out) +{ + *kb_out = NULL; + *bmblock_out = NULL; + *decompressed_out = NULL; + if (cache_pin_out) *cache_pin_out = NULL; /* outputs start out NULL and every error return below leaves them NULL */ + + /** we try raw-byte cache first, the zero-copy path pins the cache entry + * so keys/values can point directly into cache memory without malloc+memcpy. + */ + if (sst->db->clock_cache && has_cf_name) + { + size_t cached_size = 0; + clock_cache_entry_t *pin = NULL; + const uint8_t *cached_data = tidesdb_cache_raw_block_get_pinned( + sst->db, cf_name, sst->klog_filename, cursor->current_pos, &cached_size, &pin); + if (cached_data) + { + /** cached data may be in indexed format (from tidesdb_sstable_get path) + * which prepends a key offset index header. strip it before deserializing. */ + const uint8_t *deser_ptr = cached_data; + size_t deser_size = cached_size; + + if (cached_size >= TDB_BLOCK_INDEX_HDR_BASE) + { + const uint32_t maybe_magic = decode_uint32_le_compat(cached_data); + if (maybe_magic == TDB_BLOCK_INDEX_MAGIC) + { + const uint32_t hdr_size = decode_uint32_le_compat(cached_data + 4); + if (hdr_size < cached_size) /* a header size that does not fit inside the entry leaves the bytes unstripped */ + { + deser_ptr = cached_data + hdr_size; + deser_size = cached_size - hdr_size; + } + } + } + + tidesdb_klog_block_t *kb = NULL; + if (tidesdb_klog_block_deserialize(deser_ptr, deser_size, &kb, 1) != 0 || !kb) + { + clock_cache_release(pin); + return TDB_ERR_CORRUPTION; + } + + /*** zero-copy block keys/values point into the pinned cache entry. + ** data_ref is NULL -- the cache pin keeps the data alive. + * caller releases pin via cache_pin_out when the block is freed.
*/ + kb->data_ref = NULL; + *kb_out = kb; + if (cache_pin_out) *cache_pin_out = pin; /* NOTE(review): with a NULL cache_pin_out the pin is neither handed out nor released on this path -- confirm callers always pass it */ + return TDB_SUCCESS; + } + } + + /* cache miss, we must read from disk */ + block_manager_block_t *bmblock = block_manager_cursor_read(cursor); + if (!bmblock) return TDB_ERR_IO; + + const uint8_t *data = bmblock->data; + size_t data_size = bmblock->size; + + if (sst->config->compression_algorithm != TDB_COMPRESS_NONE) + { + *decompressed_out = decompress_data(bmblock->data, bmblock->size, &data_size, + sst->config->compression_algorithm); + if (*decompressed_out) + { + data = *decompressed_out; + } /* NOTE(review): if decompress_data returns NULL the raw bmblock->data is cached and parsed as-is, with whatever data_size the call left behind -- confirm this fallback is intended */ + } + + /** we cache in indexed format so both point-lookup and iterator seek paths + * benefit from O(log N) binary search on subsequent cache hits */ + if (sst->db->clock_cache && has_cf_name) + { + uint8_t *indexed_data = NULL; + size_t indexed_size = 0; + if (tidesdb_build_indexed_block_data(data, data_size, &indexed_data, &indexed_size) == 0) + { + tidesdb_cache_raw_block_put(sst->db, cf_name, sst->klog_filename, cursor->current_pos, + indexed_data, indexed_size); + free(indexed_data); + } + else + { + tidesdb_cache_raw_block_put(sst->db, cf_name, sst->klog_filename, cursor->current_pos, + data, data_size); + } + } + + tidesdb_klog_block_t *kb = NULL; + /** zero-copy keys/values point into data buffer (decompressed or bmblock->data). + * the caller keeps these alive via decompressed_out and bmblock_out.
*/ + if (tidesdb_klog_block_deserialize(data, data_size, &kb, 1) != 0 || !kb) + { + if (*decompressed_out) free(*decompressed_out); + *decompressed_out = NULL; + block_manager_block_release(bmblock); + return TDB_ERR_CORRUPTION; + } + + *kb_out = kb; + *bmblock_out = bmblock; + return TDB_SUCCESS; +} + +/** + * tidesdb_iter_create_kv_from_block + * create a kv pair from a klog block entry. + * an entry with a non-zero vlog_offset has its value read from the vlog; the + * temporary vlog buffer is freed before returning, so the pair presumably + * holds its own copy -- TODO confirm against tidesdb_kv_pair_create. + * NOTE(review): when the vlog read fails, the inline value pointer is still + * passed together with the entry's value_size, whereas the btree seek helpers + * pass NULL and 0 in the same situation -- confirm this is intended. + * @param iter the iterator + * @param sst the sstable + * @param kb the klog block + * @param idx the entry index + * @return the created kv pair, or NULL on failure + */ +static tidesdb_kv_pair_t *tidesdb_iter_create_kv_from_block(const tidesdb_iter_t *iter, + tidesdb_sstable_t *sst, + const tidesdb_klog_block_t *kb, + const int idx) +{ + const uint8_t *value = kb->inline_values[idx]; + uint8_t *vlog_value = NULL; + + if (kb->entries[idx].vlog_offset > 0) + { + if (tidesdb_vlog_read_value(iter->cf->db, sst, kb->entries[idx].vlog_offset, + kb->entries[idx].value_size, &vlog_value) == TDB_SUCCESS) + { + value = vlog_value; + } + } + + tidesdb_kv_pair_t *kv = tidesdb_kv_pair_create( + kb->keys[idx], kb->entries[idx].key_size, value, kb->entries[idx].value_size, + kb->entries[idx].ttl, kb->entries[idx].seq, + kb->entries[idx].flags & TDB_KV_TOMBSTONE_FLAG_MASK); + + free(vlog_value); + return kv; +} + +/** + * tidesdb_iter_seek_btree_source_forward + * seek a btree source forward to find first entry >= key. + * the previous current_kv is always freed first; it stays NULL when the seek + * or the cursor read fails. + * @param source the btree source + * @param key the target key + * @param key_size the size of the key + */ +static void tidesdb_iter_seek_btree_source_forward(tidesdb_merge_source_t *source, + const uint8_t *key, const size_t key_size) +{ + btree_cursor_t *cursor = source->source.btree.cursor; + + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = NULL; + + if (btree_cursor_seek(cursor, key, key_size) != 0) + { + return; + } + + uint8_t *found_key = NULL, *value = NULL; + size_t found_key_size = 0, value_size = 0; + uint64_t vlog_offset = 0, seq = 0; + int64_t
ttl = 0; + uint8_t deleted = 0; + + if (btree_cursor_get(cursor, &found_key, &found_key_size, &value, &value_size, &vlog_offset, + &seq, &ttl, &deleted) != 0) + { + return; + } + + const uint8_t *actual_value = value; + size_t actual_value_size = value_size; + uint8_t *vlog_value = NULL; + if (vlog_offset > 0) + { + if (tidesdb_btree_read_vlog_value(source->source.btree.vlog_cursor, vlog_offset, + source->config, &vlog_value, &actual_value_size, + value_size) == 0) + { + actual_value = vlog_value; + } + else + { + actual_value = NULL; + actual_value_size = 0; /* vlog read failed: the pair is created with no value */ + } + } + + source->current_kv = tidesdb_kv_pair_create(found_key, found_key_size, actual_value, + actual_value_size, ttl, seq, deleted); + free(vlog_value); +} + +/** + * tidesdb_iter_seek_btree_source_backward + * seek a btree source backward to find last entry <= key. + * falls back to the last entry when the seek fails, and steps back one entry + * when the entry the cursor landed on compares greater than key. + * the previous current_kv is always freed first; it stays NULL on every early + * return. + * @param source the btree source + * @param key the target key + * @param key_size the size of the key + */ +static void tidesdb_iter_seek_btree_source_backward(tidesdb_merge_source_t *source, + const uint8_t *key, const size_t key_size) +{ + btree_cursor_t *cursor = source->source.btree.cursor; + + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = NULL; + + if (btree_cursor_seek(cursor, key, key_size) != 0) + { + if (btree_cursor_goto_last(cursor) != 0) return; + } + + uint8_t *found_key = NULL, *value = NULL; + size_t found_key_size = 0, value_size = 0; + uint64_t vlog_offset = 0, seq = 0; + int64_t ttl = 0; + uint8_t deleted = 0; + + if (btree_cursor_get(cursor, &found_key, &found_key_size, &value, &value_size, &vlog_offset, + &seq, &ttl, &deleted) != 0) + { + return; + } + + skip_list_comparator_fn comparator_fn = NULL; + void *comparator_ctx = NULL; + tidesdb_resolve_comparator(source->source.btree.db, source->config, &comparator_fn, + &comparator_ctx); /* NOTE(review): comparator_fn is called below without a NULL check -- confirm tidesdb_resolve_comparator always sets it */ + + const int cmp = comparator_fn(found_key, found_key_size, key, key_size, comparator_ctx); + if (cmp > 0) + { + if (btree_cursor_prev(cursor) != 0) return; + + if
(btree_cursor_get(cursor, &found_key, &found_key_size, &value, &value_size, &vlog_offset, + &seq, &ttl, &deleted) != 0) + { + return; + } + } + + const uint8_t *actual_value = value; + size_t actual_value_size = value_size; + uint8_t *vlog_value = NULL; + if (vlog_offset > 0) + { + if (tidesdb_btree_read_vlog_value(source->source.btree.vlog_cursor, vlog_offset, + source->config, &vlog_value, &actual_value_size, + value_size) == 0) + { + actual_value = vlog_value; + } + else + { + actual_value = NULL; + actual_value_size = 0; /* vlog read failed: the pair is created with no value */ + } + } + + source->current_kv = tidesdb_kv_pair_create(found_key, found_key_size, actual_value, + actual_value_size, ttl, seq, deleted); + free(vlog_value); +} + +/** + * tidesdb_iter_seek_sstable_source_forward + * seek an sstable source forward to find first entry >= key + * @param iter the iterator + * @param source the sstable source + * @param key the target key + * @param key_size the size of the key + */ +static void tidesdb_iter_seek_sstable_source_forward(const tidesdb_iter_t *iter, + tidesdb_merge_source_t *source, + const uint8_t *key, const size_t key_size) +{ + tidesdb_sstable_t *sst = source->source.sstable.sst; + block_manager_cursor_t *cursor = source->source.sstable.klog_cursor; + + /** we use cached comparator from sst (resolved at load/create time) to avoid + * per-seek registry lookup via tidesdb_resolve_comparator */ + skip_list_comparator_fn comparator_fn = sst->cached_comparator_fn; + void *comparator_ctx = sst->cached_comparator_ctx; + if (TDB_UNLIKELY(!comparator_fn)) + { + tidesdb_resolve_comparator(sst->db, sst->config, &comparator_fn, &comparator_ctx); + } + + /** if current block is already loaded and target key is within its range, + * we skip the expensive release + read + deserialize cycle */ + const tidesdb_klog_block_t *cb = source->source.sstable.current_block; + if (cb && cb->num_entries > 0) + { + const int cmp_first = + comparator_fn(cb->keys[0], cb->entries[0].key_size, key, key_size, comparator_ctx); +
const int cmp_last = + comparator_fn(cb->keys[cb->num_entries - 1], cb->entries[cb->num_entries - 1].key_size, + key, key_size, comparator_ctx); + + if (cmp_first <= 0 && cmp_last >= 0) + { + /* target is within this block, simple binary search in place */ + int left = 0; + int right = (int)cb->num_entries - 1; + int result_idx = (int)cb->num_entries; + + while (left <= right) + { + const int mid = left + (right - left) / 2; + const int cmp = comparator_fn(cb->keys[mid], cb->entries[mid].key_size, key, + key_size, comparator_ctx); + if (cmp >= 0) + { + result_idx = mid; + right = mid - 1; + } + else + { + left = mid + 1; + } + } + + if ((uint32_t)result_idx < cb->num_entries) + { + if (source->current_kv) + { + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = NULL; + } + source->source.sstable.current_entry_idx = result_idx; + source->current_kv = tidesdb_iter_create_kv_from_block(iter, sst, cb, result_idx); + return; + } + } + else if (cmp_first > 0) + { + /* target is before this block, we use first entry */ + if (source->current_kv) + { + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = NULL; + } + source->source.sstable.current_entry_idx = 0; + source->current_kv = tidesdb_iter_create_kv_from_block(iter, sst, cb, 0); + return; + } + else if (cmp_last < 0) + { + /**** target is past current block, thus fall through to block_index lookup. + *** we skip sequential cursor_next here because TPC-C style random access + ** almost never hits the adjacent block, and cursor_next triggers a pread + * syscall to read the next block header which is wasted I/O. */ + tidesdb_iter_release_sst_source_block(source); + } + } + else if (source->source.sstable.lazy.data && source->source.sstable.lazy.idx_count > 0) + { + /** the block is pinned but not deserialized. + * we use block index to check if target is within this block's key range. 
*/ + const uint8_t *idx_base = source->source.sstable.lazy.idx_base; + const uint32_t idx_count = source->source.sstable.lazy.idx_count; + const uint8_t *bdata = source->source.sstable.lazy.block_data; + const size_t bdata_size = source->source.sstable.lazy.block_data_size; + + const uint8_t *first_ie = idx_base; + const uint32_t fk_off = decode_uint32_le_compat(first_ie + TDB_BLOCK_IDX_KEY_OFF); + const uint32_t fk_sz = decode_uint32_le_compat(first_ie + TDB_BLOCK_IDX_KEY_SIZE); + const uint8_t *last_ie = idx_base + (idx_count - 1) * TDB_BLOCK_INDEX_ENTRY_STRIDE; + const uint32_t lk_off = decode_uint32_le_compat(last_ie + TDB_BLOCK_IDX_KEY_OFF); + const uint32_t lk_sz = decode_uint32_le_compat(last_ie + TDB_BLOCK_IDX_KEY_SIZE); + + /* validate the first/last key offsets before comparing into the block */ + const int range_ok = fk_off <= bdata_size && fk_sz <= bdata_size - fk_off && + lk_off <= bdata_size && lk_sz <= bdata_size - lk_off; + const int cmp_first = + range_ok ? comparator_fn(bdata + fk_off, fk_sz, key, key_size, comparator_ctx) : 1; + const int cmp_last = + range_ok ? 
comparator_fn(bdata + lk_off, lk_sz, key, key_size, comparator_ctx) : -1; + + if (range_ok && cmp_first <= 0 && cmp_last >= 0) + { + /* target is within this lazy block, thus we utilize binary search via block index */ + int32_t left = 0, right = (int32_t)idx_count - 1, found = -1; + while (left <= right) + { + const int32_t mid = left + (right - left) / 2; + const uint8_t *ie = idx_base + mid * TDB_BLOCK_INDEX_ENTRY_STRIDE; + const uint32_t k_off = decode_uint32_le_compat(ie + TDB_BLOCK_IDX_KEY_OFF); + const uint32_t k_sz = decode_uint32_le_compat(ie + TDB_BLOCK_IDX_KEY_SIZE); + if (k_off > bdata_size || k_sz > bdata_size - k_off) break; + const int cmp = comparator_fn(bdata + k_off, k_sz, key, key_size, comparator_ctx); + if (cmp >= 0) + { + found = mid; + right = mid - 1; + } + else + { + left = mid + 1; + } + } + + if (found >= 0) + { + /* we extract entry metadata from raw data */ + const uint8_t *fie = idx_base + found * TDB_BLOCK_INDEX_ENTRY_STRIDE; + const uint32_t e_off = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_ENTRY_OFF); + const uint32_t k_off = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_KEY_OFF); + const uint32_t k_sz = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_KEY_SIZE); + + const uint8_t *eptr = bdata + e_off; + size_t erem = source->source.sstable.lazy.block_data_size - e_off; + uint8_t flags = *eptr++; + erem--; + uint64_t ks, vs; + int br = decode_varint(eptr, &ks, (int)erem); + eptr += br; + erem -= br; + br = decode_varint(eptr, &vs, (int)erem); + eptr += br; + erem -= br; + /* we read abs_seq from index */ + const uint32_t seq_lo = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_SEQ_LO); + const uint32_t seq_hi = decode_uint32_le_compat(fie + TDB_BLOCK_IDX_SEQ_HI); + const uint64_t seq = ((uint64_t)seq_hi << TDB_U64_HI_LO_SHIFT) | seq_lo; + /* we skip seq varint */ + uint64_t dummy; + br = decode_varint(eptr, &dummy, (int)erem); + eptr += br; + erem -= br; + int64_t ttl = 0; + if (flags & TDB_KV_FLAG_HAS_TTL) + { + ttl = 
decode_int64_le_compat(eptr); + eptr += sizeof(int64_t); + erem -= sizeof(int64_t); + } + uint64_t vlog_offset = 0; + if (flags & TDB_KV_FLAG_HAS_VLOG) + { + br = decode_varint(eptr, &vlog_offset, (int)erem); + } + + const uint8_t *fkey = bdata + k_off; + const uint8_t *fvalue = NULL; + if (!(flags & TDB_KV_FLAG_HAS_VLOG) && vs > 0) + { + fvalue = fkey + k_sz; + } + + /* we handle vlog values */ + uint8_t *vlog_value = NULL; + if (vlog_offset > 0) + { + if (tidesdb_vlog_read_value(iter->cf->db, sst, vlog_offset, (size_t)vs, + &vlog_value) == TDB_SUCCESS) + { + fvalue = vlog_value; + } + } + + if (source->current_kv) + { + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = NULL; + } + source->current_kv = + tidesdb_kv_pair_create(fkey, (size_t)k_sz, fvalue, (size_t)vs, ttl, seq, + flags & TDB_KV_TOMBSTONE_FLAG_MASK); + free(vlog_value); + + source->source.sstable.lazy.entry_idx = found; + source->source.sstable.current_entry_idx = found; + /* lazy.block_data_size is the decompressed size, not the on-disk + * size cursor_next needs to advance current_pos. invalidate so + * cursor_next re-reads the size header from disk. 
*/ + cursor->block_size_valid = 0; + return; + } + } + else if (cmp_first > 0) + { + /* the target is before this lazy block, thus we use first entry */ + const uint32_t e_off = decode_uint32_le_compat(first_ie + TDB_BLOCK_IDX_ENTRY_OFF); + const uint8_t *eptr = bdata + e_off; + size_t erem = source->source.sstable.lazy.block_data_size - e_off; + uint8_t flags = *eptr++; + erem--; + uint64_t ks, vs; + int br = decode_varint(eptr, &ks, (int)erem); + eptr += br; + erem -= br; + br = decode_varint(eptr, &vs, (int)erem); + eptr += br; + erem -= br; + const uint32_t seq_lo = decode_uint32_le_compat(first_ie + TDB_BLOCK_IDX_SEQ_LO); + const uint32_t seq_hi = decode_uint32_le_compat(first_ie + TDB_BLOCK_IDX_SEQ_HI); + const uint64_t seq = ((uint64_t)seq_hi << TDB_U64_HI_LO_SHIFT) | seq_lo; + uint64_t dummy; + br = decode_varint(eptr, &dummy, (int)erem); + eptr += br; + erem -= br; + int64_t ttl = 0; + if (flags & TDB_KV_FLAG_HAS_TTL) + { + ttl = decode_int64_le_compat(eptr); + eptr += sizeof(int64_t); + erem -= sizeof(int64_t); + } + uint64_t vlog_offset = 0; + if (flags & TDB_KV_FLAG_HAS_VLOG) + { + br = decode_varint(eptr, &vlog_offset, (int)erem); + } + const uint8_t *fvalue = NULL; + if (!(flags & TDB_KV_FLAG_HAS_VLOG) && vs > 0) + { + fvalue = bdata + fk_off + fk_sz; + } + uint8_t *vlog_value = NULL; + if (vlog_offset > 0) + { + if (tidesdb_vlog_read_value(iter->cf->db, sst, vlog_offset, (size_t)vs, + &vlog_value) == TDB_SUCCESS) + { + fvalue = vlog_value; + } + } + if (source->current_kv) + { + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = NULL; + } + source->current_kv = + tidesdb_kv_pair_create(bdata + fk_off, (size_t)fk_sz, fvalue, (size_t)vs, ttl, seq, + flags & TDB_KV_TOMBSTONE_FLAG_MASK); + free(vlog_value); + source->source.sstable.lazy.entry_idx = 0; + source->source.sstable.current_entry_idx = 0; + return; + } + else if (cmp_last < 0) + { + /** target past lazy block, thus we must release and fall through to block_index + * lookup below 
instead of goto scan_blocks which would scan linearly */ + tidesdb_iter_clear_lazy(source); + tidesdb_iter_release_sst_source_block(source); + } + } + + /** we stash cache-origin block before releasing so a subsequent seek + * to the same block position can skip deserialization entirely */ + if (source->source.sstable.current_block && source->source.sstable.cache_pin && + !source->source.sstable.current_block_data && !source->source.sstable.decompressed_data) + { + tidesdb_iter_stash_block(source, source->source.sstable.current_block, + source->source.sstable.cache_pin, cursor->current_pos); + source->source.sstable.current_block = NULL; + source->source.sstable.cache_pin = NULL; + } + tidesdb_iter_release_sst_source_block(source); + + /* we use block index to find starting position */ + uint64_t block_position = 0; + if (sst->block_indexes && sst->block_indexes->count > 0) + { + compact_block_index_find_predecessor(sst->block_indexes, key, key_size, &block_position); + } + + if (block_position > 0) + { + block_manager_cursor_goto(cursor, block_position); + } + else + { + block_manager_cursor_goto_first(cursor); + } + + const char *cf_name = sst->cf_name; + const int has_cf_name = (cf_name[0] != '\0'); + + int blocks_scanned = 0; + + while (blocks_scanned < TDB_ITER_SEEK_MAX_BLOCKS_SCAN) + { + if (sst->klog_data_end_offset > 0 && cursor->current_pos >= sst->klog_data_end_offset) + { + break; + } + + /** we check stash first, essentially stashed blocks are already deserialized + * from a previous seek, so we use them directly */ + const uint64_t scan_pos = cursor->current_pos; + int stash_hit = 0; + for (int si = 0; si < 2; si++) + { + if (source->source.sstable.block_stash[si].block && + source->source.sstable.block_stash[si].position == scan_pos) + { + tidesdb_klog_block_t *kb = source->source.sstable.block_stash[si].block; + clock_cache_entry_t *pin = source->source.sstable.block_stash[si].pin; + source->source.sstable.block_stash[si].block = NULL; + 
source->source.sstable.block_stash[si].pin = NULL; + stash_hit = 1; + blocks_scanned++; + + const int cmp_first = comparator_fn(kb->keys[0], kb->entries[0].key_size, key, + key_size, comparator_ctx); + + if (cmp_first > 0) + { + source->source.sstable.current_block_data = NULL; + source->source.sstable.current_rc_block = NULL; + source->source.sstable.current_block = kb; + source->source.sstable.decompressed_data = NULL; + source->source.sstable.cache_pin = pin; + source->source.sstable.current_entry_idx = 0; + source->current_kv = tidesdb_iter_create_kv_from_block(iter, sst, kb, 0); + return; + } + + const int cmp_last = comparator_fn(kb->keys[kb->num_entries - 1], + kb->entries[kb->num_entries - 1].key_size, key, + key_size, comparator_ctx); + + if (cmp_last >= 0) + { + int left = 0; + int right = (int)kb->num_entries - 1; + int result_idx = (int)kb->num_entries; + + while (left <= right) + { + const int mid = left + (right - left) / 2; + const int cmp = comparator_fn(kb->keys[mid], kb->entries[mid].key_size, key, + key_size, comparator_ctx); + if (cmp >= 0) + { + result_idx = mid; + right = mid - 1; + } + else + { + left = mid + 1; + } + } + + if ((uint32_t)result_idx < kb->num_entries) + { + source->source.sstable.current_block_data = NULL; + source->source.sstable.current_rc_block = NULL; + source->source.sstable.current_block = kb; + source->source.sstable.decompressed_data = NULL; + source->source.sstable.cache_pin = pin; + source->source.sstable.current_entry_idx = result_idx; + source->current_kv = + tidesdb_iter_create_kv_from_block(iter, sst, kb, result_idx); + return; + } + } + + tidesdb_klog_block_free(kb); + if (pin) clock_cache_release(pin); + break; + } + } + if (stash_hit) + { + if (block_manager_cursor_next(cursor) != 0) break; + continue; + } + + /***** raw seek, we read block data without full deserialization. 
+ **** we binary search the raw bytes for the first entry >= target key + *** using tidesdb_klog_block_seek_raw, which builds a lightweight + ** key-offset index via a single varint scan. the full O(N) + * deserialization is deferred to the first next() call. */ + const uint8_t *raw_data = NULL; + size_t raw_size = 0; + clock_cache_entry_t *pin = NULL; + block_manager_block_t *bmblock = NULL; + uint8_t *decompressed = NULL; + + /* we try cache first */ + if (sst->db->clock_cache && has_cf_name) + { + raw_data = tidesdb_cache_raw_block_get_pinned(sst->db, cf_name, sst->klog_filename, + cursor->current_pos, &raw_size, &pin); + } + + if (!raw_data) + { + /* cache miss, we must read from disk */ + bmblock = block_manager_cursor_read(cursor); + if (!bmblock) + { + if (block_manager_cursor_next(cursor) != 0) break; + continue; + } + + raw_data = bmblock->data; + raw_size = bmblock->size; + + if (sst->config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t dec_size = 0; + decompressed = decompress_data(bmblock->data, bmblock->size, &dec_size, + sst->config->compression_algorithm); + if (decompressed) + { + raw_data = decompressed; + raw_size = dec_size; + } + } + + /** cache in indexed format so subsequent seeks hit the O(log N) + * binary search fast path instead of re-scanning all varints */ + if (sst->db->clock_cache && has_cf_name) + { + uint8_t *indexed_data = NULL; + size_t indexed_size = 0; + if (tidesdb_build_indexed_block_data(raw_data, raw_size, &indexed_data, + &indexed_size) == 0) + { + tidesdb_cache_raw_block_put(sst->db, cf_name, sst->klog_filename, + cursor->current_pos, indexed_data, indexed_size); + free(indexed_data); + } + else + { + tidesdb_cache_raw_block_put(sst->db, cf_name, sst->klog_filename, + cursor->current_pos, raw_data, raw_size); + } + } + } + + blocks_scanned++; + + /***** seek_raw handles both indexed (TDB_BLOCK_INDEX_MAGIC) and raw + **** formats internally so we pass the full data including any index + *** header. 
the stripped block_data is only needed for lazy state + ** so next() can deserialize the raw entries later. */ + const uint8_t *block_data = raw_data; + size_t block_data_size = raw_size; + + if (raw_size >= TDB_BLOCK_INDEX_HDR_BASE) + { + const uint32_t maybe_magic = decode_uint32_le_compat(raw_data); + if (maybe_magic == TDB_BLOCK_INDEX_MAGIC) + { + const uint32_t hdr_size = decode_uint32_le_compat(raw_data + 4); + if (hdr_size < raw_size) + { + block_data = raw_data + hdr_size; + block_data_size = raw_size - hdr_size; + } + } + } + + tidesdb_klog_entry_t found_entry = {0}; + const uint8_t *found_key = NULL; + const uint8_t *found_value = NULL; + int found_idx = -1; + uint32_t num_entries = 0; + + const int seek_rc = tidesdb_klog_block_seek_raw( + raw_data, raw_size, key, key_size, comparator_fn, comparator_ctx, &found_entry, + &found_key, &found_value, &found_idx, &num_entries); + + if (seek_rc == 0 && found_idx >= 0) + { + /* found entry >= target. resolve vlog if needed */ + const uint8_t *value = found_value; + uint8_t *vlog_value = NULL; + + if (found_entry.vlog_offset > 0) + { + if (tidesdb_vlog_read_value(iter->cf->db, sst, found_entry.vlog_offset, + found_entry.value_size, &vlog_value) == TDB_SUCCESS) + { + value = vlog_value; + } + } + + if (source->current_kv) + { + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = NULL; + } + source->current_kv = tidesdb_kv_pair_create( + found_key, found_entry.key_size, value, found_entry.value_size, found_entry.ttl, + found_entry.seq, found_entry.flags & TDB_KV_TOMBSTONE_FLAG_MASK); + free(vlog_value); + + /**** we set up lazy state, the full deserialization is deferred to next(). + *** if the block is in indexed format, we extract index base and count + ** so merge_source_advance can parse entries incrementally without + * full block deserialization. 
*/ + tidesdb_iter_clear_lazy(source); + source->source.sstable.lazy.data = raw_data; + source->source.sstable.lazy.size = raw_size; + source->source.sstable.lazy.pin = pin; + source->source.sstable.lazy.block_data = block_data; + source->source.sstable.lazy.block_data_size = block_data_size; + source->source.sstable.lazy.idx_base = NULL; + source->source.sstable.lazy.idx_count = 0; + + /* we extract index pointers from indexed format for incremental advance */ + if (raw_size >= TDB_BLOCK_INDEX_HDR_BASE) + { + const uint32_t magic = decode_uint32_le_compat(raw_data); + if (magic == TDB_BLOCK_INDEX_MAGIC) + { + const uint32_t idx_cnt = decode_uint32_le_compat(raw_data + 8); + source->source.sstable.lazy.idx_base = raw_data + TDB_BLOCK_INDEX_HDR_BASE; + source->source.sstable.lazy.idx_count = idx_cnt; + } + } + + source->source.sstable.lazy.entry_idx = found_idx; + source->source.sstable.lazy.bmblock = bmblock; + source->source.sstable.lazy.decompressed = decompressed; + source->source.sstable.current_entry_idx = found_idx; + /* cursor->current_block_size must hold the on-disk (compressed) size + * because cursor_next advances cursor->current_pos by header + size + + * footer. when we read via cursor_read we have bmblock with the + * on-disk size; otherwise (cache hit) we leave block_size_valid clear + * so cursor_next re-reads the header from disk. 
*/ + if (bmblock) + { + cursor->current_block_size = bmblock->size; + cursor->block_size_valid = 1; + } + else + { + cursor->block_size_valid = 0; + } + return; + } + + /* target is past this block -- same on-disk-size invariant as above */ + if (bmblock) + { + cursor->current_block_size = bmblock->size; + cursor->block_size_valid = 1; + } + else + { + cursor->block_size_valid = 0; + } + + if (pin) clock_cache_release(pin); + if (decompressed) free(decompressed); + if (bmblock) block_manager_block_release(bmblock); + + if (block_manager_cursor_next(cursor) != 0) break; + } +} + +/** + * tidesdb_iter_seek_txn_ops_source + * seek a txn ops source to the target key + * uses binary search on the sorted index array + * @param source the txn ops source + * @param key the target key + * @param key_size the size of the key + * @param direction 1 for forward (first entry >= key), -1 for backward (last entry <= key) + */ +static void tidesdb_iter_seek_txn_ops_source(tidesdb_merge_source_t *source, const uint8_t *key, + const size_t key_size, const int direction) +{ + const tidesdb_txn_t *txn = source->source.txn_ops.txn; + const tidesdb_column_family_t *cf = source->source.txn_ops.cf; + const int count = source->source.txn_ops.count; + const int *indices = source->source.txn_ops.sorted_indices; + + /* we resolve the comparator */ + skip_list_comparator_fn comparator_fn = NULL; + void *comparator_ctx = NULL; + tidesdb_resolve_comparator(cf->db, &cf->config, &comparator_fn, &comparator_ctx); + if (!comparator_fn) comparator_fn = skip_list_comparator_memcmp; + + /* we utilize binary search for the target position */ + int lo = 0, hi = count; + while (lo < hi) + { + const int mid = lo + (hi - lo) / 2; + const tidesdb_txn_op_t *op = &txn->ops[indices[mid]]; + const int cmp = comparator_fn(op->key, op->key_size, key, key_size, comparator_ctx); + if (cmp < 0) + lo = mid + 1; + else + hi = mid; + } + + if (direction > 0) + { + /* forward -- first entry >= key */ + if (lo < count) + 
{ + source->source.txn_ops.pos = lo; + const tidesdb_txn_op_t *op = &txn->ops[indices[lo]]; + source->current_kv = + tidesdb_kv_pair_create(op->key, op->key_size, op->value, op->value_size, op->ttl, + UINT64_MAX, tidesdb_txn_op_kv_flags(op)); + } + } + else + { + /** backward -- last entry <= key + * if lo points to an exact match, we use it; otherwise use lo-1 */ + int pos = lo; + if (pos < count) + { + const tidesdb_txn_op_t *op = &txn->ops[indices[pos]]; + const int cmp = comparator_fn(op->key, op->key_size, key, key_size, comparator_ctx); + if (cmp > 0) pos--; + } + else + { + pos = count - 1; + } + + if (pos >= 0) + { + source->source.txn_ops.pos = pos; + const tidesdb_txn_op_t *op = &txn->ops[indices[pos]]; + source->current_kv = + tidesdb_kv_pair_create(op->key, op->key_size, op->value, op->value_size, op->ttl, + UINT64_MAX, tidesdb_txn_op_kv_flags(op)); + } + } +} + +/** + * tidesdb_iter_seek_sstable_source_backward + * seek an sstable source backward to find last entry <= key + * @param iter the iterator + * @param source the sstable source + * @param key the target key + * @param key_size the size of the key + */ +static void tidesdb_iter_seek_sstable_source_backward(const tidesdb_iter_t *iter, + tidesdb_merge_source_t *source, + const uint8_t *key, const size_t key_size) +{ + tidesdb_sstable_t *sst = source->source.sstable.sst; + block_manager_cursor_t *cursor = source->source.sstable.klog_cursor; + + /** we use cached comparator from sst (resolved at load/create time) to avoid + * per-seek registry lookup via tidesdb_resolve_comparator */ + skip_list_comparator_fn comparator_fn = sst->cached_comparator_fn; + void *comparator_ctx = sst->cached_comparator_ctx; + if (TDB_UNLIKELY(!comparator_fn)) + { + tidesdb_resolve_comparator(sst->db, sst->config, &comparator_fn, &comparator_ctx); + } + + /* fast path is we reuse current block if target key is within its range */ + const tidesdb_klog_block_t *cb = source->source.sstable.current_block; + if (cb && 
cb->num_entries > 0) + { + const int cmp_first = + comparator_fn(cb->keys[0], cb->entries[0].key_size, key, key_size, comparator_ctx); + const int cmp_last = + comparator_fn(cb->keys[cb->num_entries - 1], cb->entries[cb->num_entries - 1].key_size, + key, key_size, comparator_ctx); + + if (cmp_first <= 0 && cmp_last >= 0) + { + /* target is within this block, we utilize binary search for last entry <= target */ + int left = 0; + int right = (int)cb->num_entries - 1; + int result_idx = -1; + + while (left <= right) + { + const int mid = left + (right - left) / 2; + const int cmp = comparator_fn(cb->keys[mid], cb->entries[mid].key_size, key, + key_size, comparator_ctx); + if (cmp <= 0) + { + result_idx = mid; + left = mid + 1; + } + else + { + right = mid - 1; + } + } + + if (result_idx >= 0) + { + if (source->current_kv) + { + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = NULL; + } + source->source.sstable.current_entry_idx = result_idx; + source->current_kv = tidesdb_iter_create_kv_from_block(iter, sst, cb, result_idx); + return; + } + } + else if (cmp_last < 0) + { + /* target is after this block, we must use last entry */ + if (source->current_kv) + { + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = NULL; + } + const int last = (int)cb->num_entries - 1; + source->source.sstable.current_entry_idx = last; + source->current_kv = tidesdb_iter_create_kv_from_block(iter, sst, cb, last); + return; + } + } + + tidesdb_iter_release_sst_source_block(source); + + /* we use block index to find starting position */ + uint64_t block_position = 0; + if (sst->block_indexes && sst->block_indexes->count > 0) + { + compact_block_index_find_predecessor(sst->block_indexes, key, key_size, &block_position); + } + + if (block_position > 0) + { + block_manager_cursor_goto(cursor, block_position); + } + else + { + block_manager_cursor_goto_first(cursor); + } + + /* we use cached CF name from sst struct to avoid repeated path parsing */ + const char 
*cf_name = sst->cf_name; + const int has_cf_name = (cf_name[0] != '\0'); + + tidesdb_klog_block_t *last_valid_block = NULL; + int last_valid_idx = -1; + block_manager_block_t *last_valid_bmblock = NULL; + uint8_t *last_valid_decompressed = NULL; + clock_cache_entry_t *last_valid_pin = NULL; + + int blocks_scanned = 0; + + while (blocks_scanned < TDB_ITER_SEEK_MAX_BLOCKS_SCAN) + { + if (sst->klog_data_end_offset > 0 && cursor->current_pos >= sst->klog_data_end_offset) + { + break; + } + + tidesdb_klog_block_t *kb = NULL; + block_manager_block_t *bmblock = NULL; + uint8_t *decompressed = NULL; + clock_cache_entry_t *pin = NULL; + + const int read_result = tidesdb_iter_read_klog_block(sst, cursor, cf_name, has_cf_name, &kb, + &bmblock, &decompressed, &pin); + if (read_result != TDB_SUCCESS) + { + if (block_manager_cursor_next(cursor) != 0) break; + continue; + } + blocks_scanned++; + + /* we check if first key > target (use previous block) */ + const int cmp_first = + comparator_fn(kb->keys[0], kb->entries[0].key_size, key, key_size, comparator_ctx); + + if (cmp_first > 0) + { + tidesdb_klog_block_free(kb); + if (pin) clock_cache_release(pin); + if (decompressed) free(decompressed); + if (bmblock) block_manager_block_release(bmblock); + break; + } + + /* we utilize binary search for last entry <= target */ + int left = 0; + int right = (int)kb->num_entries - 1; + int result_idx = -1; + + while (left <= right) + { + const int mid = left + (right - left) / 2; + const int cmp = comparator_fn(kb->keys[mid], kb->entries[mid].key_size, key, key_size, + comparator_ctx); + + if (cmp <= 0) + { + result_idx = mid; + left = mid + 1; + } + else + { + right = mid - 1; + } + } + + if (result_idx >= 0) + { + /* we clean up previous candidate */ + if (last_valid_block) tidesdb_klog_block_free(last_valid_block); + if (last_valid_pin) clock_cache_release(last_valid_pin); + if (last_valid_decompressed) free(last_valid_decompressed); + if (last_valid_bmblock) 
block_manager_block_release(last_valid_bmblock); + + last_valid_block = kb; + last_valid_idx = result_idx; + last_valid_bmblock = bmblock; + last_valid_decompressed = decompressed; + last_valid_pin = pin; + } + else + { + tidesdb_klog_block_free(kb); + if (pin) clock_cache_release(pin); + if (decompressed) free(decompressed); + if (bmblock) block_manager_block_release(bmblock); + } + + if (block_manager_cursor_next(cursor) != 0) break; + } + + /* we use the last valid entry we found */ + if (last_valid_block && last_valid_idx >= 0) + { + source->source.sstable.current_block = last_valid_block; + source->source.sstable.current_block_data = last_valid_bmblock; + source->source.sstable.current_rc_block = NULL; + source->source.sstable.decompressed_data = last_valid_decompressed; + source->source.sstable.cache_pin = last_valid_pin; + source->source.sstable.current_entry_idx = last_valid_idx; + source->current_kv = + tidesdb_iter_create_kv_from_block(iter, sst, last_valid_block, last_valid_idx); + } + else + { + /* no valid block found -- we release any leftover pin */ + if (last_valid_pin) clock_cache_release(last_valid_pin); + } +} + +/** + * tidesdb_iter_find_visible_entry + * find the first visible entry from the heap + * @param iter the iterator + * @param direction 1 for forward (min-heap), -1 for backward (max-heap) + * @return TDB_SUCCESS if found, TDB_ERR_NOT_FOUND otherwise + */ +static int tidesdb_iter_find_visible_entry(tidesdb_iter_t *iter, const int direction) +{ + const int ns = iter->heap->num_sources; + if (ns <= 1) + { + /* 0 or 1 sources -- already a valid heap */ + } + else if (ns == 2) + { + const int cmp = + (direction > 0) ? 
heap_compare(iter->heap, 0, 1) : heap_compare_max(iter->heap, 0, 1); + if ((direction > 0 && cmp > 0) || (direction < 0 && cmp < 0)) + { + heap_swap(&iter->heap->sources[0], &iter->heap->sources[1]); + } + } + else if (direction > 0) + { + for (int i = (ns / 2) - 1; i >= 0; i--) + { + heap_sift_down(iter->heap, i); + } + } + else + { + for (int i = (ns / 2) - 1; i >= 0; i--) + { + heap_sift_down_max(iter->heap, i); + } + } + + /* we find first visible entry */ + while (!tidesdb_merge_heap_empty(iter->heap)) + { + tidesdb_kv_pair_t *kv = (direction > 0) ? tidesdb_merge_heap_pop(iter->heap, NULL) + : tidesdb_merge_heap_pop_max(iter->heap); + if (!kv) break; + + const int visible = tidesdb_iter_kv_visible(iter, kv); + if (visible == -1) + { + tidesdb_iter_skip_tombstone_versions(iter, kv, direction); + tidesdb_kv_pair_free(kv); + continue; + } + + if (visible == 0) + { + tidesdb_kv_pair_free(kv); + continue; + } + + iter->current = kv; + iter->valid = 1; + return TDB_SUCCESS; + } + + return TDB_ERR_NOT_FOUND; +} + +int tidesdb_iter_seek(tidesdb_iter_t *iter, const uint8_t *key, const size_t key_size) +{ + if (!iter || !key || key_size == 0) return TDB_ERR_INVALID_ARGS; + + /***** we detect strictly-forward seeks (new target > last result) before freeing + **** iter->current. a source whose current_kv is already >= target is then still the + *** correct "first entry >= target" answer, since no source has an entry in + ** (last_result, current_kv) and a strictly-greater target keeps [target, current_kv) + * inside that gap. the comparison must be strict-- iter_next pops iter->current and + * advances its sources one entry past it, so a re-seek to exactly iter->current has + * to fall through and re-seek -- iter->current itself sits behind those cursors and + * a >= test would skip it, returning the following key. 
*/ + int forward_monotonic = 0; + const skip_list_comparator_fn cmp_fn = iter->heap->comparator; + void *cmp_ctx = iter->heap->comparator_ctx; + + if (iter->valid && iter->direction == 1 && iter->current && cmp_fn) + { + const int cmp = + cmp_fn(key, key_size, iter->current->key, iter->current->entry.key_size, cmp_ctx); + if (cmp > 0) forward_monotonic = 1; + } + + tidesdb_kv_pair_free(iter->current); + iter->current = NULL; + iter->valid = 0; + iter->direction = 1; + + /****** we only rebuild SST cache on initial build (num_cached_sources == 0). + ***** the iterator holds refs to all sstables it needs and has snapshot semantics + **** via its transaction -- new sstables from later flushes contain data already + *** visible through memtable sources, and compaction cannot delete ref'd sstables. + */ + if (iter->num_cached_sources == 0) + { + const int result = tidesdb_iter_rebuild_sst_cache(iter); + if (result != TDB_SUCCESS) return result; + } + else + { + /* we free non-cached sources that are currently in the heap */ + for (int i = 0; i < iter->heap->num_sources; i++) + { + if (!iter->heap->sources[i]->is_cached) + { + tidesdb_merge_source_free(iter->heap->sources[i]); + } + } + iter->heap->num_sources = 0; + } + + /* we build source list from cached memtable + cached SST sources (zero malloc on hot path) */ + const int total_sources = iter->num_cached_mt_sources + iter->num_cached_sources; + tidesdb_merge_source_t **temp_sources; + if (iter->temp_sources && iter->temp_sources_capacity >= total_sources) + { + temp_sources = (tidesdb_merge_source_t **)iter->temp_sources; + } + else + { + const int new_cap = + total_sources > TDB_STACK_ITER_SOURCES ? 
total_sources : TDB_STACK_ITER_SOURCES; + void **new_arr = realloc(iter->temp_sources, new_cap * sizeof(tidesdb_merge_source_t *)); + if (!new_arr) return TDB_ERR_MEMORY; + iter->temp_sources = new_arr; + iter->temp_sources_capacity = new_cap; + temp_sources = (tidesdb_merge_source_t **)new_arr; + } + + int temp_count = 0; + + /* we add cached memtable sources (no allocation -- just pointer copy) */ + for (int i = 0; i < iter->num_cached_mt_sources; i++) + { + temp_sources[temp_count++] = (tidesdb_merge_source_t *)iter->cached_mt_sources[i]; + } + + /* we add cached SST sources */ + for (int i = 0; i < iter->num_cached_sources; i++) + { + temp_sources[temp_count++] = (tidesdb_merge_source_t *)iter->cached_sources[i]; + } + + /* we reposition sources to target key */ + for (int i = 0; i < temp_count; i++) + { + tidesdb_merge_source_t *source = temp_sources[i]; + + /** on forward-monotonic seeks, if source already has a key >= target, + * it is still the correct first entry >= target. skip the expensive re-seek. */ + if (forward_monotonic && source->current_kv != NULL) + { + const int cmp = cmp_fn(source->current_kv->key, source->current_kv->entry.key_size, key, + key_size, cmp_ctx); + if (cmp >= 0) + { + tidesdb_merge_heap_add_source(iter->heap, source); + continue; + } + } + + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = NULL; + + if (source->type == MERGE_SOURCE_MEMTABLE) + { + tidesdb_iter_seek_memtable_source(source, key, key_size, 1); + } + else if (source->type == MERGE_SOURCE_UNIFIED_MEMTABLE) + { + /* we build prefixed key and seek, then strip prefix via advance_to_cf */ + uint8_t pk_stack[TDB_PREFIXED_KEY_STACK_MAX]; + const size_t pk_total = TDB_UNIFIED_CF_PREFIX_SIZE + key_size; + uint8_t *pk = pk_total <= sizeof(pk_stack) ? 
pk_stack : malloc(pk_total); + if (pk) + { + tdb_build_prefixed_key(source->source.unified.cf_index, key, key_size, pk); + skip_list_cursor_t *cursor = source->source.unified.cursor; + if (skip_list_cursor_seek_ge(cursor, pk, pk_total) == 0) + { + tidesdb_unified_source_advance_to_cf(source, 1); + } + if (pk != pk_stack) free(pk); + } + } + else if (source->type == MERGE_SOURCE_BTREE) + { + tidesdb_iter_seek_btree_source_forward(source, key, key_size); + } + else if (source->type == MERGE_SOURCE_TXN_OPS) + { + tidesdb_iter_seek_txn_ops_source(source, key, key_size, 1); + } + else + { + tidesdb_iter_seek_sstable_source_forward(iter, source, key, key_size); + } + + if (source->current_kv != NULL) + { + tidesdb_merge_heap_add_source(iter->heap, source); + } + } + + return tidesdb_iter_find_visible_entry(iter, 1); +} +/** + * tidesdb_iter_seek_for_prev + * reposition the iterator for backward iteration at a target key + * @param iter iterator to reposition + * @param key target key (must not be NULL) + * @param key_size size of the target key in bytes (must not be 0) + * @return TDB_ERR_INVALID_ARGS on a NULL iter/key or a zero key_size, the error from + * tidesdb_iter_rebuild_sst_cache if the initial SST cache build fails, TDB_ERR_MEMORY if the + * scratch source array cannot be grown, otherwise the result of + * tidesdb_iter_find_visible_entry(iter, -1) + */ +int tidesdb_iter_seek_for_prev(tidesdb_iter_t *iter, const uint8_t *key, const size_t key_size) +{ + if (!iter || !key || key_size == 0) return TDB_ERR_INVALID_ARGS; + + /** a strictly-backward seek (new target < last result) lets sources with + * current_kv <= target keep their position. the comparison must be strict: + * iter_prev pops iter->current and advances its sources one entry past it, + * so a re-seek to exactly iter->current has to fall through and re-seek -- + * a <= test would skip it and return the preceding key. 
*/ + int backward_monotonic = 0; + const skip_list_comparator_fn cmp_fn = iter->heap->comparator; + void *cmp_ctx = iter->heap->comparator_ctx; + + if (iter->valid && iter->direction == -1 && iter->current && cmp_fn) + { + const int cmp = + cmp_fn(key, key_size, iter->current->key, iter->current->entry.key_size, cmp_ctx); + if (cmp < 0) backward_monotonic = 1; + } + /* the monotonic decision is made; the previous position can now be dropped */ + tidesdb_kv_pair_free(iter->current); + iter->current = NULL; + iter->valid = 0; + iter->direction = -1; + + /* we only rebuild SST cache on initial build -- see tidesdb_iter_seek comment */ + if (iter->num_cached_sources == 0) + { + const int result = tidesdb_iter_rebuild_sst_cache(iter); + if (result != TDB_SUCCESS) return result; + } + else + { + /* we free non-cached sources that are currently in the heap */ + for (int i = 0; i < iter->heap->num_sources; i++) + { + if (!iter->heap->sources[i]->is_cached) + { + tidesdb_merge_source_free(iter->heap->sources[i]); + } + } + iter->heap->num_sources = 0; + } + + /* we build source list from cached memtable + cached SST sources (zero malloc on hot path) */ + const int total_sources = iter->num_cached_mt_sources + iter->num_cached_sources; + tidesdb_merge_source_t **temp_sources; + if (iter->temp_sources && iter->temp_sources_capacity >= total_sources) + { + temp_sources = (tidesdb_merge_source_t **)iter->temp_sources; + } + else + { + const int new_cap = + total_sources > TDB_STACK_ITER_SOURCES ? 
total_sources : TDB_STACK_ITER_SOURCES; + void **new_arr = realloc(iter->temp_sources, new_cap * sizeof(tidesdb_merge_source_t *)); + if (!new_arr) return TDB_ERR_MEMORY; + iter->temp_sources = new_arr; + iter->temp_sources_capacity = new_cap; + temp_sources = (tidesdb_merge_source_t **)new_arr; + } + + int temp_count = 0; + /* we add cached memtable sources (no allocation -- just pointer copy) */ + for (int i = 0; i < iter->num_cached_mt_sources; i++) + { + temp_sources[temp_count++] = (tidesdb_merge_source_t *)iter->cached_mt_sources[i]; + } + /* we add cached SST sources */ + for (int i = 0; i < iter->num_cached_sources; i++) + { + temp_sources[temp_count++] = (tidesdb_merge_source_t *)iter->cached_sources[i]; + } + + /* we reposition sources to target key (backward) */ + for (int i = 0; i < temp_count; i++) + { + tidesdb_merge_source_t *source = temp_sources[i]; + + /** on backward-monotonic seeks, if source already has key <= target, + * it is still the correct last entry <= target. skip the expensive re-seek. */ + if (backward_monotonic && source->current_kv != NULL) + { + const int cmp = cmp_fn(source->current_kv->key, source->current_kv->entry.key_size, key, + key_size, cmp_ctx); + if (cmp <= 0) + { + tidesdb_merge_heap_add_source(iter->heap, source); + continue; + } + } + + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = NULL; + + if (source->type == MERGE_SOURCE_MEMTABLE) + { + tidesdb_iter_seek_memtable_source(source, key, key_size, -1); + } + else if (source->type == MERGE_SOURCE_UNIFIED_MEMTABLE) + { + /* we build the prefixed key on the stack when it fits, otherwise on the heap; + * if that allocation fails the source is left unpositioned and does not join the heap */ + uint8_t pk_stack[TDB_PREFIXED_KEY_STACK_MAX]; + const size_t pk_total = TDB_UNIFIED_CF_PREFIX_SIZE + key_size; + uint8_t *pk = pk_total <= sizeof(pk_stack) ? 
pk_stack : malloc(pk_total); + if (pk) + { + tdb_build_prefixed_key(source->source.unified.cf_index, key, key_size, pk); + skip_list_cursor_t *cursor = source->source.unified.cursor; + if (skip_list_cursor_seek_for_prev(cursor, pk, pk_total) == 0) + { + tidesdb_unified_source_advance_to_cf(source, 0); + } + if (pk != pk_stack) free(pk); + } + } + else if (source->type == MERGE_SOURCE_BTREE) + { + tidesdb_iter_seek_btree_source_backward(source, key, key_size); + } + else if (source->type == MERGE_SOURCE_TXN_OPS) + { + tidesdb_iter_seek_txn_ops_source(source, key, key_size, -1); + } + else + { + tidesdb_iter_seek_sstable_source_backward(iter, source, key, key_size); + } + + if (source->current_kv != NULL) + { + tidesdb_merge_heap_add_source(iter->heap, source); + } + } + + return tidesdb_iter_find_visible_entry(iter, -1); +} +/** + * tidesdb_iter_seek_to_first + * position the iterator on the first visible entry popped from the merge heap + * cached SST sources that have no current entry yet are first positioned on the first + * entry of their first klog block and added to the heap + * @param iter iterator to position + * @return TDB_SUCCESS with direction set to forward when a visible entry is found, + * TDB_ERR_NOT_FOUND when the heap drains without one, TDB_ERR_INVALID_ARGS if iter is NULL + */ +int tidesdb_iter_seek_to_first(tidesdb_iter_t *iter) +{ + if (!iter) return TDB_ERR_INVALID_ARGS; + + /* we add any lazy cached SST sources (not yet in heap) so they participate */ + for (int ci = 0; ci < iter->num_cached_sources; ci++) + { + tidesdb_merge_source_t *cs = iter->cached_sources[ci]; + if (cs && cs->type == MERGE_SOURCE_SSTABLE && !cs->current_kv) + { + /* we position at first entry via block read */ + tidesdb_iter_release_sst_source_block(cs); + tidesdb_iter_clear_lazy(cs); + + tidesdb_sstable_t *sst = cs->source.sstable.sst; + block_manager_cursor_t *kc = cs->source.sstable.klog_cursor; + block_manager_cursor_goto_first(kc); + /* we only read while the cursor is inside the data region (or no data end is recorded) */ + if (sst->klog_data_end_offset == 0 || kc->current_pos < sst->klog_data_end_offset) + { + block_manager_block_t *block = tidesdb_read_block(cs->source.sstable.db, sst, kc); + if (block) + { + const uint8_t *data = block->data; + const size_t data_size = block->size; + tidesdb_klog_block_t *kb = NULL; + if (tidesdb_klog_block_deserialize(data, data_size, &kb, 0) == 0 && kb && + kb->num_entries > 0) + { + cs->source.sstable.current_block = kb; + cs->source.sstable.current_block_data = block; + 
cs->source.sstable.current_entry_idx = 0; + + const uint8_t *val = kb->inline_values[0]; + uint8_t *vv = NULL; + if (kb->entries[0].vlog_offset > 0) + { + tidesdb_vlog_read_value(cs->source.sstable.db, sst, + kb->entries[0].vlog_offset, + kb->entries[0].value_size, &vv); + val = vv; + } + cs->current_kv = tidesdb_kv_pair_create( + kb->keys[0], kb->entries[0].key_size, val, kb->entries[0].value_size, + kb->entries[0].ttl, kb->entries[0].seq, + kb->entries[0].flags & TDB_KV_TOMBSTONE_FLAG_MASK); + free(vv); + + if (cs->current_kv) tidesdb_merge_heap_add_source(iter->heap, cs); + } + else + { + if (kb) tidesdb_klog_block_free(kb); + block_manager_block_release(block); + } + } + } + } + } + + tidesdb_kv_pair_free(iter->current); + iter->current = NULL; + iter->valid = 0; + + while (!tidesdb_merge_heap_empty(iter->heap)) + { + tidesdb_kv_pair_t *kv = tidesdb_merge_heap_pop(iter->heap, NULL); + if (!kv) break; + + /* we check visibility (isolation, TTL, tombstones) */ + const int visible = tidesdb_iter_kv_visible(iter, kv); + if (visible == -1) + { + tidesdb_iter_skip_tombstone_versions(iter, kv, 1); + tidesdb_kv_pair_free(kv); + continue; + } + + if (visible == 0) + { + tidesdb_kv_pair_free(kv); + continue; + } + + iter->current = kv; + iter->valid = 1; + iter->direction = 1; /* set forward direction */ + return TDB_SUCCESS; + } + + return TDB_ERR_NOT_FOUND; +} + +/** + * tidesdb_iter_seek_to_last + * position the iterator on the last visible entry: every source is moved to its last + * entry, the maximum key is taken from the max-heap top, and positioning is delegated + * to tidesdb_iter_seek_for_prev on a copy of that key + * @param iter iterator to position + * @return the result of tidesdb_iter_seek_for_prev, TDB_ERR_NOT_FOUND when no source has an + * entry, TDB_ERR_MEMORY if the key copy cannot be allocated, TDB_ERR_INVALID_ARGS if iter + * is NULL + */ +int tidesdb_iter_seek_to_last(tidesdb_iter_t *iter) +{ + if (!iter) return TDB_ERR_INVALID_ARGS; + + /****** we find the maximum key across all sources, then use seek_for_prev + ***** to position correctly. seek_for_prev aligns all sources at the + **** target key, ensuring tombstones from every source are visible. + *** this avoids the bug where a pop loop in seek_to_last over-retreats + ** a tombstone source, causing its tombstones to be missed when + * prev() later encounters the corresponding data entries. 
*/ + + /* first, we find the max key by positioning all sources at their last entries */ + tidesdb_kv_pair_free(iter->current); + iter->current = NULL; + iter->valid = 0; + iter->direction = -1; + + const int total_sources = iter->heap->num_sources; + + /* also process cached SST sources not in the heap */ + for (int ci = 0; ci < iter->num_cached_sources; ci++) + { + tidesdb_merge_source_t *cs = iter->cached_sources[ci]; + /* we check if already in heap */ + int in_heap = 0; + for (int hi = 0; hi < total_sources; hi++) + { + if (iter->heap->sources[hi] == cs) + { + in_heap = 1; + break; + } + } + /* a NULL cache slot is skipped, matching the guard in tidesdb_iter_seek_to_first */ + if (cs && !in_heap && cs->type == MERGE_SOURCE_SSTABLE) + { + /* we add to heap so it gets positioned below */ + tidesdb_merge_heap_add_source(iter->heap, cs); + } + } + + for (int i = 0; i < iter->heap->num_sources; i++) + { + tidesdb_merge_source_t *source = iter->heap->sources[i]; + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = NULL; + + if (source->type == MERGE_SOURCE_MEMTABLE) + { + if (skip_list_cursor_goto_last(source->source.memtable.cursor) == 0) + { + uint8_t *key, *value; + size_t key_size, value_size; + int64_t ttl; + uint8_t deleted; + uint64_t seq; + + if (skip_list_cursor_get_with_seq(source->source.memtable.cursor, &key, &key_size, + &value, &value_size, &ttl, &deleted, &seq) == 0) + { + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = + tidesdb_kv_pair_create(key, key_size, value, value_size, ttl, seq, deleted); + } + } + } + else if (source->type == MERGE_SOURCE_UNIFIED_MEMTABLE) + { + /** we seek to the end of this CF's key range by using the next CF's prefix + * (cf_index + 1) as the upper bound, then let advance_to_cf step backward + * to the last entry that carries our own prefix. 
*/ + uint8_t end_prefix[TDB_UNIFIED_CF_PREFIX_SIZE]; + const uint32_t next_cf = source->source.unified.cf_index + 1; + tdb_encode_be32(next_cf, end_prefix); + if (skip_list_cursor_seek_for_prev(source->source.unified.cursor, end_prefix, + TDB_UNIFIED_CF_PREFIX_SIZE) == 0) + { + tidesdb_unified_source_advance_to_cf(source, 0); + } + } + else if (source->type == MERGE_SOURCE_BTREE) + { + if (btree_cursor_goto_last(source->source.btree.cursor) == 0) + { + uint8_t *key = NULL, *value = NULL; + size_t key_size = 0, value_size = 0; + uint64_t vlog_offset = 0, seq = 0; + int64_t ttl = 0; + uint8_t deleted = 0; + + if (btree_cursor_get(source->source.btree.cursor, &key, &key_size, &value, + &value_size, &vlog_offset, &seq, &ttl, &deleted) == 0) + { + const uint8_t *actual_value = value; + size_t actual_value_size = value_size; + uint8_t *vlog_value = NULL; + if (vlog_offset > 0) + { + if (tidesdb_btree_read_vlog_value(source->source.btree.vlog_cursor, + vlog_offset, source->config, &vlog_value, + &actual_value_size, value_size) == 0) + { + actual_value = vlog_value; + } + else + { + actual_value = NULL; + actual_value_size = 0; + } + } + + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = tidesdb_kv_pair_create( + key, key_size, actual_value, actual_value_size, ttl, seq, deleted); + free(vlog_value); + } + } + } + else if (source->type == MERGE_SOURCE_TXN_OPS) + { + /* we position at the last entry in the sorted txn ops index */ + if (source->source.txn_ops.count > 0) + { + source->source.txn_ops.pos = source->source.txn_ops.count - 1; + const int op_idx = + source->source.txn_ops.sorted_indices[source->source.txn_ops.pos]; + const tidesdb_txn_op_t *op = &source->source.txn_ops.txn->ops[op_idx]; + + source->current_kv = + tidesdb_kv_pair_create(op->key, op->key_size, op->value, op->value_size, + op->ttl, UINT64_MAX, tidesdb_txn_op_kv_flags(op)); + } + } + else + { + /* klog sstable source */ + const uint64_t num_blocks = 
source->source.sstable.sst->num_klog_blocks; + block_manager_cursor_t *cursor = source->source.sstable.klog_cursor; + + if (num_blocks > 0) + { + /* footer-based O(1) seek to the last data block instead of + * walking every block forward -- the linear walk made + * seek_to_last cost scale with sstable size. the klog file + * appends bloom/index/metadata blocks after the data region, so + * we anchor at klog_data_end_offset rather than the file end. + * legacy sstables without that offset fall back to the walk */ + const uint64_t data_end = source->source.sstable.sst->klog_data_end_offset; + if (data_end > 0) + { + block_manager_cursor_goto_last_before(cursor, data_end); + } + else if (block_manager_cursor_goto_first(cursor) == 0) + { + for (uint64_t b = 1; b < num_blocks; b++) + { + if (block_manager_cursor_next(cursor) != 0) break; + } + } + + /* we clean up old data from iterator creation before reading new block */ + tidesdb_iter_release_sst_source_block(source); + + block_manager_block_t *block = + block_manager_cursor_read(source->source.sstable.klog_cursor); + if (block) + { + const uint8_t *data = block->data; + size_t data_size = block->size; + uint8_t *decompressed = NULL; + + if (source->config->compression_algorithm != TDB_COMPRESS_NONE) + { + size_t decompressed_size; + decompressed = decompress_data(block->data, block->size, &decompressed_size, + source->config->compression_algorithm); + if (decompressed) + { + data = decompressed; + data_size = decompressed_size; + /* we keep decompressed buffer, deserialized pointers reference it */ + source->source.sstable.decompressed_data = decompressed; + } + } + + if (tidesdb_klog_block_deserialize( + data, data_size, &source->source.sstable.current_block, 1) == 0) + { + if (source->source.sstable.current_block->num_entries > 0) + { + /* the deserialization succeeded, it is now safe to store block */ + source->source.sstable.current_block_data = block; + + /* last entry in last block */ + const int idx = + 
(int)source->source.sstable.current_block->num_entries - 1; + source->source.sstable.current_entry_idx = idx; + + tidesdb_klog_block_t *kb = source->source.sstable.current_block; + const uint8_t *value = kb->inline_values[idx]; + + uint8_t *vlog_value = NULL; + if (kb->entries[idx].vlog_offset > 0) + { + tidesdb_vlog_read_value(source->source.sstable.db, + source->source.sstable.sst, + kb->entries[idx].vlog_offset, + kb->entries[idx].value_size, &vlog_value); + value = vlog_value; + } + + tidesdb_kv_pair_free(source->current_kv); + source->current_kv = tidesdb_kv_pair_create( + kb->keys[idx], kb->entries[idx].key_size, value, + kb->entries[idx].value_size, kb->entries[idx].ttl, + kb->entries[idx].seq, + kb->entries[idx].flags & TDB_KV_TOMBSTONE_FLAG_MASK); + + free(vlog_value); + } + else + { + /* empty block, release it */ + block_manager_block_release(block); + } + } + else + { + /* deserialization failed! we must release block */ + block_manager_block_release(block); + } + + /** we don't free decompressed or release block if we're still using the + * deserialized data (stored in current_block_data) */ + } + } + } + } + + /* we find the max key across all sources from the heap */ + for (int i = (iter->heap->num_sources / 2) - 1; i >= 0; i--) + { + heap_sift_down_max(iter->heap, i); + } + + /* we get the max key from the heap top */ + if (iter->heap->num_sources == 0 || !iter->heap->sources[0]->current_kv) + { + return TDB_ERR_NOT_FOUND; + } + + const tidesdb_kv_pair_t *max_kv = iter->heap->sources[0]->current_kv; + const size_t max_key_size = max_kv->entry.key_size; + + /*** we copy the max key to a local buffer before calling seek_for_prev. + ** seek_for_prev frees source->current_kv (which is max_kv), so the + * pointer would dangle if we passed it directly. */ + uint8_t key_stack[TDB_ITER_STACK_KEY_SIZE]; + uint8_t *max_key_copy = + max_key_size <= sizeof(key_stack) ? 
key_stack : (uint8_t *)malloc(max_key_size); + if (!max_key_copy) return TDB_ERR_MEMORY; + memcpy(max_key_copy, max_kv->key, max_key_size); + + /**** we delegate to seek_for_prev which positions all sources at max_key + *** and handles tombstone visibility correctly across all sources. + ** this avoids the bug where the pop loop over-retreats a tombstone + * source, causing its tombstones to be missed during prev(). */ + const int result = tidesdb_iter_seek_for_prev(iter, max_key_copy, max_key_size); + + if (max_key_copy != key_stack) free(max_key_copy); + return result; +} + +/** + * tidesdb_iter_next + * move the iterator forward to the next visible entry, skipping further versions of + * the key it is currently on + * @param iter iterator (must currently be valid) + * @return TDB_SUCCESS when positioned on an entry, TDB_ERR_NOT_FOUND when the heap is + * exhausted, TDB_ERR_INVALID_ARGS if iter is NULL or not valid + */ +int tidesdb_iter_next(tidesdb_iter_t *iter) +{ + if (!iter) return TDB_ERR_INVALID_ARGS; + if (!iter->valid) return TDB_ERR_INVALID_ARGS; + + /* we toggle pop buffer slot so new pops write to a different + * buffer than the previous iter->current (avoids clobbering prev) */ + iter->heap->pop_buf_slot ^= 1; + + /* we check if direction changed from backward to forward */ + const int direction_changed = (iter->direction == -1); + + /* we set direction to forward */ + iter->direction = 1; + + /***** we keep previous entry alive for duplicate detection instead + **** of copying its key into a separate buffer. This avoids a memcpy (and + *** potential malloc for keys > TDB_ITER_STACK_KEY_SIZE) per iter_next call. + ** prev is freed once we find the next visible entry or at end-of-scan. 
*/ + tidesdb_kv_pair_t *prev = iter->current; + iter->current = NULL; + iter->valid = 0; + + /* if direction changed, we advance all sources and rebuild as min-heap */ + if (direction_changed) + { + for (int i = 0; i < iter->heap->num_sources; i++) + { + tidesdb_merge_source_t *source = iter->heap->sources[i]; + if (tidesdb_merge_source_advance(source) != TDB_SUCCESS) + { + source->current_kv = NULL; + } + } + + /* we rebuild as min-heap for forward iteration */ + for (int i = (iter->heap->num_sources / 2) - 1; i >= 0; i--) + { + heap_sift_down(iter->heap, i); + } + } + + /* we pop from heap until we find next visible entry */ + while (!tidesdb_merge_heap_empty(iter->heap)) + { + tidesdb_kv_pair_t *kv = tidesdb_merge_heap_pop(iter->heap, NULL); + if (!kv) break; + + /* we skip duplicates (same key as previous) */ + if (prev && prev->entry.key_size == kv->entry.key_size && + memcmp(prev->key, kv->key, prev->entry.key_size) == 0) + { + tidesdb_kv_pair_free(kv); + continue; + } + /* visible == -1 additionally skips the remaining versions of this key, 0 drops only this entry */ + const int visible = tidesdb_iter_kv_visible(iter, kv); + if (visible == -1) + { + tidesdb_iter_skip_tombstone_versions(iter, kv, 1); + tidesdb_kv_pair_free(kv); + continue; + } + + if (visible == 0) + { + tidesdb_kv_pair_free(kv); + continue; + } + + /* we only track reads for isolation levels that need conflict detection + * (REPEATABLE_READ and SERIALIZABLE). for READ_COMMITTED and below the + * function would just early-exit, but skipping the call entirely avoids + * the overhead of a lot of function calls during a full scan. 
*/ + if (iter->txn->isolation_level >= TDB_ISOLATION_REPEATABLE_READ) + { + tidesdb_txn_add_to_read_set(iter->txn, iter->cf, kv->key, kv->entry.key_size, + kv->entry.seq); + } + + tidesdb_kv_pair_free(prev); + iter->current = kv; + iter->valid = 1; + return TDB_SUCCESS; + } + + tidesdb_kv_pair_free(prev); + return TDB_ERR_NOT_FOUND; +} +/** + * tidesdb_iter_prev + * move the iterator backward to the previous visible entry, skipping further versions + * of the key it is currently on + * @param iter iterator (must currently be valid) + * @return TDB_SUCCESS when positioned on an entry, TDB_ERR_NOT_FOUND when the heap is + * exhausted, TDB_ERR_INVALID_ARGS if iter is NULL or not valid + */ +int tidesdb_iter_prev(tidesdb_iter_t *iter) +{ + if (!iter) return TDB_ERR_INVALID_ARGS; + if (!iter->valid) return TDB_ERR_INVALID_ARGS; + + /* we toggle pop buffer slot so new pops write to a different + * buffer than the previous iter->current (avoids clobbering prev) */ + iter->heap->pop_buf_slot ^= 1; + + /* we check if direction changed from forward to backward */ + const int direction_changed = (iter->direction == 1); + + /* we set direction to backward */ + iter->direction = -1; + + /* we keep previous entry alive for duplicate detection (same as iter_next) */ + tidesdb_kv_pair_t *prev = iter->current; + iter->current = NULL; + iter->valid = 0; + + /* if direction changed, we retreat all sources and rebuild as max-heap */ + if (direction_changed) + { + for (int i = 0; i < iter->heap->num_sources; i++) + { + tidesdb_merge_source_t *source = iter->heap->sources[i]; + if (tidesdb_merge_source_retreat(source) != TDB_SUCCESS) + { + source->current_kv = NULL; + } + } + + /* we rebuild as max-heap for backward iteration */ + for (int i = (iter->heap->num_sources / 2) - 1; i >= 0; i--) + { + heap_sift_down_max(iter->heap, i); + } + } + + /* we pop from max-heap until we find previous visible entry */ + while (!tidesdb_merge_heap_empty(iter->heap)) + { + tidesdb_kv_pair_t *kv = tidesdb_merge_heap_pop_max(iter->heap); + if (!kv) break; + + /* we skip duplicates (same key as previous) */ + if (prev && prev->entry.key_size == kv->entry.key_size && + memcmp(prev->key, kv->key, prev->entry.key_size) == 0) + { + tidesdb_kv_pair_free(kv); + continue; + } + + /* we skip invisible entries */ + const int visible = 
tidesdb_iter_kv_visible(iter, kv); + if (visible == -1) + { + tidesdb_iter_skip_tombstone_versions(iter, kv, -1); + tidesdb_kv_pair_free(kv); + continue; + } + + if (visible == 0) + { + tidesdb_kv_pair_free(kv); + continue; + } + + /* we only track reads for REPEATABLE_READ and SERIALIZABLE */ + if (iter->txn->isolation_level >= TDB_ISOLATION_REPEATABLE_READ) + { + tidesdb_txn_add_to_read_set(iter->txn, iter->cf, kv->key, kv->entry.key_size, + kv->entry.seq); + } + + tidesdb_kv_pair_free(prev); + iter->current = kv; + iter->valid = 1; + return TDB_SUCCESS; + } + + tidesdb_kv_pair_free(prev); + return TDB_ERR_NOT_FOUND; +} +/** + * tidesdb_iter_valid + * report whether the iterator is positioned on an entry + * @param iter iterator (may be NULL) + * @return iter->valid, or 0 if iter is NULL + */ +int tidesdb_iter_valid(tidesdb_iter_t *iter) +{ + if (!iter) return 0; + return iter->valid; +} +/** + * tidesdb_iter_key + * expose the key of the current entry without copying it + * @param iter iterator (must be valid) + * @param key receives a pointer into the iterator's current entry + * @param key_size receives the key size in bytes + * @return TDB_SUCCESS, or TDB_ERR_INVALID_ARGS on a NULL argument or an invalid iterator + * NOTE(review): the pointer aliases iter->current, so it presumably stops being usable once + * the iterator is moved or freed -- confirm against the public API contract + */ +int tidesdb_iter_key(tidesdb_iter_t *iter, uint8_t **key, size_t *key_size) +{ + if (!iter || !key || !key_size) return TDB_ERR_INVALID_ARGS; + if (!iter->valid || !iter->current) return TDB_ERR_INVALID_ARGS; + + *key = iter->current->key; + *key_size = iter->current->entry.key_size; + + return TDB_SUCCESS; +} +/** + * tidesdb_iter_value + * expose the value of the current entry without copying it + * @param iter iterator (must be valid) + * @param value receives a pointer into the iterator's current entry + * @param value_size receives the value size in bytes + * @return TDB_SUCCESS, or TDB_ERR_INVALID_ARGS on a NULL argument or an invalid iterator + */ +int tidesdb_iter_value(tidesdb_iter_t *iter, uint8_t **value, size_t *value_size) +{ + if (!iter || !value || !value_size) return TDB_ERR_INVALID_ARGS; + if (!iter->valid || !iter->current) return TDB_ERR_INVALID_ARGS; + + *value = iter->current->value; + *value_size = iter->current->entry.value_size; + + return TDB_SUCCESS; +} +/** + * tidesdb_iter_key_value + * expose key and value of the current entry in one call, without copying either + * @param iter iterator (must be valid) + * @param key receives a pointer to the current key + * @param key_size receives the key size in bytes + * @param value receives a pointer to the current value + * @param value_size receives the value size in bytes + * @return TDB_SUCCESS, or TDB_ERR_INVALID_ARGS on a NULL argument or an invalid iterator + */ +int tidesdb_iter_key_value(tidesdb_iter_t *iter, uint8_t **key, size_t *key_size, uint8_t **value, + size_t *value_size) +{ + if (!iter || !key || !key_size || !value || !value_size) return TDB_ERR_INVALID_ARGS; + if (!iter->valid || !iter->current) return TDB_ERR_INVALID_ARGS; + + *key = iter->current->key; + *key_size = iter->current->entry.key_size; + *value = iter->current->value; + *value_size = iter->current->entry.value_size; + + return TDB_SUCCESS; +} +/** + * tidesdb_iter_free + * release an iterator: its current entry, its merge heap, its cached SST and memtable + * sources, its scratch source array and the iterator itself + * @param iter iterator to free (NULL is a no-op) + */ +void tidesdb_iter_free(tidesdb_iter_t *iter) +{ + if (!iter) return; + + tidesdb_kv_pair_free(iter->current); + tidesdb_merge_heap_free(iter->heap); + + 
if (iter->cached_sources) + { + for (int i = 0; i < iter->num_cached_sources; i++) + { + tidesdb_merge_source_free(iter->cached_sources[i]); + } + free(iter->cached_sources); + } + + if (iter->cached_mt_sources) + { + for (int i = 0; i < iter->num_cached_mt_sources; i++) + { + tidesdb_merge_source_free(iter->cached_mt_sources[i]); + } + free(iter->cached_mt_sources); + } + + free(iter->temp_sources); + free(iter); +} + +/** + * tidesdb_sort_wal_files + * sort WAL files by ID in ascending order + * the queue is drained into a temporary array, bubble-sorted, and refilled in sorted order. + * ids are parsed from the file name (the part after the last PATH_SEPARATOR) on every + * comparison; the tdb_parse_wal_id result is ignored here. + * NOTE(review): a name that fails to parse presumably keeps its initial id of 0 and sorts + * first -- confirm against tdb_parse_wal_id. + * if the temporary array cannot be allocated the queue is left in its original order. + * @param wal_files queue of WAL file paths + */ +static void tidesdb_sort_wal_files(queue_t *wal_files) +{ + const size_t wal_count = queue_size(wal_files); + if (wal_count <= 1) return; + + char **wal_array = malloc(wal_count * sizeof(char *)); + if (!wal_array) return; + + for (size_t i = 0; i < wal_count; i++) + { + wal_array[i] = queue_dequeue(wal_files); + } + /* adjacent pairs are swapped only when strictly out of order, so equal ids keep their order */ + for (size_t i = 0; i < wal_count - 1; i++) + { + for (size_t j = 0; j < wal_count - i - 1; j++) + { + uint64_t id1 = 0, id2 = 0; + const char *name1 = strrchr(wal_array[j], PATH_SEPARATOR[0]); + const char *name2 = strrchr(wal_array[j + 1], PATH_SEPARATOR[0]); + if (name1) + name1++; + else + name1 = wal_array[j]; + if (name2) + name2++; + else + name2 = wal_array[j + 1]; + + tdb_parse_wal_id(name1, &id1); + tdb_parse_wal_id(name2, &id2); + + if (id1 > id2) + { + char *temp = wal_array[j]; + wal_array[j] = wal_array[j + 1]; + wal_array[j + 1] = temp; + } + } + } + /* the paths go back into the same queue in sorted order; the enqueue result is not checked */ + for (size_t i = 0; i < wal_count; i++) + { + queue_enqueue(wal_files, wal_array[i]); + } + free(wal_array); +} + +/** + * tidesdb_recover_single_wal + * recover a single WAL file and queue for flush + * every early-exit path frees wal_path; the WAL file itself is only unlinked when it + * replays to an empty memtable + * @param cf column family + * @param wal_path path to WAL file (ownership transferred, will be freed) + */ +static void tidesdb_recover_single_wal(tidesdb_column_family_t *cf, char *wal_path) +{ + skip_list_t *recovered_memtable = NULL; + const int recover_result = tidesdb_wal_recover(cf, wal_path, &recovered_memtable); + + if (recover_result != TDB_SUCCESS || 
!recovered_memtable) + { + if (recovered_memtable) skip_list_free(recovered_memtable); + free(wal_path); + return; + } + + const int recovered_entries = skip_list_count_entries(recovered_memtable); + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' recovered memtable from WAL %s (%d entries)", cf->name, + wal_path, recovered_entries); + /* a WAL that replays to nothing has no data to flush, so the file is removed */ + if (recovered_entries == 0) + { + skip_list_free(recovered_memtable); + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' empty recovered memtable, deleting WAL, %s", cf->name, + wal_path); + tdb_unlink(wal_path); + free(wal_path); + return; + } + /* we reopen the WAL so the immutable memtable below can carry its block manager handle */ + block_manager_t *wal_bm = NULL; + if (block_manager_open(&wal_bm, wal_path, TDB_SYNC_FULL) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' failed to reopen WAL for flush tracking, %s", cf->name, + wal_path); + skip_list_free(recovered_memtable); + free(wal_path); + return; + } + + tidesdb_immutable_memtable_t *imm = calloc(1, sizeof(tidesdb_immutable_memtable_t)); + if (!imm) + { + block_manager_close(wal_bm); + skip_list_free(recovered_memtable); + free(wal_path); + return; + } + /* imm takes over the recovered skip list and the WAL handle; it starts with one reference */ + imm->skip_list = recovered_memtable; + imm->wal = wal_bm; + imm->id = 0; + imm->generation = 0; + atomic_init(&imm->refcount, 1); + atomic_init(&imm->writers, 0); + atomic_init(&imm->flushed, 0); + + if (queue_enqueue(cf->immutable_memtables, imm) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' failed to enqueue recovered memtable", cf->name); + (void)tidesdb_immutable_memtable_unref(imm); + free(wal_path); + return; + } + + /* we publish lock-free snapshot so readers see the recovered immutable */ + (void)tidesdb_imm_snap_publish(cf); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' has queued recovered memtable for async flush (WAL: %s)", + cf->name, wal_path); + /* best effort: if the work item cannot be allocated, no flush is scheduled here and the + * memtable simply stays in cf->immutable_memtables */ + tidesdb_flush_work_t *work = malloc(sizeof(tidesdb_flush_work_t)); + if (work) + { + work->cf = cf; + work->imm = imm; + work->sst_id = atomic_fetch_add_explicit(&cf->next_sstable_id, 1, memory_order_relaxed); + work->unified_sl = NULL; + work->unified_barrier = NULL; + 
TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' allocated SSTable ID %" PRIu64 " for recovered WAL flush", cf->name, + work->sst_id); + /* the queued work item holds its own reference; it is dropped again below if the + * enqueue fails */ + tidesdb_immutable_memtable_ref(imm); + + atomic_fetch_add_explicit(&cf->db->flush_pending_count, 1, memory_order_release); + atomic_fetch_add_explicit(&cf->flush_pending_count, 1, memory_order_release); + + if (queue_enqueue(cf->db->flush_queue, work) != 0) + { + atomic_fetch_sub_explicit(&cf->db->flush_pending_count, 1, memory_order_release); + atomic_fetch_sub_explicit(&cf->flush_pending_count, 1, memory_order_release); + tidesdb_immutable_memtable_unref(imm); + free(work); + } + } + + free(wal_path); +} + +/** + * tidesdb_recover_wals + * discover and recover all WAL files for a column family + * the WAL whose id matches the active memtable is replayed in place; every other WAL is + * handed to tidesdb_recover_single_wal. failures on an individual WAL are logged or + * dropped and do not change the return value. + * @param cf column family + * @return TDB_SUCCESS on success, TDB_ERR_IO if the CF directory cannot be opened, + * TDB_ERR_MEMORY if the WAL path queue cannot be allocated + */ +static int tidesdb_recover_wals(tidesdb_column_family_t *cf) +{ + DIR *dir = opendir(cf->directory); + if (!dir) return TDB_ERR_IO; + + queue_t *wal_files = queue_new(); + if (!wal_files) + { + closedir(dir); + return TDB_ERR_MEMORY; + } + + /* we collect every directory entry whose name starts with TDB_WAL_PREFIX */ + struct dirent *entry; + while ((entry = readdir(dir)) != NULL) + { + if (strstr(entry->d_name, TDB_WAL_PREFIX) == entry->d_name) + { + const size_t path_len = strlen(cf->directory) + strlen(entry->d_name) + 2; + char *wal_path = malloc(path_len); + if (wal_path) + { + snprintf(wal_path, path_len, "%s" PATH_SEPARATOR "%s", cf->directory, + entry->d_name); + if (queue_enqueue(wal_files, wal_path) != 0) + { + free(wal_path); + } + } + } + } + closedir(dir); + + /* we restore next_sstable_id from manifest before WAL recovery. + * the manifest pointer is checked before it is dereferenced; a missing manifest + * is treated like sequence 0 (nothing to restore) */ + const uint64_t manifest_seq = + cf->manifest ? atomic_load(&cf->manifest->sequence) : 0; + if (manifest_seq > 0) + { + atomic_store_explicit(&cf->next_sstable_id, manifest_seq, memory_order_relaxed); + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' pre-loaded next_sstable_id=%" PRIu64 + " from manifest before WAL recovery", + cf->name, manifest_seq); + } + + tidesdb_sort_wal_files(wal_files); + + /* 
create_column_family adopted the highest-id wal as the active memtable's + * wal (already open + validated). recovery replays that one in place into + * the live active skip list and leaves the file alone; the lower-id wals + * are immutables, recovered and flushed the usual way. */ + tidesdb_memtable_t *active_mt = + atomic_load_explicit(&cf->active_memtable, memory_order_acquire); + const uint64_t active_wal_id = active_mt ? active_mt->id : 0; + + while (!queue_is_empty(wal_files)) + { + char *wal_path = queue_dequeue(wal_files); + if (!wal_path) continue; + + const char *wal_name = strrchr(wal_path, PATH_SEPARATOR[0]); + wal_name = wal_name ? wal_name + 1 : wal_path; + uint64_t wid = 0; + const int parsed = tdb_parse_wal_id(wal_name, &wid); + + if (parsed && active_mt && active_mt->wal && wid == active_wal_id) + { + /* the active memtable's own wal -- replay in place, keep the file */ + const int rc = tidesdb_wal_replay_into(cf, active_mt->wal, active_mt->skip_list); + if (rc != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "CF '%s' failed to replay active WAL %s (error %d)", + cf->name, wal_path, rc); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' replayed active WAL %s into active memtable (%d entries)", + cf->name, wal_path, skip_list_count_entries(active_mt->skip_list)); + } + free(wal_path); + } + else + { + /* ownership of wal_path moves to tidesdb_recover_single_wal, which frees it */ + tidesdb_recover_single_wal(cf, wal_path); + } + } + + /* keep the shared sstable/wal id space monotonic past the active wal so a + * later rotation cannot allocate wal_<active_wal_id> and truncate the live + * active wal (rotation derives the new wal id from next_sstable_id) */ + { + uint64_t cur = atomic_load_explicit(&cf->next_sstable_id, memory_order_relaxed); + if (cur < active_wal_id + 1) + { + atomic_store_explicit(&cf->next_sstable_id, active_wal_id + 1, memory_order_relaxed); + } + } + + queue_free(wal_files); + return TDB_SUCCESS; +} + +/** + * tidesdb_recover_single_sstable + * recover a single sstable from disk + * @param cf column family + * 
@param entry directory entry for the .klog file + */ +static void tidesdb_recover_single_sstable(tidesdb_column_family_t *cf, const struct dirent *entry) +{ + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' found .klog file %s", cf->name, entry->d_name); + + int level_num = 1; + int partition_num = -1; + unsigned long long sst_id_ull = 0; + char sst_base[TDB_MAX_PATH_LEN]; + int parsed = 0; + + /* we try parsing partitioned format first -- L{level}P{partition}_{id}.klog */ + if (tdb_parse_sstable_partitioned(entry->d_name, &level_num, &partition_num, &sst_id_ull)) + { + snprintf(sst_base, sizeof(sst_base), + "%s" PATH_SEPARATOR TDB_LEVEL_PREFIX "%d" TDB_LEVEL_PARTITION_PREFIX "%d", + cf->directory, level_num, partition_num); + parsed = 1; + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Parsed partitioned SSTable level=%d, partition=%d, id=%" PRIu64, level_num, + partition_num, (uint64_t)sst_id_ull); + } + /* we try non-partitioned format-- L{level}_{id}.klog */ + else if (tdb_parse_sstable_non_partitioned(entry->d_name, &level_num, &sst_id_ull)) + { + snprintf(sst_base, sizeof(sst_base), "%s" PATH_SEPARATOR TDB_LEVEL_PREFIX "%d", + cf->directory, level_num); + parsed = 1; + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' parsed non-partitioned SSTable level=%d, id=%" PRIu64, + cf->name, level_num, (uint64_t)sst_id_ull); + } + + if (!parsed) return; + + const uint64_t sst_id = (uint64_t)sst_id_ull; + + /* we check manifest to see if this sstable is complete */ + const int in_manifest = tidesdb_manifest_has_sstable(cf->manifest, level_num, sst_id); + + if (!in_manifest) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "CF '%s' SSTable %" PRIu64 + " at level %d not in manifest, deleting (incomplete write)", + cf->name, sst_id, level_num); + + char klog_path[TDB_MAX_PATH_LEN]; + char vlog_path[TDB_MAX_PATH_LEN]; +#ifndef _MSC_VER +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-truncation" +#endif + snprintf(klog_path, sizeof(klog_path), "%s_%" PRIu64 TDB_SSTABLE_KLOG_EXT, sst_base, + sst_id); + 
snprintf(vlog_path, sizeof(vlog_path), "%s_%" PRIu64 TDB_SSTABLE_VLOG_EXT, sst_base, + sst_id); +#ifndef _MSC_VER +#pragma GCC diagnostic pop +#endif + tdb_unlink(klog_path); + tdb_unlink(vlog_path); + return; + } + + tidesdb_sstable_t *sst = tidesdb_sstable_create(cf->db, sst_base, sst_id, &cf->config); + if (!sst) return; + + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' is recovering SSTable %" PRIu64 " at level %d", cf->name, + sst_id, level_num); + + if (tidesdb_sstable_load(cf->db, sst) != TDB_SUCCESS) + { + /* this sstable is referenced by the manifest, so its files are kept on + * disk rather than deleted -- a load failure can come from a write side + * bug as readily as from genuine media corruption, and deleting + * manifest referenced data would turn a repairable fault into permanent + * loss. the sstable is skipped, so its keys are absent from this open + * until the files are repaired, and the loud log surfaces the fault. + * sst is not marked_for_deletion, so the unref frees only the struct. 
*/ + TDB_DEBUG_LOG(TDB_LOG_ERROR, + "CF '%s' SSTable %" PRIu64 + " at level %d is referenced by the manifest but " + "failed to load -- keeping its files on disk and skipping it", + cf->name, sst_id, level_num); + + tidesdb_sstable_unref(cf->db, sst); + return; + } + + int current_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + while (current_levels < level_num) + { + if (tidesdb_add_level(cf) != TDB_SUCCESS) break; + current_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + } + + if (level_num <= current_levels) + { + tidesdb_level_add_sstable(cf->levels[level_num - 1], sst); + tidesdb_bump_sstable_layout_version(cf); + } + + tidesdb_sstable_unref(cf->db, sst); +} + +/** + * sstable_cmp_by_id + * qsort comparator for sorting sstables by id ascending + * @param a pointer to first sstable pointer + * @param b pointer to second sstable pointer + * @return negative if a < b, 0 if equal, positive if a > b + */ +static int sstable_cmp_by_id(const void *a, const void *b) +{ + const tidesdb_sstable_t *sa = *(const tidesdb_sstable_t *const *)a; + const tidesdb_sstable_t *sb = *(const tidesdb_sstable_t *const *)b; + if (sa->id < sb->id) return -1; + if (sa->id > sb->id) return 1; + return 0; +} + +/** + * tidesdb_recover_sstables + * discovers and recovers all sstables for a column family from disk + * sorts level 0 by id after recovery to restore newest-at-highest-index invariant + * @param cf column family + * @return TDB_SUCCESS or error code + */ +static int tidesdb_recover_sstables(tidesdb_column_family_t *cf) +{ + TDB_DEBUG_LOG(TDB_LOG_INFO, "Recovering SSTables from directory %s", cf->directory); + + int local_sst_count = 0; + DIR *dir = opendir(cf->directory); + if (!dir) return TDB_ERR_IO; + + struct dirent *entry; + while ((entry = readdir(dir)) != NULL) + { + if (strstr(entry->d_name, TDB_SSTABLE_KLOG_EXT) != NULL) + { + tidesdb_recover_single_sstable(cf, entry); + local_sst_count++; + } + } + 
closedir(dir); + + /*** if no local .klog files were found but the MANIFEST + ** has sstable entries, we reconstruct sstable structs from MANIFEST metadata. + * the actual .klog/.vlog files will be downloaded on demand via ensure_open. */ + if (local_sst_count == 0 && cf->db && cf->db->object_store && cf->manifest) + { + const int manifest_count = cf->manifest->num_entries; + if (manifest_count > 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' cold start reconstructing %d SSTables from MANIFEST", cf->name, + manifest_count); + + /*** the freshly created CF only has its initial levels, but the + ** MANIFEST can reference deeper ones produced by compaction. we + * materialise every level up to the deepest the MANIFEST names + ** before adding sstables */ + int max_manifest_level = 1; + for (int i = 0; i < manifest_count; i++) + { + if (cf->manifest->entries[i].level > max_manifest_level) + max_manifest_level = cf->manifest->entries[i].level; + } + for (int lvl = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + lvl < max_manifest_level && lvl < TDB_MAX_LEVELS; lvl++) + { + const size_t lvl_capacity = tidesdb_calculate_level_capacity( + lvl + 1, cf->config.write_buffer_size, cf->config.level_size_ratio); + cf->levels[lvl] = tidesdb_level_create(lvl + 1, lvl_capacity); + if (!cf->levels[lvl]) break; + atomic_store_explicit(&cf->num_active_levels, lvl + 1, memory_order_release); + } + + for (int i = 0; i < manifest_count; i++) + { + tidesdb_manifest_entry_t *me = &cf->manifest->entries[i]; + + /* we construct sst path from level + id */ + char sst_base[MAX_FILE_PATH_LENGTH]; + snprintf(sst_base, sizeof(sst_base), "%s" PATH_SEPARATOR TDB_LEVEL_PREFIX "%d", + cf->directory, me->level); + + tidesdb_sstable_t *sst = + tidesdb_sstable_create(cf->db, sst_base, me->id, &cf->config); + if (!sst) continue; + + sst->num_entries = me->num_entries; + sst->klog_size = me->size_bytes; + sst->db = cf->db; + + /**** we download sst files from object store via 
ensure_open, then close + *** the block managers it opened since sstable_load opens its own. + ** without this close, load overwrites sst->klog_bm/vlog_bm with its + * own local BMs, leaking the ensure_open allocations. */ + if (tidesdb_sstable_ensure_open(cf->db, sst) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "CF '%s' cold start SSTable %d (L%d) not available in " + "object store, skipping (partial upload?)", + cf->name, (int)me->id, me->level); + tidesdb_sstable_unref(cf->db, sst); + continue; + } + + { + /* we must close BMs from ensure_open before load opens its own */ + if (sst->klog_bm) + { + block_manager_close(sst->klog_bm); + sst->klog_bm = NULL; + } + if (sst->vlog_bm) + { + block_manager_close(sst->vlog_bm); + sst->vlog_bm = NULL; + } + atomic_fetch_sub(&cf->db->num_open_sstables, 1); + + tidesdb_sstable_load(cf->db, sst); + } + + /* we ensure level exists */ + int level_idx = me->level - 1; + if (level_idx >= 0 && level_idx < atomic_load(&cf->num_active_levels) && + cf->levels[level_idx]) + { + tidesdb_level_add_sstable(cf->levels[level_idx], sst); + + /* we update next_sstable_id to avoid collisions */ + uint64_t cur_next = + atomic_load_explicit(&cf->next_sstable_id, memory_order_relaxed); + if (me->id >= cur_next) + { + atomic_store_explicit(&cf->next_sstable_id, me->id + 1, + memory_order_relaxed); + } + } + + tidesdb_sstable_unref(cf->db, sst); + } + } + } + + /**** we sort level 0 sstables by ID so newer sstables (higher ID) are at higher + *** array indices -- tidesdb_txn_get searches level 0 in reverse order + ** and returns on the first match, so the ordering is critical for + * correctness after recovery where readdir() order is non-deterministic */ + tidesdb_level_t *l0 = cf->levels[0]; + if (l0) + { + tidesdb_sstable_t **arr = atomic_load_explicit(&l0->sstables, memory_order_acquire); + int n = atomic_load_explicit(&l0->num_sstables, memory_order_acquire); + if (arr && n > 1) + { + qsort(arr, n, sizeof(tidesdb_sstable_t *), 
sstable_cmp_by_id); + } + } + + return TDB_SUCCESS; +} + +/** + * tidesdb_scan_max_sequence + * scan all sources (sstables and immutable memtables) for max sequence number + * @param cf column family + * @return maximum sequence number found + */ +static uint64_t tidesdb_scan_max_sequence(tidesdb_column_family_t *cf) +{ + uint64_t global_max_seq = 0; + + const int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' is scanning sources for max_seq", cf->name); + + for (int level_idx = 0; level_idx < num_levels; level_idx++) + { + tidesdb_level_t *level = cf->levels[level_idx]; + if (!level) continue; + + tidesdb_sstable_t **sstables = atomic_load_explicit(&level->sstables, memory_order_acquire); + int num_ssts = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + + const int num_ssts_recheck = + atomic_load_explicit(&level->num_sstables, memory_order_acquire); + if (num_ssts_recheck < num_ssts) num_ssts = num_ssts_recheck; + + tidesdb_sstable_t **sstables_check = + atomic_load_explicit(&level->sstables, memory_order_acquire); + if (sstables_check != sstables) + { + sstables = sstables_check; + num_ssts = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + } + + for (int sst_idx = 0; sst_idx < num_ssts; sst_idx++) + { + tidesdb_sstable_t *sst = sstables[sst_idx]; + if (sst && sst->max_seq > global_max_seq) + { + global_max_seq = sst->max_seq; + } + } + } + + /* we scan immutable memtables */ + if (cf->immutable_memtables) + { + const size_t imm_count = queue_size(cf->immutable_memtables); + /* a stack buffer covers the realistic recovery case (a handful of immutables) so the + * max-seq scan never depends on a heap alloc; only an unusually deep queue mallocs. */ + void *imm_stack[TDB_RECOVER_IMM_SCAN_STACK]; + void **imm_snap = NULL; + size_t imm_snap_count = 0; + + if (imm_count > 0) + { + imm_snap = (imm_count <= TDB_RECOVER_IMM_SCAN_STACK) + ? 
imm_stack + : malloc(imm_count * sizeof(void *)); + if (imm_snap) + imm_snap_count = queue_snapshot(cf->immutable_memtables, imm_snap, imm_count); + else + /* could not snapshot a deep immutable queue under memory pressure -- surface it, + * since skipping immutables here could under-seed global_seq on recovery */ + TDB_DEBUG_LOG(TDB_LOG_WARN, + "CF '%s' max-seq scan skipped %zu immutables (snapshot alloc failed)", + cf->name, imm_count); + } + + for (size_t i = 0; i < imm_snap_count; i++) + { + tidesdb_immutable_memtable_t *imm = (tidesdb_immutable_memtable_t *)imm_snap[i]; + if (!imm || !imm->skip_list) continue; + + skip_list_cursor_t *cursor; + if (skip_list_cursor_init(&cursor, imm->skip_list) != 0) continue; + + if (skip_list_cursor_goto_first(cursor) == 0) + { + do + { + uint8_t *key, *value; + size_t key_size, value_size; + int64_t ttl; + uint8_t deleted; + uint64_t seq; + + if (skip_list_cursor_get_with_seq(cursor, &key, &key_size, &value, &value_size, + &ttl, &deleted, &seq) == 0) + { + if (seq > global_max_seq) + { + global_max_seq = seq; + } + } + } while (skip_list_cursor_next(cursor) == 0); + } + skip_list_cursor_free(cursor); + } + + if (imm_snap != imm_stack) free(imm_snap); + } + + /* we scan the active memtable -- crash recovery replays the adopted active + * wal in place into it, so its entries' seqs would otherwise be invisible to + * the max-seq scan (they never pass through an sstable or immutable) */ + tidesdb_memtable_t *active_mt = + atomic_load_explicit(&cf->active_memtable, memory_order_acquire); + if (active_mt && active_mt->skip_list) + { + skip_list_cursor_t *cursor; + if (skip_list_cursor_init(&cursor, active_mt->skip_list) == 0) + { + if (skip_list_cursor_goto_first(cursor) == 0) + { + do + { + uint8_t *key, *value; + size_t key_size, value_size; + int64_t ttl; + uint8_t deleted; + uint64_t seq; + + if (skip_list_cursor_get_with_seq(cursor, &key, &key_size, &value, &value_size, + &ttl, &deleted, &seq) == 0) + { + if (seq > 
global_max_seq) + { + global_max_seq = seq; + } + } + } while (skip_list_cursor_next(cursor) == 0); + } + skip_list_cursor_free(cursor); + } + } + + return global_max_seq; +} + +/** + * tidesdb_recover_column_family + * recover a column family from disk after crash + * @param cf + * @return error code + */ +static int tidesdb_recover_column_family(tidesdb_column_family_t *cf) +{ + if (!cf) return TDB_ERR_INVALID_ARGS; + + int result = tidesdb_recover_wals(cf); + if (result != TDB_SUCCESS) return result; + + result = tidesdb_recover_sstables(cf); + if (result != TDB_SUCCESS) return result; + + const uint64_t global_max_seq = tidesdb_scan_max_sequence(cf); + + /* we update global sequence based on recovered data */ + const uint64_t current_seq = atomic_load_explicit(&cf->db->global_seq, memory_order_acquire); + if (global_max_seq >= current_seq) + { + atomic_store(&cf->db->global_seq, global_max_seq + 1); + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' has updated global_seq from %" PRIu64 " to %" PRIu64, + cf->name, current_seq, global_max_seq + 1); + } + + /* we update commit status */ + if (global_max_seq > 0) + { + tidesdb_commit_status_t *cs = cf->db->commit_status; + + const uint64_t current_max = atomic_load_explicit(&cs->max_seq, memory_order_acquire); + if (global_max_seq > current_max) + { + atomic_store_explicit(&cs->max_seq, global_max_seq, memory_order_release); + } + + /* the commit status is a ring of cs->capacity slots, so only the last + * capacity sequence numbers are distinguishable -- writing every seq + * from 1 makes recovery scale with the database's lifetime write count + * instead of the ring size */ + uint64_t status_start = 1; + if (global_max_seq > (uint64_t)cs->capacity) + status_start = global_max_seq - (uint64_t)cs->capacity + 1; + for (uint64_t seq = status_start; seq <= global_max_seq; seq++) + { + const size_t idx = seq % cs->capacity; + atomic_store_explicit(&cs->status[idx], TDB_COMMIT_STATUS_COMMITTED, + memory_order_release); + } + } + + 
/* we restore next_sstable_id from manifest to prevent ID collisions */ + if (cf->manifest) + { + const uint64_t manifest_seq = atomic_load(&cf->manifest->sequence); + if (manifest_seq > atomic_load(&cf->next_sstable_id)) + { + atomic_store(&cf->next_sstable_id, manifest_seq); + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' restored next_sstable_id=%" PRIu64 " from manifest", cf->name, + manifest_seq); + } + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' recovery is complete, global_max_seq=%" PRIu64, cf->name, + global_max_seq); + + return TDB_SUCCESS; +} + +/** + * tidesdb_unified_wal_replay_into + * replays one already-open, already-validated unified WAL into target. unified + * WAL entry format -- [cf_index BE32][flags][varint key_size][varint value_size] + * [varint seq][ttl(8)?][key][value] -- replayed as prefixed keys [cf_index][key] + * into the shared skip list. updates *max_seq and *total_entries; advances + * unified_mt.next_cf_index past any cf_index seen. the caller owns the wal + * block manager lifecycle. 
+ * @param db database instance + * @param wal an open, validated unified WAL block manager + * @param target the unified skip list to replay into + * @param max_seq updated with the highest seq seen + * @param total_entries incremented per replayed entry + * @return TDB_SUCCESS on success, error code on failure + */ +static int tidesdb_unified_wal_replay_into(tidesdb_t *db, block_manager_t *wal, skip_list_t *target, + uint64_t *max_seq, int *total_entries) +{ + block_manager_cursor_t *cursor = NULL; + if (block_manager_cursor_init(&cursor, wal) != 0) return TDB_ERR_IO; + + if (block_manager_cursor_goto_first(cursor) == 0) + { + do + { + block_manager_block_t *block = block_manager_cursor_read(cursor); + if (!block) break; + + const uint8_t *ptr = block->data; + size_t remaining = block->size; + + /* we check and skip the unified magic prefix */ + if (remaining >= TDB_UNIFIED_WAL_MAGIC_SIZE) + { + const uint16_t magic = ((uint16_t)ptr[0] << 8) | ptr[1]; + if (magic == TDB_UNIFIED_WAL_MAGIC) + { + ptr += TDB_UNIFIED_WAL_MAGIC_SIZE; + remaining -= TDB_UNIFIED_WAL_MAGIC_SIZE; + } + } + + uint32_t max_cf_index_seen = 0; + while (remaining > TDB_UNIFIED_CF_PREFIX_SIZE) + { + /* we read cf_index */ + const uint32_t cf_index = tdb_decode_be32(ptr); + if (cf_index > max_cf_index_seen) max_cf_index_seen = cf_index; + ptr += TDB_UNIFIED_CF_PREFIX_SIZE; + remaining -= TDB_UNIFIED_CF_PREFIX_SIZE; + + if (remaining < 1) break; + const uint8_t flags = *ptr++; + remaining--; + + uint64_t key_size_u64; + int br = decode_varint(ptr, &key_size_u64, (int)remaining); + if (br < 0 || key_size_u64 > UINT32_MAX) break; + ptr += br; + remaining -= br; + + uint64_t value_size_u64; + br = decode_varint(ptr, &value_size_u64, (int)remaining); + if (br < 0 || value_size_u64 > UINT32_MAX) break; + ptr += br; + remaining -= br; + + uint64_t seq_value; + br = decode_varint(ptr, &seq_value, (int)remaining); + if (br < 0) break; + ptr += br; + remaining -= br; + + int64_t ttl = 0; + if (flags & 
TDB_KV_FLAG_HAS_TTL) + { + if (remaining < sizeof(int64_t)) break; + ttl = decode_int64_le_compat(ptr); + ptr += sizeof(int64_t); + remaining -= sizeof(int64_t); + } + + if (remaining < key_size_u64) break; + const uint8_t *key = ptr; + ptr += key_size_u64; + remaining -= key_size_u64; + + const uint8_t *value = NULL; + if (value_size_u64 > 0) + { + if (remaining < value_size_u64) break; + value = ptr; + ptr += value_size_u64; + remaining -= value_size_u64; + } + + /* we build prefixed key and insert into unified memtable */ + const size_t pk_total = TDB_UNIFIED_CF_PREFIX_SIZE + key_size_u64; + TDB_PREFIXED_KEY_ALLOC(prefixed, pk_total, _pk_stack4); + if (!prefixed) break; + tdb_encode_be32(cf_index, prefixed); + memcpy(prefixed + TDB_UNIFIED_CF_PREFIX_SIZE, key, key_size_u64); + const size_t pk_size = TDB_UNIFIED_CF_PREFIX_SIZE + key_size_u64; + + const int is_delete = (flags & TDB_KV_FLAG_TOMBSTONE) ? 1 : 0; + /* preserve the single-delete subtype across replay (mirrors per-CF WAL + * replay) so compaction can still pair-cancel put+single-delete */ + int sl_flags = is_delete ? SKIP_LIST_FLAG_DELETED : 0; + if (is_delete && (flags & TDB_KV_FLAG_SINGLE_DELETE)) + sl_flags |= SKIP_LIST_FLAG_SINGLE_DELETE; + skip_list_put_with_seq( + target, prefixed, pk_size, is_delete ? NULL : (uint8_t *)value, + is_delete ? 
0 : (size_t)value_size_u64, ttl, seq_value, sl_flags); + TDB_PREFIXED_KEY_FREE(prefixed, _pk_stack4); + + if (seq_value > *max_seq) *max_seq = seq_value; + (*total_entries)++; + } + + /* we must ensure next_cf_index is past any cf_index seen in the WAL */ + if (max_cf_index_seen > 0) + { + uint32_t needed = max_cf_index_seen + 1; + uint32_t current = + atomic_load_explicit(&db->unified_mt.next_cf_index, memory_order_relaxed); + while (needed > current) + { + if (atomic_compare_exchange_weak_explicit( + &db->unified_mt.next_cf_index, &current, needed, memory_order_relaxed, + memory_order_relaxed)) + break; + } + } + + block_manager_block_release(block); + } while (block_manager_cursor_next(cursor) == 0); + } + + block_manager_cursor_free(cursor); + return TDB_SUCCESS; +} + +/** + * tidesdb_unified_wal_recover + * recover unified WAL files from db_path into the unified active memtable. + * the highest-generation uwal_*.log is the active memtable's wal (adopted + + * validated at open) -- its entries are replayed in place from the live block + * manager and the file is kept. lower-generation uwal files are replayed and + * then deleted. unified WAL entry format is documented on + * tidesdb_unified_wal_replay_into. 
+ * @param db database instance + * @return TDB_SUCCESS on success, error code on failure + */ +static int tidesdb_unified_wal_recover(tidesdb_t *db) +{ + if (!db || !db->unified_mt.enabled) return TDB_SUCCESS; + + tidesdb_memtable_t *umt = atomic_load_explicit(&db->unified_mt.active, memory_order_acquire); + if (!umt || !umt->skip_list) return TDB_ERR_UNKNOWN; + + DIR *dir = opendir(db->db_path); + if (!dir) return TDB_SUCCESS; /* no directory = fresh start */ + + queue_t *wal_files = queue_new(); + if (!wal_files) + { + closedir(dir); + return TDB_ERR_MEMORY; + } + + /* we collect every uwal_*.log -- the highest generation is the active + * memtable's wal (adopted + validated at open), replayed in place; the + * lower generations are replayed then deleted */ + struct dirent *entry; + while ((entry = readdir(dir)) != NULL) + { + if (strstr(entry->d_name, TDB_UNIFIED_WAL_PREFIX) == entry->d_name && + strstr(entry->d_name, TDB_WAL_EXT) != NULL) + { + const size_t path_len = strlen(db->db_path) + strlen(entry->d_name) + 2; + char *wal_path = malloc(path_len); + if (wal_path) + { + snprintf(wal_path, path_len, "%s" PATH_SEPARATOR "%s", db->db_path, entry->d_name); + if (queue_enqueue(wal_files, wal_path) != 0) free(wal_path); + } + } + } + closedir(dir); + + if (queue_is_empty(wal_files)) + { + queue_free(wal_files); + return TDB_SUCCESS; + } + + tidesdb_sort_wal_files(wal_files); + + int total_entries = 0; + uint64_t max_seq = 0; + + /* the active memtable adopted the highest-generation uwal at open (already + * open + validated). replay that one in place from the live block manager + * and keep the file; replay + delete the lower generations. */ + const uint64_t active_gen = + atomic_load_explicit(&db->unified_mt.wal_generation, memory_order_relaxed); + + while (!queue_is_empty(wal_files)) + { + char *wal_path = queue_dequeue(wal_files); + if (!wal_path) continue; + + const char *wal_name = strrchr(wal_path, PATH_SEPARATOR[0]); + wal_name = wal_name ? 
wal_name + 1 : wal_path; + uint64_t gen = 0; + const int parsed = tdb_parse_unified_wal_gen(wal_name, &gen); + + if (parsed && umt->wal && gen == active_gen) + { + /* the active unified WAL -- replay in place from the live block + * manager, keep the file (it backs the active memtable) */ + const int rc = tidesdb_unified_wal_replay_into(db, umt->wal, umt->skip_list, &max_seq, + &total_entries); + if (rc != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Failed to replay active unified WAL '%s'", wal_path); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Replayed active unified WAL in place '%s'", wal_path); + } + free(wal_path); + continue; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Recovering unified WAL '%s'", wal_path); + + block_manager_t *wal = NULL; + if (block_manager_open(&wal, wal_path, TDB_SYNC_FULL) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to open unified WAL file '%s'", wal_path); + free(wal_path); + continue; + } + + if (block_manager_validate_last_block(wal, BLOCK_MANAGER_PERMISSIVE_BLOCK_VALIDATION) != 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Unified WAL validation failed for '%s'", wal_path); + block_manager_close(wal); + free(wal_path); + continue; + } + + tidesdb_unified_wal_replay_into(db, wal, umt->skip_list, &max_seq, &total_entries); + + block_manager_close(wal); + + /* we delete the recovered lower-generation WAL -- its entries are now in + * the active unified memtable */ + tdb_unlink(wal_path); + free(wal_path); + } + + queue_free(wal_files); + + /* we update global_seq if recovered entries have higher sequence numbers */ + if (max_seq > 0) + { + uint64_t current_seq = atomic_load_explicit(&db->global_seq, memory_order_acquire); + if (max_seq >= current_seq) + { + atomic_store_explicit(&db->global_seq, max_seq + 1, memory_order_release); + } + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Unified WAL recovery completed '%d' entries, max_seq=%" PRIu64, + total_entries, max_seq); + + return TDB_SUCCESS; +} + +/** + * tidesdb_recover_database + * recover 
entire database from disk + * @param db database to recover + * @return error code + */ +static int tidesdb_recover_database(tidesdb_t *db) +{ + if (!db) return TDB_ERR_INVALID_ARGS; + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Starting database recovery from '%s'", db->db_path); + + /*** if local directory is empty or missing but object store has data, + ** discover CFs from remote and download their config.ini + MANIFEST + * before scanning locally. we first record whether any CF directory + ** already exists locally -- a genuine cold start (none present) must also + *** replay the remote WALs once sstable recovery is done. */ + int objstore_cold_start = 0; + if (db->object_store) + { + int local_cf_dir_seen = 0; + DIR *probe_dir = opendir(db->db_path); + if (probe_dir) + { + struct dirent *probe_ent; + while ((probe_ent = readdir(probe_dir)) != NULL) + { + if (probe_ent->d_name[0] == '.') continue; + char probe_path[MAX_FILE_PATH_LENGTH]; + snprintf(probe_path, sizeof(probe_path), "%s%s%s", db->db_path, PATH_SEPARATOR, + probe_ent->d_name); + struct STAT_STRUCT probe_st; + if (STAT_FUNC(probe_path, &probe_st) == 0 && S_ISDIR(probe_st.st_mode)) + { + local_cf_dir_seen = 1; + break; + } + } + closedir(probe_dir); + } + objstore_cold_start = !local_cf_dir_seen; + tdb_objstore_cold_start_discover(db); + } + + DIR *dir = opendir(db->db_path); + if (!dir) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "No existing database directory found (fresh start)"); + return TDB_SUCCESS; /* not an error, fresh database */ + } + + struct dirent *entry; + while ((entry = readdir(dir)) != NULL) + { + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) + { + continue; + } + + char full_path[MAX_FILE_PATH_LENGTH]; + snprintf(full_path, sizeof(full_path), "%s%s%s", db->db_path, PATH_SEPARATOR, + entry->d_name); + + struct STAT_STRUCT st; + if (STAT_FUNC(full_path, &st) == 0 && S_ISDIR(st.st_mode)) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Found CF directory '%s'", entry->d_name); + 
tidesdb_column_family_t *cf = tidesdb_get_column_family_internal(db, entry->d_name); + + if (!cf) + { + tidesdb_column_family_config_t config = tidesdb_default_column_family_config(); + + /* we ensure we have room for full_path + "/" + "config.ini" + null terminator */ + const size_t full_path_len = strlen(full_path); + if (full_path_len + 1 + + strlen(TDB_COLUMN_FAMILY_CONFIG_NAME TDB_COLUMN_FAMILY_CONFIG_EXT) >= + TDB_MAX_PATH_LEN) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' config path too long, using defaults", + entry->d_name); + goto create_cf_with_config; + } + + char config_path[TDB_MAX_PATH_LEN]; +#ifndef _MSC_VER +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-truncation" +#endif + snprintf( + config_path, TDB_MAX_PATH_LEN, + "%s" PATH_SEPARATOR TDB_COLUMN_FAMILY_CONFIG_NAME TDB_COLUMN_FAMILY_CONFIG_EXT, + full_path); +#ifndef _MSC_VER +#pragma GCC diagnostic pop +#endif + + if (tidesdb_cf_config_load_from_ini(config_path, entry->d_name, &config) == + TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' has loaded config from disk (write_buffer_size=%zu, " + "level_size_ratio=%zu)", + entry->d_name, config.write_buffer_size, config.level_size_ratio); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' has no saved config found, using defaults", + entry->d_name); + } + + create_cf_with_config:; + const int create_result = tidesdb_create_column_family(db, entry->d_name, &config); + + if (create_result == TDB_SUCCESS) + { + cf = tidesdb_get_column_family_internal(db, entry->d_name); + } + else if (create_result == TDB_ERR_EXISTS) + { + /* CF already exists in memory, we try to get it again */ + cf = tidesdb_get_column_family_internal(db, entry->d_name); + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF already exists during recovery '%s'", + entry->d_name); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Failed to create CF during recovery '%s' (error code: %d)", + entry->d_name, create_result); + } + } + + if (cf) + { + 
TDB_DEBUG_LOG(TDB_LOG_INFO, "Recovering CF '%s'", entry->d_name); + tidesdb_recover_column_family(cf); + } + else + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Failed to get/create CF '%s'", entry->d_name); + } + } + } + closedir(dir); + + /* we recover unified WAL files after all CFs are recovered */ + if (db->unified_mt.enabled) + { + tidesdb_unified_wal_recover(db); + } + + /*** on a cold start the reconstructed sstables cover only flushed data -- + ** committed-but-unflushed writes live solely in the WALs that + * wal_sync_on_commit / replicate_wal uploaded to the object store. replay + ** those remote WALs into the unified memtable so a primary rebuilt from + *** the object store does not lose acknowledged writes. seq numbers make the + ** replay idempotent, so generations already covered by recovered sstables + * are skipped. object store mode always uses a unified memtable. */ + if (objstore_cold_start && db->object_store && db->unified_mt.enabled) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, + "Cold start replaying remote WALs from object store for CF recovery"); + tdb_objstore_replay_remote_wals(db, 1); + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Database recovery completed successfully"); + return TDB_SUCCESS; +} + +int tidesdb_get_stats(tidesdb_column_family_t *cf, tidesdb_stats_t **stats) +{ + if (!cf || !stats) return TDB_ERR_INVALID_ARGS; + + *stats = calloc(1, sizeof(tidesdb_stats_t)); + if (!*stats) return TDB_ERR_MEMORY; + + int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + + (*stats)->num_levels = num_levels; + tidesdb_memtable_t *active_mt_struct = + atomic_load_explicit(&cf->active_memtable, memory_order_acquire); + skip_list_t *active_mt = active_mt_struct ? 
active_mt_struct->skip_list : NULL; + (*stats)->memtable_size = skip_list_get_size(active_mt); + + (*stats)->level_sizes = malloc((*stats)->num_levels * sizeof(size_t)); + (*stats)->level_num_sstables = malloc((*stats)->num_levels * sizeof(int)); + (*stats)->level_key_counts = malloc((*stats)->num_levels * sizeof(uint64_t)); + (*stats)->level_tombstone_counts = malloc((*stats)->num_levels * sizeof(uint64_t)); + (*stats)->config = malloc(sizeof(tidesdb_column_family_config_t)); + + if (!(*stats)->level_sizes || !(*stats)->level_num_sstables || !(*stats)->level_key_counts || + !(*stats)->level_tombstone_counts || !(*stats)->config) + { + free((*stats)->level_sizes); + free((*stats)->level_num_sstables); + free((*stats)->level_key_counts); + free((*stats)->level_tombstone_counts); + free((*stats)->config); + free(*stats); + return TDB_ERR_MEMORY; + } + + memcpy((*stats)->config, &cf->config, sizeof(tidesdb_column_family_config_t)); + + /* we count memtable keys */ + const uint64_t memtable_keys = active_mt ? (uint64_t)skip_list_count_entries(active_mt) : 0; + uint64_t total_keys = memtable_keys; + uint64_t total_data_size = 0; + uint64_t total_klog_size = 0; + + /* immutable memtables still hold live data not yet on disk -- fold their + * bytes into memtable_size and their entries into total_keys. 
a flushed + * immutable is skipped, its data is already on disk and counted in the + * sstable totals, and it only lingers in the queue until batched cleanup */ + if (cf->immutable_memtables) + { + queue_t *iq = cf->immutable_memtables; + pthread_rwlock_rdlock(&iq->read_lock); + for (queue_node_t *n = iq->head->next; n != NULL; n = n->next) + { + tidesdb_immutable_memtable_t *imm = (tidesdb_immutable_memtable_t *)n->data; + if (imm && imm->skip_list && !atomic_load_explicit(&imm->flushed, memory_order_acquire)) + { + (*stats)->memtable_size += skip_list_get_size(imm->skip_list); + total_keys += (uint64_t)skip_list_count_entries(imm->skip_list); + } + } + pthread_rwlock_unlock(&iq->read_lock); + } + + /* btree stats aggregation */ + uint64_t btree_total_nodes = 0; + uint32_t btree_max_height = 0; + uint64_t btree_height_sum = 0; + int btree_sstable_count = 0; + + /* tombstone observability aggregation */ + uint64_t total_tombstones = 0; + double max_density = 0.0; + int max_density_level = 0; + + for (int i = 0; i < (*stats)->num_levels; i++) + { + (*stats)->level_sizes[i] = atomic_load(&cf->levels[i]->current_size); + int num_sstables = atomic_load_explicit(&cf->levels[i]->num_sstables, memory_order_acquire); + (*stats)->level_num_sstables[i] = num_sstables; + + /* we count keys per level from sstables. we hold array_readers across + * the walk so a concurrent compaction cannot retire and free the + * sstables array, or unref a removed sstable, while we read per-sstable + * fields. 
the array is calloc(capacity + 1) and NULL terminated, so the + * NULL-bounded loop cannot run off the end */ + uint64_t level_keys = 0; + uint64_t level_tombstones = 0; + tidesdb_level_t *lvl = cf->levels[i]; + atomic_fetch_add_explicit(&lvl->array_readers, 1, memory_order_acq_rel); + tidesdb_sstable_t **sstables = atomic_load_explicit(&lvl->sstables, memory_order_acquire); + for (int j = 0; sstables[j] != NULL; j++) + { + tidesdb_sstable_t *sst = sstables[j]; + level_keys += sst->num_entries; + total_data_size += sst->klog_size + sst->vlog_size; + total_klog_size += sst->klog_size; + + /* we aggregate btree stats if this sstable uses btree */ + if (sst->use_btree && sst->btree_root_offset >= 0) + { + btree_sstable_count++; + btree_total_nodes += sst->btree_node_count; + btree_height_sum += sst->btree_height; + if (sst->btree_height > btree_max_height) + { + btree_max_height = sst->btree_height; + } + } + + /** sstables with unknown tombstone counts (legacy footers) contribute + * nothing to the totals or the max-density witness */ + if (sst->tombstone_count != TDB_TOMBSTONE_COUNT_UNKNOWN) + { + level_tombstones += sst->tombstone_count; + if (sst->num_entries > 0) + { + const double d = (double)sst->tombstone_count / (double)sst->num_entries; + if (d > max_density) + { + max_density = d; + max_density_level = i + 1; + } + } + } + } + atomic_fetch_sub_explicit(&lvl->array_readers, 1, memory_order_release); + (*stats)->level_key_counts[i] = level_keys; + (*stats)->level_tombstone_counts[i] = level_tombstones; + total_keys += level_keys; + total_tombstones += level_tombstones; + } + + /* we populate btree stats */ + (*stats)->use_btree = cf->config.use_btree; + (*stats)->btree_total_nodes = btree_total_nodes; + (*stats)->btree_max_height = btree_max_height; + (*stats)->btree_avg_height = + btree_sstable_count > 0 ? 
(double)btree_height_sum / btree_sstable_count : 0.0; + + (*stats)->total_keys = total_keys; + (*stats)->total_data_size = total_data_size; + + (*stats)->total_tombstones = total_tombstones; + (*stats)->tombstone_ratio = + total_keys > 0 ? (double)total_tombstones / (double)total_keys : 0.0; + (*stats)->max_sst_density = max_density; + (*stats)->max_sst_density_level = max_density_level; + + /* we estimate avg key/value sizes from memtable size and sstable data */ + if (total_keys > 0) + { + /* the memtable tracks total_size as key_size + value_size for each entry */ + const uint64_t memtable_data_size = (*stats)->memtable_size; + const uint64_t total_kv_size = memtable_data_size + total_klog_size; + double avg_entry_size = (double)total_kv_size / (double)total_keys; + /* we assume roughly equal key/value split as approximation */ + (*stats)->avg_key_size = avg_entry_size * TDB_STATS_AVG_KEY_FRACTION; + (*stats)->avg_value_size = avg_entry_size * TDB_STATS_AVG_VALUE_FRACTION; + } + else + { + (*stats)->avg_key_size = 0.0; + (*stats)->avg_value_size = 0.0; + } + + /** we calculate read amplification -- worst case is 1 (memtable) + the L0 + * immutable memtable queue + sum of sstables per level. levels[0] is L1 + * (first sstable level), L0 is the immutable memtables queue */ + double read_amp = 1.0; /* memtable lookup */ + + /* L0 -- every immutable memtable is also scanned on a point read. in unified + * mode the immutables live on the shared unified queue */ + read_amp += (double)((cf->db && cf->db->unified_mt.enabled && cf->db->unified_mt.immutables) + ? queue_size(cf->db->unified_mt.immutables) + : queue_size(cf->immutable_memtables)); + + for (int i = 0; i < (*stats)->num_levels; i++) + { + /* L1 (levels[0]) may have overlapping sstables from flushes, L2+ are sorted/non-overlapping + */ + if (i == 0) + { + read_amp += (*stats)->level_num_sstables[i]; + } + else + { + read_amp += ((*stats)->level_num_sstables[i] > 0 ? 
1.0 : 0.0); + } + } + (*stats)->read_amp = read_amp; + + /* we get cache hit rate from database if available */ + (*stats)->hit_rate = 0.0; + if (cf->db && cf->db->clock_cache) + { + tidesdb_cache_stats_t cache_stats; + if (tidesdb_get_cache_stats(cf->db, &cache_stats) == TDB_SUCCESS && cache_stats.enabled) + { + (*stats)->hit_rate = cache_stats.hit_rate; + } + } + + return TDB_SUCCESS; +} + +void tidesdb_free_stats(tidesdb_stats_t *stats) +{ + if (!stats) return; + free(stats->level_sizes); + free(stats->level_num_sstables); + free(stats->level_key_counts); + free(stats->level_tombstone_counts); + free(stats->config); + free(stats); +} + +int tidesdb_get_db_stats(tidesdb_t *db, tidesdb_db_stats_t *stats) +{ + if (!db || !stats) return TDB_ERR_INVALID_ARGS; + + memset(stats, 0, sizeof(tidesdb_db_stats_t)); + + stats->total_memory = db->total_memory; + stats->available_memory = db->available_memory; + stats->resolved_memory_limit = + atomic_load_explicit(&db->resolved_memory_limit, memory_order_relaxed); + stats->memory_pressure_level = + atomic_load_explicit(&db->memory_pressure_level, memory_order_relaxed); + stats->flush_pending_count = + atomic_load_explicit(&db->flush_pending_count, memory_order_relaxed); + stats->num_open_sstables = atomic_load_explicit(&db->num_open_sstables, memory_order_relaxed); + stats->global_seq = atomic_load_explicit(&db->global_seq, memory_order_relaxed); + stats->txn_memory_bytes = atomic_load_explicit(&db->txn_memory_bytes, memory_order_relaxed); + /* total_memtable_bytes is the live skip list bytes of every memtable, active + * and immutable, across all column families and the unified memtable -- it + * is summed below, not taken from the reaper's whole-memory pressure total */ + + if (db->flush_queue) stats->flush_queue_size = queue_size(db->flush_queue); + if (db->compaction_queue) stats->compaction_queue_size = queue_size(db->compaction_queue); + + pthread_rwlock_rdlock(&db->cf_list_lock); + stats->num_column_families = 
db->num_column_families; + + for (int c = 0; c < db->num_column_families; c++) + { + tidesdb_column_family_t *cf = db->column_families[c]; + if (!cf) continue; + + stats->total_immutable_count += (int)queue_size(cf->immutable_memtables); + + /* per-cf active memtable + its immutable queue contribute to the + * memtable byte total (empty in unified mode, summed below instead). + * flushed immutables are skipped, their bytes are already on disk */ + tidesdb_memtable_t *amt = atomic_load_explicit(&cf->active_memtable, memory_order_acquire); + if (amt && amt->skip_list) + stats->total_memtable_bytes += (int64_t)skip_list_get_size(amt->skip_list); + if (cf->immutable_memtables) + { + queue_t *iq = cf->immutable_memtables; + pthread_rwlock_rdlock(&iq->read_lock); + for (queue_node_t *n = iq->head->next; n != NULL; n = n->next) + { + tidesdb_immutable_memtable_t *imm = (tidesdb_immutable_memtable_t *)n->data; + if (imm && imm->skip_list && + !atomic_load_explicit(&imm->flushed, memory_order_acquire)) + stats->total_memtable_bytes += (int64_t)skip_list_get_size(imm->skip_list); + } + pthread_rwlock_unlock(&iq->read_lock); + } + + int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + for (int l = 0; l < num_levels; l++) + { + tidesdb_level_t *lvl = cf->levels[l]; + if (!lvl) continue; + stats->total_sstable_count += + atomic_load_explicit(&lvl->num_sstables, memory_order_relaxed); + + /* we sum the level's maintained byte counter rather than walking the + * sstables array -- a concurrent compaction can retire and free that + * array, and current_size already tracks the klog+vlog bytes the + * per-sstable walk would otherwise add up */ + stats->total_data_size_bytes += + (int64_t)atomic_load_explicit(&lvl->current_size, memory_order_relaxed); + } + } + pthread_rwlock_unlock(&db->cf_list_lock); + + /* unified memtable stats */ + stats->unified_memtable_enabled = db->unified_mt.enabled; + if (db->unified_mt.enabled) + { + tidesdb_memtable_t 
*umt = + atomic_load_explicit(&db->unified_mt.active, memory_order_acquire); + if (umt && umt->skip_list) + stats->unified_memtable_bytes = (int64_t)skip_list_get_size(umt->skip_list); + stats->total_memtable_bytes += stats->unified_memtable_bytes; + + if (db->unified_mt.immutables) + { + stats->unified_immutable_count = (int)queue_size(db->unified_mt.immutables); + stats->total_immutable_count += stats->unified_immutable_count; + + /* unified immutable queue bytes also count toward total_memtable_bytes, + * except flushed immutables whose bytes are already on disk */ + queue_t *uiq = db->unified_mt.immutables; + pthread_rwlock_rdlock(&uiq->read_lock); + for (queue_node_t *n = uiq->head->next; n != NULL; n = n->next) + { + tidesdb_memtable_t *uimm = (tidesdb_memtable_t *)n->data; + if (uimm && uimm->skip_list && + !atomic_load_explicit(&uimm->flushed, memory_order_acquire)) + stats->total_memtable_bytes += (int64_t)skip_list_get_size(uimm->skip_list); + } + pthread_rwlock_unlock(&uiq->read_lock); + } + + stats->unified_is_flushing = + atomic_load_explicit(&db->unified_mt.is_flushing, memory_order_relaxed); + stats->unified_next_cf_index = + atomic_load_explicit(&db->unified_mt.next_cf_index, memory_order_relaxed); + stats->unified_wal_generation = + atomic_load_explicit(&db->unified_mt.wal_generation, memory_order_relaxed); + } + + /* object store stats */ + stats->object_store_enabled = (db->object_store != NULL); + if (db->object_store) + { + stats->object_store_connector = tidesdb_objstore_backend_name(db->object_store->backend); + stats->last_uploaded_generation = + atomic_load_explicit(&db->last_uploaded_gen, memory_order_relaxed); + stats->total_uploads = atomic_load_explicit(&db->total_uploads, memory_order_relaxed); + stats->total_upload_failures = + atomic_load_explicit(&db->total_upload_failures, memory_order_relaxed); + if (db->upload_queue) stats->upload_queue_depth = queue_size(db->upload_queue); + if (db->local_cache) + { + 
stats->local_cache_bytes_used = + atomic_load_explicit(&db->local_cache->current_bytes, memory_order_relaxed); + stats->local_cache_bytes_max = db->local_cache->max_bytes; + stats->local_cache_num_files = + atomic_load_explicit(&db->local_cache->num_entries, memory_order_relaxed); + } + } + + stats->replica_mode = atomic_load_explicit(&db->replica_mode, memory_order_relaxed); + + return TDB_SUCCESS; +} + +int tidesdb_purge_cf(tidesdb_column_family_t *cf) +{ + if (!cf || !cf->db) return TDB_ERR_INVALID_ARGS; + + /*** if unified memtable mode is enabled, we rotate memtable first so that any + ** entries belonging to this CF are moved to the flush queue. + * the same pattern as tidesdb_purge() but scoped to a single CF call. */ + tidesdb_t *db = cf->db; + if (db->unified_mt.enabled) + { + int expected = 0; + if (atomic_compare_exchange_strong_explicit(&db->unified_mt.is_flushing, &expected, 1, + memory_order_acquire, memory_order_relaxed)) + { + tidesdb_memtable_t *umt = + atomic_load_explicit(&db->unified_mt.active, memory_order_acquire); + if (umt && umt->skip_list && skip_list_count_entries(umt->skip_list) > 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Rotating unified memtable for CF '%s'", cf->name); + tidesdb_unified_memtable_rotate(db); + } + atomic_store_explicit(&db->unified_mt.is_flushing, 0, memory_order_release); + } + + /* we wait for unified flush to complete */ + for (int i = 0; i < TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS * 4; i++) + { + const size_t fq = db->flush_queue ? 
queue_size(db->flush_queue) : 0; + int pending = atomic_load_explicit(&db->flush_pending_count, memory_order_acquire); + if (fq == 0 && pending == 0) break; + usleep(TDB_COMPACTION_FLUSH_WAIT_SLEEP_US); + } + } + + /* we wait for any in-progress flush to finish */ + for (int i = 0; i < TDB_CLOSE_FLUSH_WAIT_MAX_ATTEMPTS; i++) + { + if (!tidesdb_is_flushing(cf)) break; + usleep(TDB_CLOSE_FLUSH_WAIT_SLEEP_US); + } + + /* we force flush the active memtable (even if below threshold) */ + const int result = tidesdb_flush_memtable_internal(cf, 0, 1); + if (result != TDB_SUCCESS && result != TDB_ERR_MEMORY) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Flush failed for CF '%s' (err=%d)", cf->name, result); + return result; + } + + /* we wait for flush I/O to fully complete */ + for (int i = 0; i < TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS * 2; i++) + { + if (!tidesdb_is_flushing(cf)) break; + usleep(TDB_COMPACTION_FLUSH_WAIT_SLEEP_US); + } + + /* we wait for any in-progress compaction to finish */ + for (int i = 0; i < TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS; i++) + { + if (!tidesdb_is_compacting(cf)) break; + usleep(TDB_COMPACTION_FLUSH_WAIT_SLEEP_US); + } + + /* we trigger compaction (synchronous -- tidesdb_trigger_compaction runs inline) */ + int expected = 0; + if (atomic_compare_exchange_strong_explicit(&cf->is_compacting, &expected, 1, + memory_order_acquire, memory_order_relaxed)) + { + tidesdb_trigger_compaction(cf, 0); + atomic_store_explicit(&cf->is_compacting, 0, memory_order_release); + } + + /* we wait for any queued compaction to drain */ + for (int i = 0; i < TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS; i++) + { + if (!tidesdb_is_compacting(cf)) break; + usleep(TDB_COMPACTION_FLUSH_WAIT_SLEEP_US); + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "CF '%s' purge complete", cf->name); + return TDB_SUCCESS; +} + +int tidesdb_cancel_background_work(tidesdb_t *db) +{ + if (!db) return TDB_ERR_INVALID_ARGS; + + /* in-flight merges bail at their next checkpoint + * (uncommitted output discarded, 
inputs intact -- safe), and queued compaction + * work items are skipped at dequeue. flushes are deliberately unaffected so + * durability is preserved. the flag is sticky for this db session and is reset + * on the next tidesdb_open; intended to be called right before tidesdb_close for + * a fast shutdown when a large compaction backlog would otherwise stall close. */ + atomic_store_explicit(&db->cancel_compaction, 1, memory_order_release); + TDB_DEBUG_LOG(TDB_LOG_INFO, "Cancelling compaction"); + + /* wait until the compaction queue is empty, no CF is mid-merge, AND no CF has a + * pending count outstanding -- pending_count is incremented before queue_enqueue + * and decremented after the worker's skip/finish, so there are windows where + * queue=0 and is_compacting=0 but the work item is still in flight. tidesdb_is_compacting + * factors pending_count in, so a caller that reads it right after cancel returns must + * see all three drained. bounded so a merge stuck outside a checkpoint cannot hang + * the caller forever. 
*/ + int waited_ms = 0; + while (waited_ms < TDB_CANCEL_BG_MAX_WAIT_MS) + { + int busy = 0; + if (db->compaction_queue && queue_size(db->compaction_queue) > 0) busy = 1; + if (!busy) + { + pthread_rwlock_rdlock(&db->cf_list_lock); + const int n = atomic_load_explicit(&db->num_column_families, memory_order_acquire); + for (int i = 0; i < n; i++) + { + tidesdb_column_family_t *cf = db->column_families[i]; + if (cf && + (atomic_load_explicit(&cf->is_compacting, memory_order_acquire) || + atomic_load_explicit(&cf->compaction_pending_count, memory_order_acquire) > 0)) + { + busy = 1; + break; + } + } + pthread_rwlock_unlock(&db->cf_list_lock); + } + if (!busy) break; + usleep(TDB_CANCEL_BG_POLL_US); + waited_ms += TDB_CANCEL_BG_POLL_US / 1000; + } + + if (waited_ms >= TDB_CANCEL_BG_MAX_WAIT_MS) + TDB_DEBUG_LOG(TDB_LOG_WARN, + "Timed out after %d ms with compaction still " + "in flight", + waited_ms); + else + TDB_DEBUG_LOG(TDB_LOG_INFO, "Compaction quiesced after %d ms", waited_ms); + return TDB_SUCCESS; +} + +int tidesdb_purge(tidesdb_t *db) +{ + if (!db) return TDB_ERR_INVALID_ARGS; + + int first_err = TDB_SUCCESS; + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Starting full database purge"); + + /** we flush unified active memtable before per-CF purge so that the resulting + * ssts are included in the per-CF compaction pass that follows */ + if (db->unified_mt.enabled) + { + int expected = 0; + if (atomic_compare_exchange_strong_explicit(&db->unified_mt.is_flushing, &expected, 1, + memory_order_acquire, memory_order_relaxed)) + { + tidesdb_memtable_t *umt = + atomic_load_explicit(&db->unified_mt.active, memory_order_acquire); + if (umt && umt->skip_list && skip_list_count_entries(umt->skip_list) > 0) + { + TDB_DEBUG_LOG(TDB_LOG_INFO, "Rotating unified memtable"); + tidesdb_unified_memtable_rotate(db); + } + atomic_store_explicit(&db->unified_mt.is_flushing, 0, memory_order_release); + } + + /* we wait for the unified flush to complete before per-CF work */ + for (int i = 0; i < 
TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS * 4; i++) + { + const size_t fq = db->flush_queue ? queue_size(db->flush_queue) : 0; + int pending = atomic_load_explicit(&db->flush_pending_count, memory_order_acquire); + if (fq == 0 && pending == 0) break; + usleep(TDB_COMPACTION_FLUSH_WAIT_SLEEP_US); + } + } + + /* purge each CF, we flush + compact */ + pthread_rwlock_rdlock(&db->cf_list_lock); + for (int i = 0; i < db->num_column_families; i++) + { + tidesdb_column_family_t *cf = db->column_families[i]; + if (!cf) continue; + + const int result = tidesdb_purge_cf(cf); + if (result != TDB_SUCCESS && first_err == TDB_SUCCESS) first_err = result; + } + pthread_rwlock_unlock(&db->cf_list_lock); + + /* we drain flush queue completely */ + for (int i = 0; i < TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS * 4; i++) + { + const size_t fq = db->flush_queue ? queue_size(db->flush_queue) : 0; + int pending = atomic_load_explicit(&db->flush_pending_count, memory_order_acquire); + if (fq == 0 && pending == 0) break; + usleep(TDB_COMPACTION_FLUSH_WAIT_SLEEP_US); + } + + /* we drain compaction queue completely */ + for (int i = 0; i < TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS * 4; i++) + { + const size_t cq = db->compaction_queue ? 
/**
 * tidesdb_range_cost_key_prefix
 * packs the first eight bytes of a key into a uint64, most significant byte
 * first; keys shorter than eight bytes are padded with zero bytes
 * @param key key bytes (not read when key_size is 0)
 * @param key_size key size in bytes
 * @return packed prefix value
 */
static uint64_t tidesdb_range_cost_key_prefix(const uint8_t *key, const size_t key_size)
{
    const size_t prefix_bytes = 8;
    uint64_t packed = 0;

    for (size_t pos = 0; pos < prefix_bytes; pos++)
    {
        const uint64_t byte = (pos < key_size) ? key[pos] : 0;
        packed = (packed << 8) | byte;
    }

    return packed;
}

/**
 * tidesdb_range_cost_key_fraction
 * estimates which fraction of an sstable's key range is covered by [lo, hi]
 * by interpolating on the packed eight-byte prefixes of the four keys. only
 * those leading bytes are looked at, so the result is a coarse O(1) estimate
 * @param lo lower bound key
 * @param lo_size lower bound key size
 * @param hi upper bound key
 * @param hi_size upper bound key size
 * @param sst_min sstable min key
 * @param sst_min_size sstable min key size
 * @param sst_max sstable max key
 * @param sst_max_size sstable max key size
 * @return fraction in [0.0, 1.0]; 1.0 when the sstable prefixes do not form an
 *         increasing range, 0.0 when the clamped query range is empty
 */
static double tidesdb_range_cost_key_fraction(const uint8_t *lo, const size_t lo_size,
                                              const uint8_t *hi, const size_t hi_size,
                                              const uint8_t *sst_min, const size_t sst_min_size,
                                              const uint8_t *sst_max, const size_t sst_max_size)
{
    const uint64_t range_start = tidesdb_range_cost_key_prefix(sst_min, sst_min_size);
    const uint64_t range_end = tidesdb_range_cost_key_prefix(sst_max, sst_max_size);

    /* degenerate sstable range, assume a full scan */
    if (range_end <= range_start)
    {
        return 1.0;
    }

    /* the query bounds are clamped into the sstable range */
    uint64_t query_start = tidesdb_range_cost_key_prefix(lo, lo_size);
    uint64_t query_end = tidesdb_range_cost_key_prefix(hi, hi_size);
    if (query_start < range_start)
    {
        query_start = range_start;
    }
    if (query_end > range_end)
    {
        query_end = range_end;
    }
    if (query_end <= query_start)
    {
        return 0.0;
    }

    const double covered =
        (double)(query_end - query_start) / (double)(range_end - range_start);

    if (covered > 1.0)
    {
        return 1.0;
    }
    if (covered < 0.0)
    {
        return 0.0;
    }
    return covered;
}
the array is + * calloc(capacity + 1) and NULL terminated, so the NULL-bounded loop + * cannot run off the end */ + atomic_fetch_add_explicit(&level->array_readers, 1, memory_order_acq_rel); + tidesdb_sstable_t **sstables = atomic_load_explicit(&level->sstables, memory_order_acquire); + + for (int j = 0; sstables[j] != NULL; j++) + { + tidesdb_sstable_t *sst = sstables[j]; + + if (!sst->min_key || !sst->max_key) continue; + + /* we check range overlap; we skip if [lo, hi] does not intersect [min_key, max_key] */ + const int lo_vs_max = + comparator_fn(lo, lo_size, sst->max_key, sst->max_key_size, comparator_ctx); + if (lo_vs_max > 0) continue; /* lo is past this sstable */ + + const int hi_vs_min = + comparator_fn(hi, hi_size, sst->min_key, sst->min_key_size, comparator_ctx); + if (hi_vs_min < 0) continue; /* hi is before this sstable */ + + overlapping_sources++; + + /* we estimate the number of blocks in range */ + double est_blocks; + const double compression_weight = + (sst->config && sst->config->compression_algorithm != TDB_COMPRESS_NONE) + ? TDB_RANGE_COST_COMPRESSION_WEIGHT + : 1.0; + + if (sst->block_indexes && sst->block_indexes->count > 0) + { + /* we use block index slots to estimate block span */ + int64_t slot_a = 0, slot_b = 0; + const int found_a = + compact_block_index_find_slot(sst->block_indexes, lo, lo_size, &slot_a); + const int found_b = + compact_block_index_find_slot(sst->block_indexes, hi, hi_size, &slot_b); + + if (found_a == 0 && found_b == 0) + { + int64_t sampled_blocks = (slot_b - slot_a) + 1; + if (sampled_blocks < 1) sampled_blocks = 1; + + /* we scale by index_sample_ratio to get actual block count */ + const int sample_ratio = (sst->config && sst->config->index_sample_ratio > 0) + ? 
sst->config->index_sample_ratio + : 1; + est_blocks = (double)sampled_blocks * (double)sample_ratio; + + /* we clamp to actual block count */ + if (est_blocks > (double)sst->num_klog_blocks) + est_blocks = (double)sst->num_klog_blocks; + } + else + { + /* we fallback to full sstable if slot search failed */ + est_blocks = (double)sst->num_klog_blocks; + } + } + else if (sst->use_btree) + { + /** for btree sstables without block indexes we estimate from tree metadata + * leaf nodes are the data-bearing nodes; fraction of them is our cost proxy */ + const double fraction = tidesdb_range_cost_key_fraction( + lo, lo_size, hi, hi_size, sst->min_key, sst->min_key_size, sst->max_key, + sst->max_key_size); + + /* we use node_count as proxy for blocks (leaf nodes dominate) */ + est_blocks = fraction * (double)sst->btree_node_count; + if (est_blocks < 1.0 && fraction > 0.0) est_blocks = 1.0; + + /* we add btree height as seek cost per overlapping btree sst */ + total_cost += (double)sst->btree_height; + } + else + { + /* no block indexes -- we use key-fraction interpolation */ + const double fraction = tidesdb_range_cost_key_fraction( + lo, lo_size, hi, hi_size, sst->min_key, sst->min_key_size, sst->max_key, + sst->max_key_size); + + est_blocks = fraction * (double)sst->num_klog_blocks; + if (est_blocks < 1.0 && fraction > 0.0) est_blocks = 1.0; + } + + /* we estimate entries from block fraction */ + const double block_fraction = + (sst->num_klog_blocks > 0) ? 
est_blocks / (double)sst->num_klog_blocks : 1.0; + const double est_entries = (double)sst->num_entries * block_fraction; + + /* we accumulate cost--block I/O dominates, entries are cheap in comparison */ + total_cost += est_blocks * compression_weight; /* block read + decompress */ + total_cost += est_entries * TDB_RANGE_COST_PER_ENTRY_WEIGHT; /* per-entry processing */ + } + atomic_fetch_sub_explicit(&level->array_readers, 1, memory_order_release); + } + + /* we add merge overhead -- more overlapping sources means more heap operations */ + total_cost += (double)overlapping_sources * TDB_RANGE_COST_PER_SOURCE_WEIGHT; + + /* we add memtable contribution (small, in-memory, but included for completeness) */ + tidesdb_memtable_t *active_mt_struct = + atomic_load_explicit(&cf->active_memtable, memory_order_acquire); + if (active_mt_struct && active_mt_struct->skip_list) + { + const int mt_entries = skip_list_count_entries(active_mt_struct->skip_list); + if (mt_entries > 0) + { + /*** we estimate fraction of memtable covered using skip_list min/max + ** memtables dont have min/max keys readily available, so we use a + * conservative estimate. 
we scale by total entries with small weight */ + total_cost += (double)mt_entries * TDB_RANGE_COST_MEMTABLE_WEIGHT; + } + } + + *cost = total_cost; + return TDB_SUCCESS; +} + +int tidesdb_get_cache_stats(tidesdb_t *db, tidesdb_cache_stats_t *stats) +{ + if (!db || !stats) return TDB_ERR_INVALID_ARGS; + + memset(stats, 0, sizeof(tidesdb_cache_stats_t)); + + if (!db->clock_cache) + { + stats->enabled = 0; + return TDB_SUCCESS; + } + + stats->enabled = 1; + + clock_cache_stats_t cache_stats; + clock_cache_get_stats(db->clock_cache, &cache_stats); + + stats->total_entries = cache_stats.total_entries; + stats->total_bytes = cache_stats.total_bytes; + stats->hits = cache_stats.hits; + stats->misses = cache_stats.misses; + stats->hit_rate = cache_stats.hit_rate; + stats->num_partitions = cache_stats.num_partitions; + + return TDB_SUCCESS; +} + +typedef enum +{ + TDB_BACKUP_COPY_IMMUTABLE = 1, + TDB_BACKUP_COPY_FINAL = 2 +} tidesdb_backup_copy_mode_t; + +/** + * tidesdb_backup_is_sstable_file + * checks if a filename is an sstable file (.klog or .vlog) + * @param name filename to check + * @return 1 if sstable file, 0 otherwise + */ +static int tidesdb_backup_is_sstable_file(const char *name) +{ + if (!name) return 0; + const char *ext = strrchr(name, '.'); + if (!ext) return 0; + return (strcmp(ext, TDB_SSTABLE_KLOG_EXT) == 0 || strcmp(ext, TDB_SSTABLE_VLOG_EXT) == 0); +} + +/** + * tidesdb_backup_is_wal_file + * checks if a filename is a WAL file (wal_*.log) + * @param name filename to check + * @return 1 if WAL file, 0 otherwise + */ +static int tidesdb_backup_is_wal_file(const char *name) +{ + if (!name) return 0; + const size_t name_len = strlen(name); + const size_t prefix_len = strlen(TDB_WAL_PREFIX); + const size_t ext_len = strlen(TDB_WAL_EXT); + if (name_len <= prefix_len + ext_len) return 0; + if (strncmp(name, TDB_WAL_PREFIX, prefix_len) != 0) return 0; + if (strcmp(name + name_len - ext_len, TDB_WAL_EXT) != 0) return 0; + return 1; +} + +/** + * 
tidesdb_backup_sstable_in_manifest + * checks if an sstable file is tracked in the column family manifest + * @param cf column family + * @param name sstable filename + * @return 1 if in manifest, 0 otherwise + */ +static int tidesdb_backup_sstable_in_manifest(const tidesdb_column_family_t *cf, const char *name) +{ + if (!cf || !cf->manifest || !name) return 0; + + int level_num = 0; + int partition_num = 0; + unsigned long long sst_id_ull = 0; + + if (tdb_parse_sstable_partitioned(name, &level_num, &partition_num, &sst_id_ull)) + { + return tidesdb_manifest_has_sstable(cf->manifest, level_num, (uint64_t)sst_id_ull); + } + + if (tdb_parse_sstable_non_partitioned(name, &level_num, &sst_id_ull)) + { + return tidesdb_manifest_has_sstable(cf->manifest, level_num, (uint64_t)sst_id_ull); + } + + return 0; +} + +/** + * tidesdb_backup_copy_file + * copies a single file from source to destination + * @param src_path source file path + * @param dst_path destination file path + * @return TDB_SUCCESS or TDB_ERR_IO + */ +static int tidesdb_backup_copy_file(const char *src_path, const char *dst_path) +{ + FILE *src = tdb_fopen(src_path, TDB_BUP_CPY_FILE_SRC_MODE); + if (!src) + { + /*** ENOENT file was deleted between readdir/stat and fopen + ** EACCES on Windows, file may be in NTFS "delete pending" state + * from concurrent compaction -- we treat as transient */ + if (errno == ENOENT || errno == EACCES) return TDB_SUCCESS; + return TDB_ERR_IO; + } + + FILE *dst = tdb_fopen(dst_path, TDB_BUP_CPY_FILE_DST_MODE); + if (!dst) + { + fclose(src); + return TDB_ERR_IO; + } + + char buffer[TDB_BACKUP_COPY_BUFFER_SIZE]; + size_t bytes_read = 0; + int result = TDB_SUCCESS; + + while ((bytes_read = fread(buffer, 1, sizeof(buffer), src)) > 0) + { + if (fwrite(buffer, 1, bytes_read, dst) != bytes_read) + { + result = TDB_ERR_IO; + break; + } + } + + if (ferror(src)) result = TDB_ERR_IO; + + if (fflush(dst) != 0) result = TDB_ERR_IO; + + if (fclose(dst) != 0) result = TDB_ERR_IO; + 
fclose(src); + + return result; +} + +/** + * tidesdb_backup_copy_dir + * copies a column family directory to backup destination + * @param src_dir source directory path + * @param dst_dir destination directory path + * @param mode copy mode (immutable or final) + * @param cf column family for manifest checks + * @return TDB_SUCCESS or error code + */ +static int tidesdb_backup_copy_dir(const char *src_dir, const char *dst_dir, + const tidesdb_backup_copy_mode_t mode, + const tidesdb_column_family_t *cf) +{ + struct STAT_STRUCT dst_st; + if (STAT_FUNC(dst_dir, &dst_st) != 0) + { + if (mkdir(dst_dir, TDB_DIR_PERMISSIONS) != 0) + { + return TDB_ERR_IO; + } + } + else if (!S_ISDIR(dst_st.st_mode)) + { + return TDB_ERR_IO; + } + + DIR *dir = opendir(src_dir); + if (!dir) return TDB_ERR_IO; + + struct dirent *entry; + int result = TDB_SUCCESS; + + while ((entry = readdir(dir)) != NULL) + { + if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) continue; + if (strcmp(entry->d_name, TDB_LOCK_FILE) == 0) continue; + + const size_t src_len = strlen(src_dir) + strlen(PATH_SEPARATOR) + strlen(entry->d_name) + 1; + const size_t dst_len = strlen(dst_dir) + strlen(PATH_SEPARATOR) + strlen(entry->d_name) + 1; + char *src_path = malloc(src_len); + char *dst_path = malloc(dst_len); + if (!src_path || !dst_path) + { + free(src_path); + free(dst_path); + result = TDB_ERR_MEMORY; + break; + } + + snprintf(src_path, src_len, "%s%s%s", src_dir, PATH_SEPARATOR, entry->d_name); + snprintf(dst_path, dst_len, "%s%s%s", dst_dir, PATH_SEPARATOR, entry->d_name); + + struct STAT_STRUCT src_st; + if (STAT_FUNC(src_path, &src_st) != 0) + { + if (errno != ENOENT && errno != EACCES) result = TDB_ERR_IO; + free(src_path); + free(dst_path); + if (result != TDB_SUCCESS) break; + continue; + } + + if (S_ISDIR(src_st.st_mode)) + { + result = tidesdb_backup_copy_dir(src_path, dst_path, mode, cf); + } + else + { + const int is_sstable = tidesdb_backup_is_sstable_file(entry->d_name); + 
const int is_wal = tidesdb_backup_is_wal_file(entry->d_name); + int should_copy = 0; + + if (mode == TDB_BACKUP_COPY_IMMUTABLE) + { + if (is_wal) + { + should_copy = 0; + } + else if (is_sstable) + { + should_copy = tidesdb_backup_sstable_in_manifest(cf, entry->d_name); + } + else + { + should_copy = 1; + } + } + else + { + if (is_sstable) + { + struct STAT_STRUCT existing_st; + if (STAT_FUNC(dst_path, &existing_st) != 0) + { + should_copy = 1; + } + } + else + { + should_copy = 1; + } + } + + if (should_copy) result = tidesdb_backup_copy_file(src_path, dst_path); + } + + free(src_path); + free(dst_path); + + if (result != TDB_SUCCESS) break; + } + + closedir(dir); + return result; +} + +/** + * tidesdb_backup_copy_all_cfs + * copies all column family directories to backup destination + * @param db database handle + * @param dir backup destination directory + * @param mode copy mode (immutable or final) + * @return TDB_SUCCESS or error code + */ +static int tidesdb_backup_copy_all_cfs(tidesdb_t *db, const char *dir, + const tidesdb_backup_copy_mode_t mode) +{ + int result = TDB_SUCCESS; + + pthread_rwlock_rdlock(&db->cf_list_lock); + for (int i = 0; i < db->num_column_families; i++) + { + tidesdb_column_family_t *cf = db->column_families[i]; + if (!cf) continue; + + char dst_dir[TDB_MAX_PATH_LEN]; + const int needed = + snprintf(dst_dir, sizeof(dst_dir), "%s" PATH_SEPARATOR "%s", dir, cf->name); + if (needed < 0 || (size_t)needed >= sizeof(dst_dir)) + { + result = TDB_ERR_IO; + break; + } + + result = tidesdb_backup_copy_dir(cf->directory, dst_dir, mode, cf); + if (result != TDB_SUCCESS) break; + } + pthread_rwlock_unlock(&db->cf_list_lock); + + return result; +} + +int tidesdb_backup(tidesdb_t *db, char *dir) +{ + if (!db || !dir) return TDB_ERR_INVALID_ARGS; + + const int wait_result = wait_for_open(db); + if (wait_result != TDB_SUCCESS) return wait_result; + + if (strcmp(db->db_path, dir) == 0) return TDB_ERR_INVALID_ARGS; + + struct STAT_STRUCT st; + if 
(STAT_FUNC(dir, &st) == 0) + { + if (!S_ISDIR(st.st_mode)) return TDB_ERR_INVALID_ARGS; + if (!is_directory_empty(dir)) return TDB_ERR_EXISTS; + } + else + { + if (mkdir(dir, TDB_DIR_PERMISSIONS) != 0) return TDB_ERR_IO; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Starting backup to directory '%s'", dir); + + /* we pause compaction for the whole backup so the file copy cannot race a + * compaction rewriting the manifest + sstable set into an inconsistent + * pair that recovery from the backup would then reject. */ + pthread_mutex_lock(&db->compaction_gate_lock); + db->compaction_paused = 1; + pthread_mutex_unlock(&db->compaction_gate_lock); + + int result = tidesdb_backup_copy_all_cfs(db, dir, TDB_BACKUP_COPY_IMMUTABLE); + if (result != TDB_SUCCESS) goto backup_unpause; + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Flushing memtables before final backup copy"); + pthread_rwlock_rdlock(&db->cf_list_lock); + for (int i = 0; i < db->num_column_families; i++) + { + tidesdb_column_family_t *cf = db->column_families[i]; + if (!cf) continue; + + int wait_count = 0; + while (tidesdb_is_flushing(cf) && wait_count < TDB_CLOSE_FLUSH_WAIT_MAX_ATTEMPTS) + { + usleep(TDB_CLOSE_FLUSH_WAIT_SLEEP_US); + wait_count++; + } + + result = tidesdb_flush_memtable_internal(cf, 0, 1); + if (result != TDB_SUCCESS) + { + pthread_rwlock_unlock(&db->cf_list_lock); + goto backup_unpause; + } + } + pthread_rwlock_unlock(&db->cf_list_lock); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Waiting for background flushes to complete"); + int flush_wait_count = 0; + pthread_rwlock_rdlock(&db->cf_list_lock); + while (1) + { + int any_flushing = 0; + size_t queue_size_val = 0; + + for (int i = 0; i < db->num_column_families; i++) + { + if (db->column_families[i]) + { + if (tidesdb_is_flushing(db->column_families[i])) + { + any_flushing = 1; + break; + } + } + } + + if (db->flush_queue) + { + queue_size_val = queue_size(db->flush_queue); + } + + if (!any_flushing && queue_size_val == 0) + { + break; + } + + if (flush_wait_count % 1000 == 
0 && flush_wait_count > 0) + { + TDB_DEBUG_LOG( + TDB_LOG_INFO, + "Still waiting for background flushes (waited %d seconds, queue_size=%zu)", + flush_wait_count / 1000, queue_size_val); + } + + pthread_rwlock_unlock(&db->cf_list_lock); + usleep(TDB_CLOSE_TXN_WAIT_SLEEP_US); + flush_wait_count++; + pthread_rwlock_rdlock(&db->cf_list_lock); + } + pthread_rwlock_unlock(&db->cf_list_lock); + + /* compaction is paused, so no new compaction can start. we drain the + * compactions that were already past the gate when we paused so the final + * copy sees a stable manifest + sstable set. */ + TDB_DEBUG_LOG(TDB_LOG_INFO, "Waiting for in-progress compactions to complete"); + while (atomic_load_explicit(&db->active_compactions, memory_order_acquire) > 0) + { + usleep(TDB_CLOSE_TXN_WAIT_SLEEP_US); + } + + result = tidesdb_backup_copy_all_cfs(db, dir, TDB_BACKUP_COPY_FINAL); + if (result == TDB_SUCCESS) + TDB_DEBUG_LOG(TDB_LOG_INFO, "Backup completed successfully in '%s'", dir); + +backup_unpause: + pthread_mutex_lock(&db->compaction_gate_lock); + db->compaction_paused = 0; + pthread_mutex_unlock(&db->compaction_gate_lock); + return result; +} + +/** + * tidesdb_checkpoint_link_or_copy + * try to hard link a file, falling back to copy if hard linking fails + * (e.g., cross-filesystem) + * @param src source file path + * @param dst destination file path + * @return TDB_SUCCESS on success, error code on failure + */ +static int tidesdb_checkpoint_link_or_copy(const char *src, const char *dst) +{ + if (tdb_hardlink(src, dst) == 0) + { + return TDB_SUCCESS; + } + + return tidesdb_backup_copy_file(src, dst); +} + +/** + * tidesdb_checkpoint_ensure_parent_dir + * ensure the parent directory of a file path exists, creating it recursively if needed + * @param file_path the file path whose parent directory should exist + * @return TDB_SUCCESS on success, TDB_ERR_IO on failure + */ +static int tidesdb_checkpoint_ensure_parent_dir(const char *file_path) +{ + if (!file_path) return 
TDB_ERR_INVALID_ARGS; + + char *path_copy = tdb_strdup(file_path); + if (!path_copy) return TDB_ERR_MEMORY; + + char *start = path_copy + 1; +#ifdef _WIN32 + /* we skip drive letter prefix (e.g., "C:\") */ + if (((path_copy[0] >= 'A' && path_copy[0] <= 'Z') || + (path_copy[0] >= 'a' && path_copy[0] <= 'z')) && + path_copy[1] == ':' && path_copy[2] == PATH_SEPARATOR[0]) + { + start = path_copy + 3; + } +#endif + + /* we walk from the end to find each directory component and create it */ + for (char *p = start; *p; p++) + { + if (*p == PATH_SEPARATOR[0]) + { + *p = '\0'; + struct STAT_STRUCT st; + if (STAT_FUNC(path_copy, &st) != 0) + { + if (mkdir(path_copy, TDB_DIR_PERMISSIONS) != 0 && errno != EEXIST) + { + free(path_copy); + return TDB_ERR_IO; + } + } + *p = PATH_SEPARATOR[0]; + } + } + + free(path_copy); + return TDB_SUCCESS; +} + +int tidesdb_checkpoint(tidesdb_t *db, const char *checkpoint_dir) +{ + if (!db || !checkpoint_dir) return TDB_ERR_INVALID_ARGS; + + const int wait_result = wait_for_open(db); + if (wait_result != TDB_SUCCESS) return wait_result; + + if (strcmp(db->db_path, checkpoint_dir) == 0) return TDB_ERR_INVALID_ARGS; + + /* we create the checkpoint directory */ + struct STAT_STRUCT st; + if (STAT_FUNC(checkpoint_dir, &st) == 0) + { + if (!S_ISDIR(st.st_mode)) return TDB_ERR_INVALID_ARGS; + if (!is_directory_empty(checkpoint_dir)) return TDB_ERR_EXISTS; + } + else + { + if (mkdir(checkpoint_dir, TDB_DIR_PERMISSIONS) != 0) return TDB_ERR_IO; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Starting checkpoint to directory '%s'", checkpoint_dir); + + int result = TDB_SUCCESS; + + pthread_rwlock_rdlock(&db->cf_list_lock); + const int num_cfs = db->num_column_families; + pthread_rwlock_unlock(&db->cf_list_lock); + + for (int cf_idx = 0; cf_idx < num_cfs; cf_idx++) + { + pthread_rwlock_rdlock(&db->cf_list_lock); + if (cf_idx >= db->num_column_families) + { + pthread_rwlock_unlock(&db->cf_list_lock); + break; + } + tidesdb_column_family_t *cf = 
db->column_families[cf_idx]; + pthread_rwlock_unlock(&db->cf_list_lock); + + if (!cf) continue; + if (atomic_load_explicit(&cf->marked_for_deletion, memory_order_acquire)) continue; + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Checkpoint processing CF '%s'", cf->name); + + /*** we force flush memtable so all data is in sstables. + ** we retry in a loop because tidesdb_flush_memtable_internal uses a CAS on + * is_flushing -- if another thread (e.g. memory-pressure flush) holds it, + ** the call returns TDB_SUCCESS without actually flushing. we must keep + *** retrying until the active memtable is truly empty! */ + for (int flush_attempt = 0; flush_attempt < TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS * 4; + flush_attempt++) + { + /* we wait for any in-flight flush to finish first */ + for (int i = 0; i < TDB_CLOSE_FLUSH_WAIT_MAX_ATTEMPTS; i++) + { + if (!tidesdb_is_flushing(cf)) break; + usleep(TDB_CLOSE_FLUSH_WAIT_SLEEP_US); + } + + /* we check if memtable is already empty (flushed by another thread). + * pin the active under cf->active_mt_readers so a concurrent flush + * worker draining a just-rotated immutable cannot free the struct + * between our load and the skip_list deref */ + tidesdb_memtable_t *mt = NULL; + const int mt_pinned = + tidesdb_active_memtable_try_ref(&cf->active_mt_readers, &cf->active_memtable, &mt); + const int empty = + !mt_pinned || !mt->skip_list || skip_list_count_entries(mt->skip_list) == 0; + if (mt_pinned) tidesdb_immutable_memtable_unref(mt); + if (empty) break; + + result = tidesdb_flush_memtable_internal(cf, 0, 1); + if (result != TDB_SUCCESS && result != TDB_ERR_MEMORY) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Checkpoint flush failed for CF '%s' (err=%d)", + cf->name, result); + return result; + } + + /** we wait for flush to complete, we check queue, admission flag, and worker busy + * to ensure the flush worker has fully finished I/O (not just dequeued) */ + for (int i = 0; i < TDB_COMPACTION_FLUSH_WAIT_MAX_ATTEMPTS * 2; i++) + { + if 
(queue_size(db->flush_queue) == 0 && + !atomic_load_explicit(&cf->is_flushing, memory_order_acquire) && + atomic_load_explicit(&db->flush_pending_count, memory_order_acquire) == 0) + { + break; + } + usleep(TDB_COMPACTION_FLUSH_WAIT_SLEEP_US); + } + } + + /* we halt compactions for this CF */ + for (int i = 0; i < TDB_CHECKPOINT_COMPACTION_WAIT_MAX_ATTEMPTS; i++) + { + int expected = 0; + if (atomic_compare_exchange_strong_explicit(&cf->is_compacting, &expected, 1, + memory_order_acquire, memory_order_relaxed)) + { + break; + } + /* compaction is running, we wait for it to finish */ + usleep(TDB_CHECKPOINT_COMPACTION_WAIT_SLEEP_US); + } + + /* we commit manifest to ensure it reflects current state */ + if (cf->manifest) + { + tidesdb_manifest_commit(cf->manifest, cf->manifest->path); + } + + /* we create CF directory in checkpoint */ + char cf_checkpoint_dir[TDB_MAX_PATH_LEN]; + snprintf(cf_checkpoint_dir, sizeof(cf_checkpoint_dir), "%s" PATH_SEPARATOR "%s", + checkpoint_dir, cf->name); + if (mkdir(cf_checkpoint_dir, TDB_DIR_PERMISSIONS) != 0 && errno != EEXIST) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Checkpoint failed to create CF dir %s", + cf_checkpoint_dir); + atomic_store_explicit(&cf->is_compacting, 0, memory_order_release); + return TDB_ERR_IO; + } + + /* we hard link all live sstable files */ + const int num_levels = atomic_load_explicit(&cf->num_active_levels, memory_order_acquire); + const size_t cf_dir_len = strlen(cf->directory); + + for (int level = 0; level < num_levels && result == TDB_SUCCESS; level++) + { + tidesdb_level_t *lvl = cf->levels[level]; + if (!lvl) continue; + + tidesdb_sstable_t **sstables = + atomic_load_explicit(&lvl->sstables, memory_order_acquire); + const int num_ssts = atomic_load_explicit(&lvl->num_sstables, memory_order_acquire); + + for (int s = 0; s < num_ssts && result == TDB_SUCCESS; s++) + { + tidesdb_sstable_t *sst = sstables[s]; + if (!sst) continue; + + /** we compute destination paths by replacing cf->directory prefix + * 
with cf_checkpoint_dir */ + const char *klog_rel = sst->klog_path + cf_dir_len; + const char *vlog_rel = sst->vlog_path + cf_dir_len; + + char dst_klog[TDB_MAX_PATH_LEN]; + char dst_vlog[TDB_MAX_PATH_LEN]; + snprintf(dst_klog, sizeof(dst_klog), "%s%s", cf_checkpoint_dir, klog_rel); + snprintf(dst_vlog, sizeof(dst_vlog), "%s%s", cf_checkpoint_dir, vlog_rel); + + /* we ensure level subdirectory exists in checkpoint */ + result = tidesdb_checkpoint_ensure_parent_dir(dst_klog); + if (result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Checkpoint failed to create parent dir for %s", + dst_klog); + break; + } + + /* we hard link klog */ + result = tidesdb_checkpoint_link_or_copy(sst->klog_path, dst_klog); + if (result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Checkpoint failed to link/copy klog %s", + sst->klog_path); + break; + } + + /* we hard link vlog */ + result = tidesdb_checkpoint_link_or_copy(sst->vlog_path, dst_vlog); + if (result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Checkpoint failed to link/copy vlog %s", + sst->vlog_path); + break; + } + + TDB_DEBUG_LOG(TDB_LOG_DEBUG, "Checkpoint linked SSTable %" PRIu64 " on L%d", + sst->id, level + 1); + } + } + + /* we copy manifest file (small) */ + if (result == TDB_SUCCESS && cf->manifest) + { + char src_manifest[TDB_MAX_PATH_LEN]; + char dst_manifest[TDB_MAX_PATH_LEN]; + int n = snprintf(src_manifest, sizeof(src_manifest), "%s" PATH_SEPARATOR "%s", + cf->directory, TDB_COLUMN_FAMILY_MANIFEST_NAME); + if (n < 0 || (size_t)n >= sizeof(src_manifest)) + { + result = TDB_ERR_IO; + } + else + { + n = snprintf(dst_manifest, sizeof(dst_manifest), "%s" PATH_SEPARATOR "%s", + cf_checkpoint_dir, TDB_COLUMN_FAMILY_MANIFEST_NAME); + if (n < 0 || (size_t)n >= sizeof(dst_manifest)) + { + result = TDB_ERR_IO; + } + else + { + result = tidesdb_backup_copy_file(src_manifest, dst_manifest); + } + } + if (result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Checkpoint failed to copy manifest for CF 
'%s'", + cf->name); + } + } + + /* we copy config file (small) */ + if (result == TDB_SUCCESS) + { + char src_config[TDB_MAX_PATH_LEN]; + char dst_config[TDB_MAX_PATH_LEN]; + int n = snprintf( + src_config, sizeof(src_config), + "%s" PATH_SEPARATOR TDB_COLUMN_FAMILY_CONFIG_NAME TDB_COLUMN_FAMILY_CONFIG_EXT, + cf->directory); + if (n < 0 || (size_t)n >= sizeof(src_config)) + { + result = TDB_ERR_IO; + } + else + { + n = snprintf( + dst_config, sizeof(dst_config), + "%s" PATH_SEPARATOR TDB_COLUMN_FAMILY_CONFIG_NAME TDB_COLUMN_FAMILY_CONFIG_EXT, + cf_checkpoint_dir); + if (n < 0 || (size_t)n >= sizeof(dst_config)) + { + result = TDB_ERR_IO; + } + else + { + result = tidesdb_backup_copy_file(src_config, dst_config); + } + } + if (result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Checkpoint failed to copy config for CF '%s'", + cf->name); + } + } + + /* we resume compactions */ + atomic_store_explicit(&cf->is_compacting, 0, memory_order_release); + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Checkpoint for CF '%s' done (levels=%d, result=%d)", cf->name, + num_levels, result); + + if (result != TDB_SUCCESS) return result; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Checkpoint completed successfully %s", checkpoint_dir); + return TDB_SUCCESS; +} + +/** + * tidesdb_clone_copy_cf_dir + * copy a column family directory to a new location, copying all files + * @param src_dir source directory + * @param dst_dir destination directory + * @return TDB_SUCCESS on success, error code on failure + */ +static int tidesdb_clone_copy_cf_dir(const char *src_dir, const char *dst_dir) +{ + struct STAT_STRUCT dst_st; + if (STAT_FUNC(dst_dir, &dst_st) != 0) + { + if (mkdir(dst_dir, TDB_DIR_PERMISSIONS) != 0) + { + return TDB_ERR_IO; + } + } + else if (!S_ISDIR(dst_st.st_mode)) + { + return TDB_ERR_IO; + } + + DIR *dir = opendir(src_dir); + if (!dir) return TDB_ERR_IO; + + struct dirent *entry; + int result = TDB_SUCCESS; + + while ((entry = readdir(dir)) != NULL) + { + if (strcmp(entry->d_name, ".") 
== 0 || strcmp(entry->d_name, "..") == 0) continue; + if (strcmp(entry->d_name, TDB_LOCK_FILE) == 0) continue; + + /* we skip WAL files -- we don't want to copy uncommitted data */ + if (tidesdb_backup_is_wal_file(entry->d_name)) continue; + + const size_t src_len = strlen(src_dir) + strlen(PATH_SEPARATOR) + strlen(entry->d_name) + 1; + const size_t dst_len = strlen(dst_dir) + strlen(PATH_SEPARATOR) + strlen(entry->d_name) + 1; + char *src_path = malloc(src_len); + char *dst_path = malloc(dst_len); + if (!src_path || !dst_path) + { + free(src_path); + free(dst_path); + result = TDB_ERR_MEMORY; + break; + } + + snprintf(src_path, src_len, "%s%s%s", src_dir, PATH_SEPARATOR, entry->d_name); + snprintf(dst_path, dst_len, "%s%s%s", dst_dir, PATH_SEPARATOR, entry->d_name); + + struct STAT_STRUCT src_st; + if (STAT_FUNC(src_path, &src_st) != 0) + { + if (errno != ENOENT && errno != EACCES) result = TDB_ERR_IO; + free(src_path); + free(dst_path); + if (result != TDB_SUCCESS) break; + continue; + } + + if (S_ISDIR(src_st.st_mode)) + { + result = tidesdb_clone_copy_cf_dir(src_path, dst_path); + } + else + { + result = tidesdb_backup_copy_file(src_path, dst_path); + } + + free(src_path); + free(dst_path); + + if (result != TDB_SUCCESS) break; + } + + closedir(dir); + return result; +} + +int tidesdb_clone_column_family(tidesdb_t *db, const char *src_name, const char *dst_name) +{ + if (!db || !src_name || !dst_name) return TDB_ERR_INVALID_ARGS; + + const int wait_result = wait_for_open(db); + if (wait_result != TDB_SUCCESS) return wait_result; + + /* we validate names are different */ + if (strcmp(src_name, dst_name) == 0) return TDB_ERR_INVALID_ARGS; + + /* we check destination doesn't already exist */ + pthread_rwlock_rdlock(&db->cf_list_lock); + for (int i = 0; i < db->num_column_families; i++) + { + if (db->column_families[i] && strcmp(db->column_families[i]->name, dst_name) == 0) + { + pthread_rwlock_unlock(&db->cf_list_lock); + TDB_DEBUG_LOG(TDB_LOG_WARN, "Clone 
destination CF '%s' already exists", dst_name); + return TDB_ERR_EXISTS; + } + } + pthread_rwlock_unlock(&db->cf_list_lock); + + tidesdb_column_family_t *src_cf = tidesdb_get_column_family(db, src_name); + if (!src_cf) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Clone source CF '%s' not found", src_name); + return TDB_ERR_NOT_FOUND; + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Cloning column family '%s' to '%s'", src_name, dst_name); + + /* we wait for any in-progress flush to complete (check flag, queue, and worker busy) */ + int wait_count = 0; + while (tidesdb_is_flushing(src_cf) && wait_count < TDB_CLOSE_FLUSH_WAIT_MAX_ATTEMPTS) + { + usleep(TDB_CLOSE_FLUSH_WAIT_SLEEP_US); + wait_count++; + } + + /* we flush the source memtable to ensure all data is on disk */ + int result = tidesdb_flush_memtable_internal(src_cf, 0, 1); + if (result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to flush source CF '%s' before clone", src_name); + return result; + } + + /* we wait for flush I/O to complete (queue drained and worker idle) */ + wait_count = 0; + while (tidesdb_is_flushing(src_cf) && wait_count < TDB_CLOSE_FLUSH_WAIT_MAX_ATTEMPTS) + { + usleep(TDB_CLOSE_FLUSH_WAIT_SLEEP_US); + wait_count++; + } + + /* we wait for any in-progress compaction to complete */ + wait_count = 0; + while (tidesdb_is_compacting(src_cf) && wait_count < TDB_CLOSE_FLUSH_WAIT_MAX_ATTEMPTS) + { + usleep(TDB_CLOSE_FLUSH_WAIT_SLEEP_US); + wait_count++; + } + + char dst_dir[TDB_MAX_PATH_LEN]; + snprintf(dst_dir, sizeof(dst_dir), "%s" PATH_SEPARATOR "%s", db->db_path, dst_name); + + /* we check destination directory doesn't exist */ + struct STAT_STRUCT st; + if (STAT_FUNC(dst_dir, &st) == 0) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Clone destination directory '%s' already exists", dst_dir); + return TDB_ERR_EXISTS; + } + + /* we copy all files from source to destination */ + result = tidesdb_clone_copy_cf_dir(src_cf->directory, dst_dir); + if (result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed 
to copy CF directory from '%s' to '%s'", + src_cf->directory, dst_dir); + /* we attempt cleanup */ + remove_directory(dst_dir); + return result; + } + + /* we update config.ini with new path */ + char config_path[TDB_MAX_PATH_LEN]; + const int config_written = snprintf( + config_path, sizeof(config_path), + "%s" PATH_SEPARATOR TDB_COLUMN_FAMILY_CONFIG_NAME TDB_COLUMN_FAMILY_CONFIG_EXT, dst_dir); + + if (config_written < 0 || (size_t)config_written >= sizeof(config_path)) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Config path too long for cloned CF '%s'", dst_name); + remove_directory(dst_dir); + return TDB_ERR_INVALID_ARGS; + } + + result = tidesdb_cf_config_save_to_ini(config_path, dst_name, &src_cf->config); + if (result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Failed to save config for cloned CF '%s' (error: %d)", + dst_name, result); + /* non-fatal, continue */ + } + + tdb_sync_directory(dst_dir); + + /* we create the new column family structure by loading from disk */ + tidesdb_column_family_config_t clone_config = src_cf->config; + + /* we clear cached comparator pointers -- they will be re-resolved */ + clone_config.comparator_fn_cached = NULL; + clone_config.comparator_ctx_cached = NULL; + + result = tidesdb_create_column_family(db, dst_name, &clone_config); + if (result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to create cloned CF structure '%s' (error: %d)", + dst_name, result); + remove_directory(dst_dir); + return result; + } + + /* we get the newly created CF and recover its sstables */ + tidesdb_column_family_t *dst_cf = tidesdb_get_column_family(db, dst_name); + if (dst_cf) + { + /* we recover ssts from the copied files */ + result = tidesdb_recover_sstables(dst_cf); + if (result != TDB_SUCCESS) + { + TDB_DEBUG_LOG(TDB_LOG_ERROR, "Failed to recover SSTables for cloned CF '%s'", dst_name); + /* CF is created but may be incomplete. the user should drop and retry.. 
*/ + return result; + } + + /** we update next_sstable_id to prevent overwriting recovered sstables + * we scan all levels to find the maximum sstable ID */ + uint64_t max_sst_id = 0; + const int num_levels = + atomic_load_explicit(&dst_cf->num_active_levels, memory_order_acquire); + for (int level_idx = 0; level_idx < num_levels; level_idx++) + { + tidesdb_level_t *level = dst_cf->levels[level_idx]; + if (!level) continue; + + tidesdb_sstable_t **sstables = + atomic_load_explicit(&level->sstables, memory_order_acquire); + const int num_ssts = atomic_load_explicit(&level->num_sstables, memory_order_acquire); + + for (int sst_idx = 0; sst_idx < num_ssts; sst_idx++) + { + tidesdb_sstable_t *sst = sstables[sst_idx]; + if (sst && sst->id >= max_sst_id) + { + max_sst_id = sst->id + 1; + } + } + } + + if (max_sst_id > atomic_load(&dst_cf->next_sstable_id)) + { + atomic_store(&dst_cf->next_sstable_id, max_sst_id); + TDB_DEBUG_LOG(TDB_LOG_INFO, + "CF '%s' updated next_sstable_id to %" PRIu64 " after clone", dst_name, + max_sst_id); + } + + TDB_DEBUG_LOG(TDB_LOG_INFO, "Successfully cloned CF '%s' to '%s'", src_name, dst_name); + } + + return TDB_SUCCESS; +} + +/* on-disk config.ini keys -- a single source of truth shared by + * ini_config_handler (load) and tidesdb_cf_config_save_to_ini (save) so a + * typo in one cannot silently desync the pair and drop a field on reload */ +#define TDB_INI_KEY_WRITE_BUFFER_SIZE "write_buffer_size" +#define TDB_INI_KEY_LEVEL_SIZE_RATIO "level_size_ratio" +#define TDB_INI_KEY_MIN_LEVELS "min_levels" +#define TDB_INI_KEY_DIVIDING_LEVEL_OFFSET "dividing_level_offset" +#define TDB_INI_KEY_VALUE_THRESHOLD "value_threshold" +#define TDB_INI_KEY_COMPRESSION_ALGORITHM "compression_algorithm" +#define TDB_INI_KEY_ENABLE_BLOOM_FILTER "enable_bloom_filter" +#define TDB_INI_KEY_BLOOM_FPR "bloom_fpr" +#define TDB_INI_KEY_ENABLE_BLOCK_INDEXES "enable_block_indexes" +#define TDB_INI_KEY_INDEX_SAMPLE_RATIO "index_sample_ratio" +#define 
/**
 * ini_config_context_t
 * INI configuration handler context, passed to ini_config_handler as its
 * user pointer
 * @param config column family config that ini_config_handler writes parsed
 *               values into
 * @param target_section section name each parsed entry is compared against;
 *                       entries from any other section are ignored
 */
typedef struct
{
    tidesdb_column_family_config_t *config;
    const char *target_section;
} ini_config_context_t;
parsing */ + } + + if (strcmp(name, TDB_INI_KEY_WRITE_BUFFER_SIZE) == 0) + { + ctx->config->write_buffer_size = (size_t)strtoll(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_LEVEL_SIZE_RATIO) == 0) + { + ctx->config->level_size_ratio = (size_t)strtoll(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_MIN_LEVELS) == 0) + { + ctx->config->min_levels = (int)strtol(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_DIVIDING_LEVEL_OFFSET) == 0) + { + ctx->config->dividing_level_offset = (int)strtol(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_VALUE_THRESHOLD) == 0) + { + ctx->config->klog_value_threshold = (size_t)strtoll(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_COMPRESSION_ALGORITHM) == 0) + { + if (strcmp(value, TDB_INI_VAL_COMPRESS_NONE) == 0) + ctx->config->compression_algorithm = TDB_COMPRESS_NONE; + else if (strcmp(value, TDB_INI_VAL_COMPRESS_LZ4) == 0) + ctx->config->compression_algorithm = TDB_COMPRESS_LZ4; + else if (strcmp(value, TDB_INI_VAL_COMPRESS_LZ4_FAST) == 0) + ctx->config->compression_algorithm = TDB_COMPRESS_LZ4_FAST; + else if (strcmp(value, TDB_INI_VAL_COMPRESS_ZSTD) == 0) + ctx->config->compression_algorithm = TDB_COMPRESS_ZSTD; +#ifndef __sun + else if (strcmp(value, TDB_INI_VAL_COMPRESS_SNAPPY) == 0) + ctx->config->compression_algorithm = TDB_COMPRESS_SNAPPY; +#endif + } + else if (strcmp(name, TDB_INI_KEY_ENABLE_BLOOM_FILTER) == 0) + { + ctx->config->enable_bloom_filter = (int)strtol(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_BLOOM_FPR) == 0) + { + ctx->config->bloom_fpr = strtod(value, NULL); + } + else if (strcmp(name, TDB_INI_KEY_ENABLE_BLOCK_INDEXES) == 0) + { + ctx->config->enable_block_indexes = (int)strtol(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_INDEX_SAMPLE_RATIO) == 0) + { + ctx->config->index_sample_ratio = (int)strtol(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_BLOCK_INDEX_PREFIX_LEN) == 0) + { + ctx->config->block_index_prefix_len 
= (int)strtol(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_SYNC_MODE) == 0) + { + ctx->config->sync_mode = (int)strtol(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_SYNC_INTERVAL_US) == 0) + { + ctx->config->sync_interval_us = (size_t)strtoll(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_SKIP_LIST_MAX_LEVEL) == 0) + { + ctx->config->skip_list_max_level = (int)strtol(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_SKIP_LIST_PROBABILITY) == 0) + { + ctx->config->skip_list_probability = (float)strtod(value, NULL); + } + else if (strcmp(name, TDB_INI_KEY_DEFAULT_ISOLATION_LEVEL) == 0) + { + const int level = (int)strtol(value, NULL, 10); + if (level >= TDB_ISOLATION_READ_UNCOMMITTED && level <= TDB_ISOLATION_SERIALIZABLE) + { + ctx->config->default_isolation_level = (tidesdb_isolation_level_t)level; + } + } + else if (strcmp(name, TDB_INI_KEY_L1_FILE_COUNT_TRIGGER) == 0) + { + ctx->config->l1_file_count_trigger = (int)strtol(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_L0_QUEUE_STALL_THRESHOLD) == 0) + { + ctx->config->l0_queue_stall_threshold = (int)strtol(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_TOMBSTONE_DENSITY_TRIGGER) == 0) + { + ctx->config->tombstone_density_trigger = strtod(value, NULL); + } + else if (strcmp(name, TDB_INI_KEY_TOMBSTONE_DENSITY_MIN_ENTRIES) == 0) + { + ctx->config->tombstone_density_min_entries = (uint64_t)strtoull(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_MIN_DISK_SPACE) == 0) + { + ctx->config->min_disk_space = (uint64_t)strtoull(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_USE_BTREE) == 0) + { + ctx->config->use_btree = (int)strtol(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_OBJECT_LAZY_COMPACTION) == 0) + { + ctx->config->object_lazy_compaction = (int)strtol(value, NULL, 10); + } + else if (strcmp(name, TDB_INI_KEY_OBJECT_PREFETCH_COMPACTION) == 0) + { + ctx->config->object_prefetch_compaction = (int)strtol(value, NULL, 
10); + } + else if (strcmp(name, TDB_INI_KEY_COMPARATOR_NAME) == 0) + { + strncpy(ctx->config->comparator_name, value, TDB_MAX_COMPARATOR_NAME - 1); + ctx->config->comparator_name[TDB_MAX_COMPARATOR_NAME - 1] = '\0'; + } + else if (strcmp(name, TDB_INI_KEY_COMPARATOR_CTX_STR) == 0) + { + strncpy(ctx->config->comparator_ctx_str, value, TDB_MAX_COMPARATOR_CTX - 1); + ctx->config->comparator_ctx_str[TDB_MAX_COMPARATOR_CTX - 1] = '\0'; + } + + return 1; /* continue parsing */ +} + +int tidesdb_cf_config_load_from_ini(const char *ini_file, const char *section_name, + tidesdb_column_family_config_t *config) +{ + if (!ini_file || !section_name || !config) return TDB_ERR_INVALID_ARGS; + + *config = tidesdb_default_column_family_config(); + + ini_config_context_t ctx = {.config = config, .target_section = section_name}; + + const int result = ini_parse(ini_file, ini_config_handler, &ctx); + if (result < 0) + { + return TDB_ERR_IO; /* failed to open or parse */ + } + if (result > 0) + { + return TDB_ERR_CORRUPTION; + } + + return TDB_SUCCESS; +} + +int tidesdb_cf_config_save_to_ini(const char *ini_file, const char *section_name, + const tidesdb_column_family_config_t *config) +{ + if (!ini_file || !section_name || !config) return TDB_ERR_INVALID_ARGS; + + FILE *fp = fopen(ini_file, TDB_CNF_FILE_MODE); + if (!fp) return TDB_ERR_IO; + + fprintf(fp, "[%s]\n", section_name); + + fprintf(fp, TDB_INI_KEY_WRITE_BUFFER_SIZE " = %zu\n", config->write_buffer_size); + fprintf(fp, TDB_INI_KEY_LEVEL_SIZE_RATIO " = %zu\n", config->level_size_ratio); + fprintf(fp, TDB_INI_KEY_MIN_LEVELS " = %d\n", config->min_levels); + fprintf(fp, TDB_INI_KEY_DIVIDING_LEVEL_OFFSET " = %d\n", config->dividing_level_offset); + fprintf(fp, TDB_INI_KEY_VALUE_THRESHOLD " = %zu\n", config->klog_value_threshold); + + const char *compression_str = TDB_INI_VAL_COMPRESS_NONE; + switch (config->compression_algorithm) + { + case TDB_COMPRESS_NONE: + compression_str = TDB_INI_VAL_COMPRESS_NONE; + break; + case 
TDB_COMPRESS_LZ4: + compression_str = TDB_INI_VAL_COMPRESS_LZ4; + break; + case TDB_COMPRESS_LZ4_FAST: + compression_str = TDB_INI_VAL_COMPRESS_LZ4_FAST; + break; + case TDB_COMPRESS_ZSTD: + compression_str = TDB_INI_VAL_COMPRESS_ZSTD; + break; +#ifndef __sun + case TDB_COMPRESS_SNAPPY: + compression_str = TDB_INI_VAL_COMPRESS_SNAPPY; + break; +#endif + } + fprintf(fp, TDB_INI_KEY_COMPRESSION_ALGORITHM " = %s\n", compression_str); + + fprintf(fp, TDB_INI_KEY_ENABLE_BLOOM_FILTER " = %d\n", config->enable_bloom_filter); + fprintf(fp, TDB_INI_KEY_BLOOM_FPR " = %f\n", config->bloom_fpr); + fprintf(fp, TDB_INI_KEY_ENABLE_BLOCK_INDEXES " = %d\n", config->enable_block_indexes); + fprintf(fp, TDB_INI_KEY_INDEX_SAMPLE_RATIO " = %d\n", config->index_sample_ratio); + fprintf(fp, TDB_INI_KEY_BLOCK_INDEX_PREFIX_LEN " = %d\n", config->block_index_prefix_len); + fprintf(fp, TDB_INI_KEY_SYNC_MODE " = %d\n", config->sync_mode); + fprintf(fp, TDB_INI_KEY_SYNC_INTERVAL_US " = %" PRIu64 "\n", config->sync_interval_us); + fprintf(fp, TDB_INI_KEY_SKIP_LIST_MAX_LEVEL " = %d\n", config->skip_list_max_level); + fprintf(fp, TDB_INI_KEY_SKIP_LIST_PROBABILITY " = %f\n", config->skip_list_probability); + fprintf(fp, TDB_INI_KEY_DEFAULT_ISOLATION_LEVEL " = %d\n", config->default_isolation_level); + fprintf(fp, TDB_INI_KEY_L1_FILE_COUNT_TRIGGER " = %d\n", config->l1_file_count_trigger); + fprintf(fp, TDB_INI_KEY_L0_QUEUE_STALL_THRESHOLD " = %d\n", config->l0_queue_stall_threshold); + fprintf(fp, TDB_INI_KEY_TOMBSTONE_DENSITY_TRIGGER " = %f\n", config->tombstone_density_trigger); + fprintf(fp, TDB_INI_KEY_TOMBSTONE_DENSITY_MIN_ENTRIES " = %" PRIu64 "\n", + config->tombstone_density_min_entries); + fprintf(fp, TDB_INI_KEY_MIN_DISK_SPACE " = %" PRIu64 "\n", config->min_disk_space); + fprintf(fp, TDB_INI_KEY_USE_BTREE " = %d\n", config->use_btree); + fprintf(fp, TDB_INI_KEY_OBJECT_LAZY_COMPACTION " = %d\n", config->object_lazy_compaction); + fprintf(fp, TDB_INI_KEY_OBJECT_PREFETCH_COMPACTION " = 
%d\n", + config->object_prefetch_compaction); + + fprintf(fp, TDB_INI_KEY_COMPARATOR_NAME " = %s\n", config->comparator_name); + if (config->comparator_ctx_str[0] != '\0') + { + fprintf(fp, TDB_INI_KEY_COMPARATOR_CTX_STR " = %s\n", config->comparator_ctx_str); + } + + fflush(fp); + const int fd = tdb_fileno(fp); + if (fd >= 0) + { + fsync(fd); + } + fclose(fp); + + const char *last_sep = strrchr(ini_file, PATH_SEPARATOR[0]); + if (last_sep) + { + char parent_dir[TDB_MAX_PATH_LEN]; + const size_t parent_len = last_sep - ini_file; + if (parent_len < TDB_MAX_PATH_LEN) + { + memcpy(parent_dir, ini_file, parent_len); + parent_dir[parent_len] = '\0'; + tdb_sync_directory(parent_dir); + } + } + + return TDB_SUCCESS; +} + +int tidesdb_cf_update_runtime_config(tidesdb_column_family_t *cf, + const tidesdb_column_family_config_t *new_config, + const int persist_to_disk) +{ + if (!cf || !new_config) return TDB_ERR_INVALID_ARGS; + + cf->config.enable_bloom_filter = new_config->enable_bloom_filter; + cf->config.bloom_fpr = new_config->bloom_fpr; + cf->config.enable_block_indexes = new_config->enable_block_indexes; + cf->config.index_sample_ratio = new_config->index_sample_ratio; + cf->config.block_index_prefix_len = new_config->block_index_prefix_len; + cf->config.compression_algorithm = new_config->compression_algorithm; + cf->config.write_buffer_size = new_config->write_buffer_size; + cf->config.level_size_ratio = new_config->level_size_ratio; + cf->config.min_levels = new_config->min_levels; + cf->config.dividing_level_offset = new_config->dividing_level_offset; + cf->config.sync_mode = new_config->sync_mode; + cf->config.sync_interval_us = new_config->sync_interval_us; + cf->config.klog_value_threshold = new_config->klog_value_threshold; + cf->config.default_isolation_level = new_config->default_isolation_level; + cf->config.skip_list_max_level = new_config->skip_list_max_level; + cf->config.skip_list_probability = new_config->skip_list_probability; + 
cf->config.l1_file_count_trigger = new_config->l1_file_count_trigger; + cf->config.l0_queue_stall_threshold = new_config->l0_queue_stall_threshold; + cf->config.tombstone_density_trigger = new_config->tombstone_density_trigger; + cf->config.tombstone_density_min_entries = new_config->tombstone_density_min_entries; + cf->config.min_disk_space = new_config->min_disk_space; + cf->config.commit_hook_fn = new_config->commit_hook_fn; + cf->config.commit_hook_ctx = new_config->commit_hook_ctx; + + tidesdb_memtable_t *mt = atomic_load_explicit(&cf->active_memtable, memory_order_acquire); + if (mt && mt->wal) + { + block_manager_set_sync_mode(mt->wal, new_config->sync_mode); + } + + if (persist_to_disk) + { + char config_path[MAX_FILE_PATH_LENGTH]; + snprintf(config_path, sizeof(config_path), + "%s" PATH_SEPARATOR + "%s" PATH_SEPARATOR TDB_COLUMN_FAMILY_CONFIG_NAME TDB_COLUMN_FAMILY_CONFIG_EXT, + cf->db->config.db_path, cf->name); + + const int result = tidesdb_cf_config_save_to_ini(config_path, cf->name, &cf->config); + if (result != TDB_SUCCESS) + { + return result; + } + } + + return TDB_SUCCESS; +} + +int tidesdb_cf_set_commit_hook(tidesdb_column_family_t *cf, tidesdb_commit_hook_fn fn, void *ctx) +{ + if (!cf) return TDB_ERR_INVALID_ARGS; + + cf->config.commit_hook_fn = fn; + cf->config.commit_hook_ctx = ctx; + + return TDB_SUCCESS; +} + +/** + * compact_block_index_create + * creates a new block index for fast key-to-block lookups in sstables + * @param initial_capacity initial number of index entries + * @param prefix_len length of key prefixes to store + * @param comparator comparator function for key ordering + * @param comparator_ctx context for comparator + * @return new block index, or NULL on failure + */ +static tidesdb_block_index_t *compact_block_index_create(uint32_t initial_capacity, + uint8_t prefix_len, + const tidesdb_comparator_fn comparator, + void *comparator_ctx) +{ + if (initial_capacity == 0) initial_capacity = TDB_INITIAL_BLOCK_INDEX_CAPACITY; + 
if (prefix_len < TDB_BLOCK_INDEX_PREFIX_MIN) prefix_len = TDB_DEFAULT_BLOCK_INDEX_PREFIX_LEN; + + tidesdb_block_index_t *index = calloc(1, sizeof(tidesdb_block_index_t)); + if (!index) return NULL; + + index->min_key_prefixes = malloc(initial_capacity * prefix_len); + index->max_key_prefixes = malloc(initial_capacity * prefix_len); + index->file_positions = malloc(initial_capacity * sizeof(uint64_t)); + + if (!index->min_key_prefixes || !index->max_key_prefixes || !index->file_positions) + { + compact_block_index_free(index); + return NULL; + } + + index->capacity = initial_capacity; + index->count = 0; + index->prefix_len = prefix_len; + index->comparator = comparator; + index->comparator_ctx = comparator_ctx; + + return index; +} + +/** + * compact_block_index_serialize + * serializes a block index to a byte buffer for writing to disk + * @param index block index to serialize + * @param out_size output parameter for serialized size + * @return serialized data (caller must free), or NULL on failure + */ +static uint8_t *compact_block_index_serialize(const tidesdb_block_index_t *index, size_t *out_size) +{ + if (!index || !out_size) return NULL; + + /** header + * count (4) + prefix_len (1) + file_positions (varint) + min/max prefixes */ + const size_t max_size = sizeof(uint32_t) + sizeof(uint8_t) + + index->count * 10 + /* file_positions (varint) */ + index->count * index->prefix_len * 2; /* min + max prefixes */ + + uint8_t *data = malloc(max_size); + if (!data) return NULL; + + uint8_t *ptr = data; + + /** header + * count + prefix_len */ + encode_uint32_le_compat(ptr, index->count); + ptr += sizeof(uint32_t); + *ptr++ = index->prefix_len; + + /* delta encode + varint compress file_positions */ + if (index->count > 0) + { + /* first file position stored as-is */ + ptr += encode_varint(ptr, index->file_positions[0]); + + /* remaining file positions stored as deltas */ + for (uint32_t i = 1; i < index->count; i++) + { + const uint64_t delta = 
index->file_positions[i] - index->file_positions[i - 1]; + ptr += encode_varint(ptr, delta); + } + } + + const size_t prefix_bytes = index->count * index->prefix_len; + memcpy(ptr, index->min_key_prefixes, prefix_bytes); + ptr += prefix_bytes; + memcpy(ptr, index->max_key_prefixes, prefix_bytes); + ptr += prefix_bytes; + + /* we calc actual size and shrink buffer */ + const size_t actual_size = ptr - data; + uint8_t *final_data = realloc(data, actual_size); + if (!final_data) + { + /* realloc failed, but the original data is still valid */ + *out_size = actual_size; + return data; + } + + *out_size = actual_size; + return final_data; +} + +/** + * compact_block_index_deserialize + * deserializes a block index from a byte buffer read from disk + * @param data serialized data + * @param data_size size of serialized data + * @return deserialized block index, or NULL on failure + */ +static tidesdb_block_index_t *compact_block_index_deserialize(const uint8_t *data, + const size_t data_size) +{ + if (!data || data_size < sizeof(uint32_t) + sizeof(uint8_t)) return NULL; + + const uint8_t *ptr = data; + const uint8_t *end = data + data_size; + + /* we read header + * count + prefix_len */ + const uint32_t count = decode_uint32_le_compat(ptr); + ptr += sizeof(uint32_t); + const uint8_t prefix_len = *ptr++; + + if (prefix_len < TDB_BLOCK_INDEX_PREFIX_MIN) + { + TDB_DEBUG_LOG( + TDB_LOG_WARN, + "Block index deserialization failed with invalid prefix_len=%u (must be %d-%d)", + prefix_len, TDB_BLOCK_INDEX_PREFIX_MIN, TDB_BLOCK_INDEX_PREFIX_MAX); + return NULL; /* invalid format */ + } + + if (count > TDB_BLOCK_INDEX_MAX_COUNT) + { + TDB_DEBUG_LOG(TDB_LOG_WARN, "Block index deserialization failed with unreasonable count=%u", + count); + return NULL; + } + + tidesdb_block_index_t *index = calloc(1, sizeof(tidesdb_block_index_t)); + if (!index) return NULL; + + /* we handle empty index (count = 0) */ + if (count == 0) + { + index->count = 0; + index->capacity = 0; + 
index->prefix_len = prefix_len; + index->min_key_prefixes = NULL; + index->max_key_prefixes = NULL; + index->file_positions = NULL; + return index; + } + + index->min_key_prefixes = malloc(count * prefix_len); + index->max_key_prefixes = malloc(count * prefix_len); + index->file_positions = malloc(count * sizeof(uint64_t)); + + if (!index->min_key_prefixes || !index->max_key_prefixes || !index->file_positions) + { + compact_block_index_free(index); + return NULL; + } + + /* we decode file_positions (delta-encoded varints) */ + if (count > 0) + { + uint64_t value; + + int bytes_read = decode_varint(ptr, &value, (int)(end - ptr)); + if (bytes_read < 0) goto error; + index->file_positions[0] = value; + ptr += bytes_read; + + /* remaining file positions (deltas) */ + for (uint32_t i = 1; i < count; i++) + { + uint64_t delta; + bytes_read = decode_varint(ptr, &delta, (int)(end - ptr)); + if (bytes_read < 0) goto error; + ptr += bytes_read; + index->file_positions[i] = index->file_positions[i - 1] + delta; + } + } + + const size_t prefix_bytes = count * prefix_len; + if (ptr + prefix_bytes > end) goto error; + memcpy(index->min_key_prefixes, ptr, prefix_bytes); + ptr += prefix_bytes; + + if (ptr + prefix_bytes > end) goto error; + memcpy(index->max_key_prefixes, ptr, prefix_bytes); + ptr += prefix_bytes; + + index->count = count; + index->capacity = count; + index->prefix_len = prefix_len; + index->comparator = NULL; + index->comparator_ctx = NULL; + + return index; + +error: + compact_block_index_free(index); + return NULL; +} + +/** + * compact_block_index_add + * add a new entry to the block index + * @param index block index + * @param min_key minimum key in block + * @param min_key_len length of minimum key + * @param max_key maximum key in block + * @param max_key_len length of maximum key + * @param file_position position of block in file + * @return 0 on success, -1 on error + */ +static int compact_block_index_add(tidesdb_block_index_t *index, const uint8_t 
*min_key, + const size_t min_key_len, const uint8_t *max_key, + const size_t max_key_len, const uint64_t file_position) +{ + if (!index || !min_key || !max_key) return -1; + + if (index->count >= index->capacity) + { + const uint32_t new_capacity = index->capacity * 2; + + /** we must handle realloc failures carefully to avoid memory leaks + * if any realloc fails, we keep the original pointers intact */ + uint8_t *new_min = realloc(index->min_key_prefixes, new_capacity * index->prefix_len); + if (!new_min) return -1; + index->min_key_prefixes = new_min; + + uint8_t *new_max = realloc(index->max_key_prefixes, new_capacity * index->prefix_len); + if (!new_max) return -1; + index->max_key_prefixes = new_max; + + uint64_t *new_positions = realloc(index->file_positions, new_capacity * sizeof(uint64_t)); + if (!new_positions) return -1; + index->file_positions = new_positions; + + index->capacity = new_capacity; + } + + const size_t min_copy_len = (min_key_len < index->prefix_len) ? min_key_len : index->prefix_len; + const size_t max_copy_len = (max_key_len < index->prefix_len) ? max_key_len : index->prefix_len; + + uint8_t *min_dest = index->min_key_prefixes + (index->count * index->prefix_len); + uint8_t *max_dest = index->max_key_prefixes + (index->count * index->prefix_len); + + memcpy(min_dest, min_key, min_copy_len); + if (min_copy_len < index->prefix_len) + { + memset(min_dest + min_copy_len, 0, index->prefix_len - min_copy_len); + } + + memcpy(max_dest, max_key, max_copy_len); + if (max_copy_len < index->prefix_len) + { + memset(max_dest + max_copy_len, 0, index->prefix_len - max_copy_len); + } + + index->file_positions[index->count] = file_position; + index->count++; + + return 0; +} + +/** + * compact_block_index_find_slot + * finds the leftmost block that could contain the given key using binary search + * + * the block index is lossy, it stores only the first prefix_len bytes of each + * block's min/max key. 
when several keys share a prefix longer than prefix_len + * they can span multiple klog blocks that all have identical min/max prefixes. + * returning the rightmost prefix match would overshoot the block that actually + * holds the key, so this finds the leftmost block whose max prefix is >= the + * search prefix -- the first block that could hold the key or a key after it. + * callers needing a definitive answer scan the prefix-colliding run forward + * from here via compact_block_index_run_length. + * + * @param index the block index to search + * @param key the search key + * @param key_len length of the search key + * @param slot output parameter for the found slot number + * @return 0 on success, -1 if no suitable slot found + */ +static int compact_block_index_find_slot(const tidesdb_block_index_t *index, const uint8_t *key, + const size_t key_len, int64_t *slot) +{ + if (!index || !key || index->count == 0 || !slot) return -1; + + uint8_t search_prefix[TDB_BLOCK_INDEX_PREFIX_MAX]; + const size_t copy_len = (key_len < index->prefix_len) ? 
key_len : index->prefix_len; + memcpy(search_prefix, key, copy_len); + if (copy_len < index->prefix_len) + { + memset(search_prefix + copy_len, 0, index->prefix_len - copy_len); + } + + int64_t left = 0; + int64_t right = (int64_t)index->count - 1; + int64_t candidate = -1; + + while (left <= right) + { + const int64_t mid = left + (right - left) / 2; + const uint8_t *mid_max_prefix = index->max_key_prefixes + (mid * index->prefix_len); + + int cmp_max; + if (index->comparator) + { + cmp_max = index->comparator(search_prefix, index->prefix_len, mid_max_prefix, + index->prefix_len, index->comparator_ctx); + } + else + { + cmp_max = memcmp(search_prefix, mid_max_prefix, index->prefix_len); + } + + if (cmp_max <= 0) + { + /* search_prefix <= max_prefix[mid] -- mid could hold the key; keep it + * and look left for an earlier block that also could */ + candidate = mid; + right = mid - 1; + } + else + { + /* search_prefix > max_prefix[mid] -- the key sorts past this block */ + left = mid + 1; + } + } + + /* if no block has max_prefix >= search_prefix the key sorts past every + * indexed block; fall back to the last block so iterators position at the + * end and point lookups scan it and find nothing */ + *slot = (candidate >= 0) ? 
candidate : (int64_t)index->count - 1; + return 0; +} + +/** + * compact_block_index_find_predecessor + * thin wrapper over compact_block_index_find_slot that returns the file + * position of the leftmost block that could contain the key + * + * @param index the block index to search + * @param key the search key + * @param key_len length of the search key + * @param file_position output parameter for the found block file position + * @return 0 on success, -1 if no suitable block found + */ +static int compact_block_index_find_predecessor(const tidesdb_block_index_t *index, + const uint8_t *key, const size_t key_len, + uint64_t *file_position) +{ + int64_t slot = 0; + if (compact_block_index_find_slot(index, key, key_len, &slot) != 0) return -1; + *file_position = index->file_positions[slot]; + return 0; +} + +/** + * compact_block_index_run_length + * counts the prefix-colliding run starting at start_slot -- the number of + * consecutive blocks whose min prefix is <= the search prefix. because the + * prefix index is lossy a definitive point lookup must scan every block in + * this run, not just the first, since the index cannot tell which one holds + * the key. returns at least 1 so the caller always scans the starting block. + * + * @param index the block index to search + * @param key the search key + * @param key_len length of the search key + * @param start_slot leftmost candidate slot from compact_block_index_find_slot + * @return number of consecutive candidate blocks, 0 if start_slot is invalid + */ +static uint32_t compact_block_index_run_length(const tidesdb_block_index_t *index, + const uint8_t *key, const size_t key_len, + const int64_t start_slot) +{ + if (!index || !key || start_slot < 0 || (uint32_t)start_slot >= index->count) return 0; + + uint8_t search_prefix[TDB_BLOCK_INDEX_PREFIX_MAX]; + const size_t copy_len = (key_len < index->prefix_len) ? 
key_len : index->prefix_len; + memcpy(search_prefix, key, copy_len); + if (copy_len < index->prefix_len) + { + memset(search_prefix + copy_len, 0, index->prefix_len - copy_len); + } + + uint32_t run = 0; + for (uint32_t s = (uint32_t)start_slot; s < index->count; s++) + { + const uint8_t *min_prefix = index->min_key_prefixes + (s * index->prefix_len); + int cmp_min; + if (index->comparator) + { + cmp_min = index->comparator(min_prefix, index->prefix_len, search_prefix, + index->prefix_len, index->comparator_ctx); + } + else + { + cmp_min = memcmp(min_prefix, search_prefix, index->prefix_len); + } + + /* min_prefix > search_prefix -- the key sorts before this block, so it + * cannot be in this block or any later one; the run ends here */ + if (cmp_min > 0) break; + run++; + } + + /* a gap (start_slot's min prefix already past the search prefix) still scans + * one block so the in-block search can report not found consistently */ + if (run == 0) run = 1; + return run; +} + +/** + * compact_block_index_free + * free a block index + * @param index block index to free + */ +static void compact_block_index_free(tidesdb_block_index_t *index) +{ + if (!index) return; + free(index->min_key_prefixes); + free(index->max_key_prefixes); + free(index->file_positions); + free(index); +} + +#ifdef TDB_ENABLE_READ_PROFILING + +/** + * tidesdb_get_read_stats + * get read statistics for the passed database + * @param db database to query + * @param stats pointer to read stats structure + * @return 0 on success, -1 on error + */ +int tidesdb_get_read_stats(tidesdb_t *db, tidesdb_read_stats_t *stats) +{ + if (!db || !stats) return TDB_ERR_INVALID_ARGS; + + stats->total_reads = atomic_load(&db->read_stats.total_reads); + stats->memtable_hits = atomic_load(&db->read_stats.memtable_hits); + stats->immutable_hits = atomic_load(&db->read_stats.immutable_hits); + stats->sstable_hits = atomic_load(&db->read_stats.sstable_hits); + stats->levels_searched = 
/**
 * tidesdb_print_read_stats
 * print read statistics for passed database
 *
 * takes a snapshot of the counters through tidesdb_get_read_stats, derives hit
 * rates and per-read averages, and writes a formatted report with printf.
 * when the database has a clock cache its statistics are appended.
 *
 * @param db database to query (a NULL db prints nothing)
 */
void tidesdb_print_read_stats(tidesdb_t *db)
{
    if (!db) return;

    /* the return value is not checked: both arguments are non-NULL here, which
     * is the only failure tidesdb_get_read_stats reports */
    tidesdb_read_stats_t stats;
    tidesdb_get_read_stats(db, &stats);

    /* every ratio below is guarded so an idle database reports 0.0 instead of
     * dividing by zero */
    uint64_t total_block_accesses = stats.cache_block_hits + stats.cache_block_misses;
    double cache_hit_rate =
        total_block_accesses > 0 ? (100.0 * stats.cache_block_hits / total_block_accesses) : 0.0;
    double bloom_hit_rate =
        stats.bloom_checks > 0 ? (100.0 * stats.bloom_hits / stats.bloom_checks) : 0.0;
    double avg_levels_per_read =
        stats.total_reads > 0 ? ((double)stats.levels_searched / stats.total_reads) : 0.0;
    double avg_sstables_per_read =
        stats.total_reads > 0 ? ((double)stats.sstables_checked / stats.total_reads) : 0.0;
    double avg_blocks_per_read =
        stats.total_reads > 0 ? ((double)stats.blocks_read / stats.total_reads) : 0.0;
    printf("\n*---------------------- TidesDB Read Profiling Stats ----------------------*\n");
    printf("Total Reads: %" PRIu64 "\n", stats.total_reads);
    /* where each read was satisfied, as a share of all reads */
    printf("\nRead Hit Location:\n");
    printf(" Memtable hits: %" PRIu64 " (%.1f%%)\n", stats.memtable_hits,
           stats.total_reads > 0 ? 100.0 * stats.memtable_hits / stats.total_reads : 0.0);
    printf(" Immutable hits: %" PRIu64 " (%.1f%%)\n", stats.immutable_hits,
           stats.total_reads > 0 ? 100.0 * stats.immutable_hits / stats.total_reads : 0.0);
    printf(" SSTable hits: %" PRIu64 " (%.1f%%)\n", stats.sstable_hits,
           stats.total_reads > 0 ? 100.0 * stats.sstable_hits / stats.total_reads : 0.0);
    printf("\nSSTable Search:\n");
    printf(" Levels searched: %" PRIu64 " (avg: %.2f per read)\n", stats.levels_searched,
           avg_levels_per_read);
    printf(" SSTables checked: %" PRIu64 " (avg: %.2f per read)\n", stats.sstables_checked,
           avg_sstables_per_read);
    printf(" Bloom checks: %" PRIu64 "\n", stats.bloom_checks);
    printf(" Bloom hits: %" PRIu64 " (%.1f%%)\n", stats.bloom_hits, bloom_hit_rate);
    printf("\nBlock-Level Cache:\n");
    printf(" Cache hits: %" PRIu64 "\n", stats.cache_block_hits);
    printf(" Cache misses: %" PRIu64 "\n", stats.cache_block_misses);
    printf(" Cache hit rate: %.1f%%\n", cache_hit_rate);
    printf(" Blocks read: %" PRIu64 " (avg: %.2f per read)\n", stats.blocks_read,
           avg_blocks_per_read);
    printf(" Disk reads: %" PRIu64 "\n", stats.disk_reads);

    /* clock cache section is printed only when the database has a cache */
    if (db->clock_cache)
    {
        clock_cache_stats_t cache_stats;
        clock_cache_get_stats(db->clock_cache, &cache_stats);
        printf("\nClock Cache Stats:\n");
        printf(" Total entries: %zu\n", cache_stats.total_entries);
        printf(" Total bytes: %.2f MB\n", cache_stats.total_bytes / (1024.0 * 1024.0));
        printf(" Global hits: %" PRIu64 "\n", cache_stats.hits);
        printf(" Global misses: %" PRIu64 "\n", cache_stats.misses);
        /* hit_rate is scaled by 100 for display, so it is presumably stored as
         * a 0..1 fraction -- TODO confirm against clock_cache_get_stats */
        printf(" Global hit rate: %.1f%%\n", cache_stats.hit_rate * 100.0);
    }
    printf("*--------------------------------------------------------------------------*\n\n");
}
atomic_store(&db->read_stats.levels_searched, 0); + atomic_store(&db->read_stats.sstables_checked, 0); + atomic_store(&db->read_stats.bloom_checks, 0); + atomic_store(&db->read_stats.bloom_hits, 0); + atomic_store(&db->read_stats.blocks_read, 0); + atomic_store(&db->read_stats.cache_block_hits, 0); + atomic_store(&db->read_stats.cache_block_misses, 0); + atomic_store(&db->read_stats.disk_reads, 0); +} +#endif + +int tidesdb_sync_wal(tidesdb_column_family_t *cf) +{ + if (!cf || !cf->db) return TDB_ERR_INVALID_ARGS; + + /* we load active memtable with refcount protection to safely access its WAL */ + tidesdb_memtable_t *mt = NULL; + if (!tidesdb_active_memtable_try_ref(&cf->active_mt_readers, &cf->active_memtable, &mt)) + { + /* the memtable was rotated, we must reload */ + if (!tidesdb_active_memtable_try_ref(&cf->active_mt_readers, &cf->active_memtable, &mt)) + { + return TDB_ERR_IO; + } + } + + int result = TDB_SUCCESS; + if (mt->wal) + { + if (block_manager_escalate_fsync(mt->wal) != 0) + { + result = TDB_ERR_IO; + } + } + + tidesdb_immutable_memtable_unref(mt); + return result; +} + +void tidesdb_free(void *ptr) +{ + if (!ptr) return; + free(ptr); +} diff --git a/storage/tidesdb/libtidesdb/src/tidesdb.h b/storage/tidesdb/libtidesdb/src/tidesdb.h new file mode 100644 index 0000000000000..3a539e6606157 --- /dev/null +++ b/storage/tidesdb/libtidesdb/src/tidesdb.h @@ -0,0 +1,1978 @@ +/** + * + * Copyright (C) TidesDB + * + * Original Author: Alex Gaetano Padula + * + * Licensed under the Mozilla Public License, v. 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.mozilla.org/en-US/MPL/2.0/ + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __TIDESDB_H__ +#define __TIDESDB_H__ + +#include "alloc.h" +#include "block_manager.h" +#include "bloom_filter.h" +#include "btree.h" +#include "clock_cache.h" +#include "compat.h" +#include "compress.h" +#include "ini.h" +#include "local_cache.h" +#include "manifest.h" +#include "objstore.h" +#include "queue.h" +#include "skip_list.h" + +/* logging levels for TDB_DEBUG_LOG */ +typedef enum +{ + TDB_LOG_DEBUG = 0, /* general debugging info (most verbose) */ + TDB_LOG_INFO = 1, /* informational messages */ + TDB_LOG_WARN = 2, /* warnings (e.g., "Retry attempt N"..) */ + TDB_LOG_ERROR = 3, /* errors (e.g., "Failed to open file", "Invalid checksum") */ + TDB_LOG_FATAL = 4, /* fatal errors (e.g., "Corruption detected", "Out of memory") */ + TDB_LOG_NONE = 99 /* disable all logging */ +} tidesdb_log_level_t; + +extern _Atomic(int) _tidesdb_log_level; /* minimum level to log (default is TDB_LOG_DEBUG); + * atomic -- the TDB_DEBUG_LOG macro gates on it + * lock-free while tidesdb_open may rewrite it */ +extern FILE *_tidesdb_log_file; /* log file pointer (NULL = stderr, non-NULL = file) */ +extern size_t _tidesdb_log_truncate; /* truncate log file at this size (0 = no truncation) */ +extern char _tidesdb_log_path[MAX_FILE_PATH_LENGTH]; /* path to log file for truncation */ + +/** + * tidesdb_log_write + * writes a log message to the configured log output (stderr or log file) + * @param level log level (TDB_LOG_DEBUG, TDB_LOG_INFO, TDB_LOG_WARN, TDB_LOG_ERROR, TDB_LOG_FATAL) + * @param file source file name (typically __FILE__) + * @param line source line number (typically __LINE__) + * @param fmt printf-style format string + * @param ... format arguments + */ +void tidesdb_log_write(int level, const char *file, int line, const char *fmt, ...); + +#define TDB_DEBUG_LOG(level, fmt, ...) 
\ + do \ + { \ + if ((level) >= _tidesdb_log_level && _tidesdb_log_level != TDB_LOG_NONE) \ + tidesdb_log_write((level), __FILE__, __LINE__, fmt, ##__VA_ARGS__); \ + } while (0) + +/** + * tidesdb_isolation_level_t + * isolation levels for transactions + * + * tdb_isolation_read_uncommitted (0) + * -- sees all versions including uncommitted changes (dirty reads) + * -- no snapshot isolation, uses uint64_max to bypass filtering + * -- fastest but allows dirty reads, non-repeatable reads, and phantom reads + * -- no conflict detection + * -- good for analytics on non-critical data where performance is paramount + * + * tdb_isolation_read_committed (1) + * -- refreshes snapshot on each read operation + * -- prevents dirty reads by only seeing committed data + * -- allows non-repeatable reads (same key may return different values) + * -- allows phantom reads (range queries may see different rows) + * -- no conflict detection + * -- good default for most applications, good balance of consistency and performance + * + * tdb_isolation_repeatable_read (2) + * -- consistent snapshot taken at transaction start + * -- prevents dirty reads and non-repeatable reads for point reads + * -- allows phantom reads (new rows can appear in range queries) + * -- uses read-write conflict detection only + * -- aborts if a read key was modified by another transaction + * -- good for applications requiring consistent reads but tolerating some write conflicts + * + * tdb_isolation_snapshot (3) + * -- consistent snapshot with first-committer-wins semantics + * -- prevents dirty reads and non-repeatable reads + * -- prevents lost updates via write-write conflict detection + * -- allows write skew anomaly (two txns read overlapping data and write disjoint sets) + * -- no read set tracking, only write-write conflict detection + * -- aborts only on write-write conflict + * -- good for financial transactions, inventory management + * + * tdb_isolation_serializable (4) + * -- full serializability 
using ssi (serializable snapshot isolation) + * -- prevents dirty reads, non-repeatable reads, and phantom reads + * -- uses read-write, write-write, and rw-antidependency conflict detection + * -- tracks active transactions for dangerous structure detection + * -- highest isolation but lowest concurrency + * -- great for critical transactions requiring full acid guarantees + */ +typedef enum +{ + TDB_ISOLATION_READ_UNCOMMITTED = 0, + TDB_ISOLATION_READ_COMMITTED = 1, + TDB_ISOLATION_REPEATABLE_READ = 2, + TDB_ISOLATION_SNAPSHOT = 3, + TDB_ISOLATION_SERIALIZABLE = 4 +} tidesdb_isolation_level_t; + +/* error codes */ +#define TDB_SUCCESS 0 +#define TDB_ERR_MEMORY -1 +#define TDB_ERR_INVALID_ARGS -2 +#define TDB_ERR_NOT_FOUND -3 +#define TDB_ERR_IO -4 +#define TDB_ERR_CORRUPTION -5 +#define TDB_ERR_EXISTS -6 +#define TDB_ERR_CONFLICT -7 +#define TDB_ERR_TOO_LARGE -8 +#define TDB_ERR_MEMORY_LIMIT -9 +#define TDB_ERR_INVALID_DB -10 +#define TDB_ERR_UNKNOWN -11 +#define TDB_ERR_LOCKED -12 +#define TDB_ERR_READONLY -13 +/* system is at capacity and the operation gave up after the backpressure + * stall hit its no-progress budget. 
transient; callers should retry */ +#define TDB_ERR_BUSY -14 + +#ifdef TDB_ENABLE_READ_PROFILING +/** + * tidesdb_read_stats_t + * read profiling statistics (only available when TDB_ENABLE_READ_PROFILING is defined) + * @param total_reads total number of read operations + * @param memtable_hits reads satisfied from active memtable + * @param immutable_hits reads satisfied from immutable memtables + * @param sstable_hits reads satisfied from sstables on disk + * @param levels_searched total levels searched across all reads + * @param sstables_checked total sstables checked across all reads + * @param bloom_checks total bloom filter checks performed + * @param bloom_hits bloom filter checks that returned positive + * @param blocks_read total klog blocks read from disk or cache + * @param cache_block_hits block reads satisfied from block cache + * @param cache_block_misses block reads that missed the cache + * @param disk_reads total raw disk reads performed + */ +typedef struct +{ + _Atomic(uint64_t) total_reads; + _Atomic(uint64_t) memtable_hits; + _Atomic(uint64_t) immutable_hits; + _Atomic(uint64_t) sstable_hits; + _Atomic(uint64_t) levels_searched; + _Atomic(uint64_t) sstables_checked; + _Atomic(uint64_t) bloom_checks; + _Atomic(uint64_t) bloom_hits; + _Atomic(uint64_t) blocks_read; + _Atomic(uint64_t) cache_block_hits; + _Atomic(uint64_t) cache_block_misses; + _Atomic(uint64_t) disk_reads; +} tidesdb_read_stats_t; +#endif + +/* similar to relational database systems like oracle, where table and column names are limited to + * 128 characters */ +#define TDB_MAX_CF_NAME_LEN 128 + +/** + * tidesdb_sync_mode_t + * synchronization modes + */ +typedef enum +{ + TDB_SYNC_NONE, /* writes are not synced on every write, only once say sstable files are + completed */ + TDB_SYNC_FULL, /* writes are synced on every write, background and foreground wal and sstable + files */ + TDB_SYNC_INTERVAL, /* writes are synced on every write (background) all files, + foreground wal 
/* default configuration values */
#define TDB_DEFAULT_WRITE_BUFFER_SIZE (64 * 1024 * 1024)
#define TDB_DEFAULT_LEVEL_SIZE_RATIO  10
/* a cf tree grows organically -- L = log_T(N/B). start with one disk level and
 * let add_level deepen it, rather than pre-allocating empty levels */
#define TDB_DEFAULT_MIN_LEVELS 1
/* generalized Spooky sets the dividing level X to L-2; with
 * X = num_active_levels - 1 - offset that means offset = 1 */
#define TDB_DEFAULT_DIVIDING_LEVEL_OFFSET       1
#define TDB_DEFAULT_COMPACTION_THREAD_POOL_SIZE 2
#define TDB_DEFAULT_FLUSH_THREAD_POOL_SIZE      2
/* pinned to the flush pool size: tidesdb_open clamps max_concurrent_flushes to
 * num_flush_threads and warns when they differ, so the canonical default open
 * (default_config + open) must already agree or it warns on every startup */
#define TDB_DEFAULT_MAX_CONCURRENT_FLUSHES TDB_DEFAULT_FLUSH_THREAD_POOL_SIZE
#define TDB_DEFAULT_BLOOM_FPR              0.01
#define TDB_DEFAULT_KLOG_VALUE_THRESHOLD   512
#define TDB_DEFAULT_INDEX_SAMPLE_RATIO     1
#define TDB_DEFAULT_BLOCK_INDEX_PREFIX_LEN 16
#define TDB_DEFAULT_MIN_DISK_SPACE         (100 * 1024 * 1024)
#if defined(__OpenBSD__)
#define TDB_DEFAULT_MAX_OPEN_SSTABLES 64 /* x2 OpenBSD has lower default fd limits */
#else
#define TDB_DEFAULT_MAX_OPEN_SSTABLES 256 /* x2 each sstable has 2 fds, so really 512 */
#endif
#define TDB_DEFAULT_BLOCK_CACHE_SIZE (64 * 1024 * 1024)
#define TDB_DEFAULT_SYNC_INTERVAL_US 128000
/* 24 MiB; parenthesized so the macro is a single operand wherever it expands
 * (e.g. size / TDB_DEFAULT_LOG_FILE_TRUNCATION) */
#define TDB_DEFAULT_LOG_FILE_TRUNCATION (24 * (1024 * 1024))
bytes + * @param key2 second key to compare + * @param key2_size size of second key in bytes + * @param ctx user-provided context pointer + * @return <0 if key1 < key2, 0 if equal, >0 if key1 > key2 + */ +typedef int (*tidesdb_comparator_fn)(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/** + * tidesdb_commit_op_t + * represents a single operation in a committed transaction batch + * @param key pointer to the key data + * @param key_size size of the key in bytes + * @param value pointer to the value data (NULL for deletes) + * @param value_size size of the value in bytes (0 for deletes) + * @param ttl time-to-live in seconds (0 = no expiration) + * @param is_delete 1 if this is a delete operation, 0 for put + */ +typedef struct tidesdb_commit_op_t +{ + const uint8_t *key; + size_t key_size; + const uint8_t *value; + size_t value_size; + time_t ttl; + int is_delete; +} tidesdb_commit_op_t; + +/** + * tidesdb_commit_hook_fn + * callback invoked synchronously after a transaction commits to a column family + * @param ops array of commit operations + * @param num_ops number of operations in the array + * @param commit_seq commit sequence number + * @param ctx user-provided context + */ +typedef int (*tidesdb_commit_hook_fn)(const tidesdb_commit_op_t *ops, int num_ops, + uint64_t commit_seq, void *ctx); + +/* forward declarations for internal types */ +#define TDB_MAX_LEVELS 32 +#define TDB_IMM_SNAP_SLOTS 2 /* double-buffered RCU snapshot slots (one read, one rebuilt) */ + +typedef struct tidesdb_txn_op_t tidesdb_txn_op_t; +typedef struct tidesdb_merge_heap_t tidesdb_merge_heap_t; +typedef struct tidesdb_kv_pair_t tidesdb_kv_pair_t; +typedef struct tidesdb_commit_status_t tidesdb_commit_status_t; +typedef struct tidesdb_level_t tidesdb_level_t; +typedef struct tidesdb_sstable_t tidesdb_sstable_t; +typedef struct tidesdb_block_index_t tidesdb_block_index_t; +typedef struct tidesdb_memtable_t tidesdb_memtable_t; +typedef 
struct tidesdb_deferred_free_node_t tidesdb_deferred_free_node_t; +typedef struct tidesdb_t tidesdb_t; +typedef struct tidesdb_column_family_t tidesdb_column_family_t; + +/* lock-free immutable memtable snapshot slot + * part of a double-buffered RCU scheme; writers build in inactive slot, + * swap the active index, then wait for old-slot readers to drain. + * items is heap-allocated and grown lazily by the publisher to fit the queue + * depth, so the snapshot never silently truncates -- the immutable queue is + * bounded only by the configured l0_queue_stall_threshold, never by this array. + * @param items heap array of immutable memtables (capacity = cap) + * @param cap allocated capacity of items, in slots + * @param count number of valid items in the array + * @param readers number of active readers on this slot + */ +typedef struct +{ + tidesdb_memtable_t **items; + size_t cap; + _Atomic(size_t) count; + _Atomic(int32_t) readers; +} tidesdb_imm_snap_t; + +/* one column family's persisted unified memtable index + * mirrors a line of the UNIMAP file. 
the index prefixes every key the cf + * writes into the shared unified skip_list and wal, so it must stay stable + * across reopen -- it is keyed on the cf name, the only cf identity that + * survives a crash + * @param name column family name + * @param index the unified_cf_index permanently assigned to that name + */ +typedef struct +{ + char name[TDB_MAX_CF_NAME_LEN]; + uint32_t index; +} tidesdb_unified_cf_index_entry_t; + +typedef struct tidesdb_txn_t tidesdb_txn_t; +typedef struct tidesdb_iter_t tidesdb_iter_t; +typedef struct tidesdb_stats_t tidesdb_stats_t; + +/** + * tidesdb_column_family_config_t + * configuration for a column family + * @param name column family name (set automatically when CF is created/loaded) + * @param write_buffer_size size of write buffer + * @param level_size_ratio ratio of level sizes + * @param min_levels minimum number of levels + * @param dividing_level_offset selects spooky's dividing level X via + * X = num_levels - 1 - offset (X clamped to >= 1). + * offset=0 means X=L-1 (the second-largest level) + * and gives the 2L-spooky variant from the paper + * with transient space-amp bounded by 1/T but the + * highest write-amp. offset=1 means X=L-2 and is + * the paper's recommended generalized tuning, + * trading some ingest throughput for noticeably + * lower compaction write-amp. higher offsets push + * X further up the tree, reducing write-amp again + * but multiplying the number of open files per + * spooky equation 12. default is 1 (X=L-2, the paper's + * generalized tuning, per TDB_DEFAULT_DIVIDING_LEVEL_OFFSET); + * set to 0 (X=L-1) to favor ingest throughput at higher + * write-amp. 
+ * @param klog_value_threshold threshold for klog value + * @param compression_algorithm compression algorithm + * @param enable_bloom_filter enable bloom filter + * @param bloom_fpr bloom filter false positive rate + * @param enable_block_indexes enable block indexes + * @param index_sample_ratio index sample ratio + * @param block_index_prefix_len block index prefix length + * @param sync_mode sync mode + * @param sync_interval_us sync interval in microseconds + * @param comparator_name name of comparator + * @param comparator_ctx_str comparator context string + * @param comparator_fn_cached cached comparator function + * @param comparator_ctx_cached cached comparator context + * @param skip_list_max_level skip list max level + * @param skip_list_probability skip list probability + * @param default_isolation_level default isolation level + * @param min_disk_space minimum free disk space required (bytes) + * @param l1_file_count_trigger trigger for L1 file count, utilized for compaction triggering + * @param l0_queue_stall_threshold threshold for L0 queue stall, utilized for backpressure + * @param tombstone_density_trigger ratio in [0.0, 1.0] above which any single sstable's + * tombstone density (tombstone_count / num_entries) escalates + * compaction priority; 0.0 disables the check (default). + * sstables with fewer than tombstone_density_min_entries are + * ignored to prevent tiny-sstable noise. 
+ * @param tombstone_density_min_entries minimum entry count for an sstable to be considered by + * the density trigger; 0 falls back to the default + * @param use_btree use btree for klog, faster reads depending on workload + * @param commit_hook_fn optional commit hook callback (NULL = disabled, runtime-only) + * @param commit_hook_ctx optional user context passed to commit hook (runtime-only) + * @param object_target_file_size reserved for API compatibility, not used (file_max is derived from + * level geometry per spooky algorithm 2) + * @param object_lazy_compaction lazy compaction flag (1 = less aggressive, 0 = aggressive) + * @param object_prefetch_compaction prefetch compaction flag (1 = download all inputs before merge, + * 0 = stream) + */ +typedef struct tidesdb_column_family_config_t +{ + char name[TDB_MAX_CF_NAME_LEN]; + size_t write_buffer_size; + size_t level_size_ratio; + int min_levels; + int dividing_level_offset; + size_t klog_value_threshold; + compression_algorithm compression_algorithm; + int enable_bloom_filter; + double bloom_fpr; + int enable_block_indexes; + int index_sample_ratio; + int block_index_prefix_len; + int sync_mode; + uint64_t sync_interval_us; + char comparator_name[TDB_MAX_COMPARATOR_NAME]; + char comparator_ctx_str[TDB_MAX_COMPARATOR_CTX]; + skip_list_comparator_fn comparator_fn_cached; + void *comparator_ctx_cached; + int skip_list_max_level; + float skip_list_probability; + tidesdb_isolation_level_t default_isolation_level; + uint64_t min_disk_space; + int l1_file_count_trigger; + int l0_queue_stall_threshold; + double tombstone_density_trigger; + uint64_t tombstone_density_min_entries; + int use_btree; + tidesdb_commit_hook_fn commit_hook_fn; + void *commit_hook_ctx; + size_t object_target_file_size; /* reserved, not used */ + int object_lazy_compaction; + int object_prefetch_compaction; +} tidesdb_column_family_config_t; + +/** + * tidesdb_comparator_entry_t + * comparator registry entry + * @param name unique name for 
the comparator + * @param fn comparator function pointer + * @param ctx_str optional context string (for serialization) + * @param ctx runtime context pointer (reconstructed from ctx_str or set at registration) + */ +typedef struct tidesdb_comparator_entry_t +{ + char name[TDB_MAX_COMPARATOR_NAME]; + tidesdb_comparator_fn fn; + char ctx_str[TDB_MAX_COMPARATOR_CTX]; + void *ctx; +} tidesdb_comparator_entry_t; + +/** + * tidesdb_config_t + * configuration for the database + * @param db_path path to the database + * @param num_flush_threads number of flush threads + * @param num_compaction_threads number of compaction threads + * @param log_level minimum log level to display (TDB_LOG_DEBUG, TDB_LOG_INFO, TDB_LOG_WARN, + * TDB_LOG_ERROR, TDB_LOG_FATAL, TDB_LOG_NONE) + * @param block_cache_size size of clock cache for hot sstable blocks + * @param max_open_sstables maximum number of open sstables + * @param log_to_file flag to determine if debug logging should be written to a file + * @param log_truncation_at size in bytes at which to truncate the log file, 0 = no truncation + * @param max_memory_usage maximum memory usage for the database + * @param unified_memtable flag to determine if unified memtable should be used + * @param unified_memtable_write_buffer_size write buffer size for unified memtable (0 = auto) + * @param unified_memtable_skip_list_max_level skip list max level for unified memtable (0 = default + * 12) + * @param unified_memtable_skip_list_probability skip list probability (0 = default 0.25) + * @param unified_memtable_sync_mode sync mode for unified WAL (default TDB_SYNC_NONE) + * @param unified_memtable_sync_interval_us sync interval for unified WAL (0 = default) + * @param object_store object store instance (NULL = local only, default) + * @param object_store_config object store configuration (NULL = use defaults) + * @param max_concurrent_flushes global semaphore on the number of in-flight memtable flushes + * across all column families. 
bounds total transient memory and + * work-queue depth when many column families flush at once. + * pinned 1:1 to num_flush_threads at open -- a higher cap is + * meaningless because the pool size is the upper bound, a lower + * cap leaves workers idle. 0 means "match num_flush_threads", + * any other mismatch is corrected with a warning. + */ +typedef struct tidesdb_config_t +{ + char *db_path; + int num_flush_threads; + int num_compaction_threads; + tidesdb_log_level_t log_level; + size_t block_cache_size; + size_t max_open_sstables; + int log_to_file; + size_t log_truncation_at; + size_t max_memory_usage; + int unified_memtable; + size_t unified_memtable_write_buffer_size; + int unified_memtable_skip_list_max_level; + float unified_memtable_skip_list_probability; + int unified_memtable_sync_mode; + uint64_t unified_memtable_sync_interval_us; + tidesdb_objstore_t *object_store; + tidesdb_objstore_config_t *object_store_config; + int max_concurrent_flushes; +} tidesdb_config_t; + +/** + * tidesdb_memtable_t + * pairs a skip list and WAL together for better isolation and rotation + * @param skip_list the skip list data structure + * @param wal associated write-ahead log + * @param id unique identifier for this memtable + * @param generation generation counter for memtable rotation + * @param refcount reference count for safe concurrent access + * @param writers count of commit-path writers actively mutating the WAL and skip list + * @param flushed flag indicating if memtable has been flushed to disk + */ +struct tidesdb_memtable_t +{ + skip_list_t *skip_list; + /* _Atomic -- a flush worker closes a rotated memtable's wal and clears this + * while the reaper and sync worker may still read it on the active one */ + _Atomic(block_manager_t *) wal; + uint64_t id; + uint64_t generation; + _Atomic(int) refcount; + _Atomic(int) writers; + _Atomic(int) flushed; +}; + +/** + * tidesdb_column_family_t + * a column family is an independent key-value storage with its own 
config, memtables, WALs, etc. + * @param name name of column family + * @param directory directory for column family + * @param config column family configuration + * @param active_memtable active memtable (paired skip list and WAL) + * @param immutable_memtables queue of immutable memtables being flushed + * @param pending_commits count of in-flight commits + * @param levels fixed array of disk levels (TDB_MAX_LEVELS slots) + * @param num_active_levels number of currently active disk levels + * @param next_sstable_id next sstable id + * @param sstable_layout_version monotonic version for sstable layout changes + * @param is_compacting atomic flag indicating compaction is queued + * @param is_flushing atomic flag indicating flush is queued + * @param flush_pending_count per-CF count of queued + in-flight flush work items + * @param flush_deferred flag set when a flush was skipped at the global concurrent-flush cap + * @param compaction_pending_count per-CF count of queued + in-flight compaction work items + * @param compaction_armed flag set when an enqueue was skipped because is_compacting was 1; the + * worker drains this when its current job ends and self-enqueues a follow-up + * @param immutable_cleanup_counter counter for batched immutable cleanup + * @param marked_for_deletion flag indicating column family is marked for deletion + * @param manifest manifest for column family + * @param db parent database reference + * @param imm_snaps double-buffered lock-free immutable memtable snapshot slots + * @param imm_snap_active index (0 or 1) of the currently active snapshot slot + * @param imm_snap_publish_lock serializes concurrent snapshot publishers + * @param compaction_commit_lock serializes the per-partition commit section (level add + + * manifest commit + layout bump) of a compaction round + * @param active_mt_readers read-side epoch counter for the active_memtable slot; the immutable + * cleanup loop drains it to 0 before freeing a memtable struct + * @param unified_cf_index unified memtable column family index (4-byte big-endian prefix) + * @param last_ceiling_stall_log_sec last-emit timestamp (seconds) for a throttled backpressure + * warning + * @param last_imm_critical_log_sec last-emit timestamp (seconds) for a throttled backpressure + * warning + * @param last_backpressure_log_sec last-emit timestamp (seconds) for a throttled backpressure + * warning + */ +struct tidesdb_column_family_t +{ + char *name; + char *directory; + tidesdb_column_family_config_t config; + _Atomic(tidesdb_memtable_t *) active_memtable; + queue_t *immutable_memtables; + _Atomic(uint64_t) pending_commits; + tidesdb_level_t 
*levels[TDB_MAX_LEVELS]; + _Atomic(int) num_active_levels; + _Atomic(uint64_t) next_sstable_id; + _Atomic(uint64_t) sstable_layout_version; + _Atomic(int) is_compacting; + _Atomic(int) is_flushing; + _Atomic(int) flush_pending_count; + _Atomic(int) flush_deferred; + _Atomic(int) compaction_pending_count; + _Atomic(int) compaction_armed; + _Atomic(int) immutable_cleanup_counter; + _Atomic(int) marked_for_deletion; + tidesdb_manifest_t *manifest; + tidesdb_t *db; + + /* lock-free immutable memtable snapshot (double-buffered RCU) + * readers acquire active slot, use items, release when done + * writers rebuild in inactive slot, swap active, wait for old readers */ + tidesdb_imm_snap_t imm_snaps[TDB_IMM_SNAP_SLOTS]; + _Atomic(int) imm_snap_active; /* 0 or 1, index of current snapshot */ + + /* publishers rebuild the inactive slot then swap -- the RCU design tolerates + * many readers but only one writer, so concurrent publishers (flush worker + * cleanup vs compaction-triggered flush) must serialize on this lock */ + pthread_mutex_t imm_snap_publish_lock; + + /* a single compaction round (serialized per CF by is_compacting) may run its + * partition sub-merges across multiple sub-compaction threads; this serializes the + * per-partition commit section (level add + manifest commit + layout bump) so the + * heavy merge work parallelizes while shared-state mutation stays single-threaded */ + pthread_mutex_t compaction_commit_lock; + + /* read-side epoch for the active_memtable slot. a reader bumps this before + * loading active_memtable + try_ref'ing the loaded pointer, drops it once + * try_ref has finished (success means refcount is now pinned, failure means + * we never touched the struct after the cas). the immutable cleanup loop + * drains this counter to 0 before free()ing a memtable struct so a reader + * holding a stale active_memtable pointer cannot UAF on try_ref's refcount + * read. 
mirrors imm_snap_t.readers but for the direct-active read path */ + _Atomic(int) active_mt_readers; + + /* unified memtable mode -- 4-byte big-endian CF prefix for keys in the shared skip list */ + uint32_t unified_cf_index; + + /* last-emit timestamps (seconds) for throttled backpressure warnings -- see tdb_log_throttle. + * zero-initialized by calloc, so the first event in each category logs immediately. */ + _Atomic(time_t) last_ceiling_stall_log_sec; + _Atomic(time_t) last_imm_critical_log_sec; + _Atomic(time_t) last_backpressure_log_sec; +}; + +/** + * tidesdb_sstable_t + * an immutable sorted string table on disk + * consists of two files a .klog (keys + metadata) and .vlog (large values) + * @param id unique identifier + * @param klog_path path to .klog file + * @param klog_filename cached pointer into klog_path past the last path separator + * @param vlog_path path to .vlog file + * @param cf_name cached column family name for block cache lookups + * @param min_key minimum key in this sstable + * @param min_key_size size of minimum key + * @param max_key maximum key in this sstable + * @param max_key_size size of maximum key + * @param num_entries total number of keys + * @param tombstone_count count of tombstone entries (TDB_KV_FLAG_TOMBSTONE) in this sstable. + * TDB_TOMBSTONE_COUNT_UNKNOWN means a legacy footer pre-dating the field. 
+ * @param num_klog_blocks number of blocks in klog + * @param num_vlog_blocks number of blocks in vlog + * @param klog_data_end_offset offset where data ends in klog (before footer) + * @param klog_size total size of klog file + * @param vlog_size total size of vlog file + * @param max_seq maximum sequence number in this sstable + * @param bloom_filter bloom filter for key existence checks + * @param block_indexes block indexes for fast key lookup + * @param refcount reference count for safe concurrent access + * @param klog_bm klog block manager (opened lazily and published by CAS) + * @param vlog_bm vlog block manager (opened lazily and published by CAS) + * @param config column family configuration + * @param marked_for_deletion flag indicating sstable is marked for deletion + * @param last_access_time last access time for lru eviction + * @param db database handle (for resolving comparators from registry) + * @param use_btree flag indicating sstable uses btree format + * @param btree_root_offset root node offset for btree + * @param btree_first_leaf first leaf offset for btree forward iteration + * @param btree_last_leaf last leaf offset for btree backward iteration + * @param btree_node_count total number of nodes in btree + * @param btree_height height of btree + * @param cached_comparator_fn cached comparator function for fast iteration + * @param cached_comparator_ctx cached comparator context for fast iteration + * @param is_reverse flag indicating sstable is reverse sorted + * @param cache_key_prefix globally unique prefix for btree node cache keys + * @param aux_chunked set when a bloom filter or block index footer blob exceeds the + * single-block chunk size and is written as multiple consecutive blocks; + * legacy/small sstables leave it 0 + * @param bloom_blob_offset explicit offset of the chunked bloom filter blob + * @param bloom_blob_size explicit size of the chunked bloom filter blob + * @param index_blob_offset explicit offset of the chunked block index blob + * @param index_blob_size explicit size of the chunked block index blob + */ +struct tidesdb_sstable_t +{ + uint64_t id; + char *klog_path; + const char *klog_filename; + char *vlog_path; + char cf_name[TDB_MAX_CF_NAME_LEN]; + uint8_t *min_key; + size_t min_key_size; + uint8_t *max_key; + size_t max_key_size; + uint64_t num_entries; + uint64_t tombstone_count; + uint64_t num_klog_blocks; + uint64_t num_vlog_blocks; + uint64_t klog_data_end_offset; + uint64_t klog_size; + uint64_t vlog_size; + uint64_t max_seq; + bloom_filter_t *bloom_filter; + 
tidesdb_block_index_t *block_indexes; + _Atomic(int) refcount; + /* opened lazily by tidesdb_sstable_ensure_open and published by CAS, so the + * pointers are _Atomic -- readers acquire-load them and so observe the fully + * initialized block_manager the opener built before the publishing CAS */ + _Atomic(block_manager_t *) klog_bm; + _Atomic(block_manager_t *) vlog_bm; + tidesdb_column_family_config_t *config; + _Atomic(int) marked_for_deletion; + _Atomic(time_t) last_access_time; + tidesdb_t *db; + int use_btree; + int64_t btree_root_offset; + int64_t btree_first_leaf; + int64_t btree_last_leaf; + uint64_t btree_node_count; + uint32_t btree_height; + skip_list_comparator_fn cached_comparator_fn; + void *cached_comparator_ctx; + int is_reverse; + uint64_t cache_key_prefix; + /* chunked footer aux blobs -- when a bloom filter or block index footer blob + * exceeds the single-block chunk size it is written as multiple consecutive + * blocks and located by explicit offset+size instead of trailing-block + * navigation. aux_chunked is set (and the offsets persisted in metadata) only + * for such sstables; legacy/small sstables leave it 0 and use the original + * trailing-block read path. 
*/ + int aux_chunked; + uint64_t bloom_blob_offset; + uint64_t bloom_blob_size; + uint64_t index_blob_offset; + uint64_t index_blob_size; +}; + +/** + * tidesdb_level_t + * a level in the lsm tree within a column family + * @param level_num level number + * @param capacity capacity of level in bytes + * @param current_size current size of level in bytes + * @param sstables array of sstable pointers (copy-on-write) + * @param num_sstables number of sstables in array + * @param sstables_capacity capacity of sstables array + * @param file_boundaries file boundaries for partitioning + * @param boundary_sizes sizes of boundary keys + * @param num_boundaries number of boundaries + * @param retired_sstables_arr array of retired sstables (mainly TOCTOU protection) + * @param array_readers count of concurrent readers accessing sstable array + */ +struct tidesdb_level_t +{ + int level_num; + _Atomic(size_t) capacity; + _Atomic(size_t) current_size; + _Atomic(tidesdb_sstable_t **) sstables; + _Atomic(int) num_sstables; + _Atomic(int) sstables_capacity; + _Atomic(uint8_t **) file_boundaries; + _Atomic(size_t *) boundary_sizes; + _Atomic(int) num_boundaries; + _Atomic(tidesdb_sstable_t **) retired_sstables_arr; + _Atomic(int) array_readers; +}; + +/** + * tidesdb_t + * main database handle + * @param db_path path to database directory + * @param config database configuration + * @param column_families array of column families + * @param num_column_families number of column families + * @param cf_capacity capacity of column families array + * @param is_open atomic flag indicating database is fully open and ready for operations + * @param is_recovering flag to determine if system is recovering + * @param comparators atomic pointer to comparators array (lock-free COW) + * @param num_comparators atomic count of registered comparators + * @param comparators_capacity atomic capacity of comparators array + * @param flush_threads array of flush threads + * @param flush_queue queue of 
flush work items + * @param compaction_threads array of compaction threads + * @param compaction_queue queue of compaction work items + * @param sync_thread background thread for interval syncing + * @param sync_thread_active atomic flag indicating if sync thread is active + * @param sync_thread_mutex mutex for sync thread + * @param sync_thread_cond condition variable for sync thread + * @param reaper_thread background thread for housekeeping + * @param reaper_active atomic flag indicating if reaper thread is active + * @param reaper_thread_mutex mutex for reaper thread + * @param reaper_thread_cond condition variable for reaper thread + * @param clock_cache clock cache for hot sstable blocks + * @param btree_node_cache clock cache for hot btree nodes, created lazily on the + * first btree column family so a database with no btree + * column family does not pay for it + * @param btree_cache_lock guards the one time lazy creation of btree_node_cache + * @param resolved_block_cache_size block cache size after clamping, reused when + * btree_node_cache is created lazily + * @param num_open_sstables global counter for open sstables + * @param next_txn_id global transaction id counter + * @param global_seq global sequence counter for snapshots and commits + * @param commit_status tracks which sequences are committed + * @param active_txns_lock rwlock for active transactions list + * @param active_txns array of active serializable transactions + * @param num_active_txns number of active transactions + * @param active_txns_capacity capacity of active transactions array + * @param cached_available_disk_space cached available disk space in bytes + * @param last_disk_space_check timestamp of last disk space check + * @param cached_current_time cached current time updated by reaper thread to avoid syscalls + * @param available_memory available system memory in bytes + * @param total_memory total system memory in bytes + * @param resolved_memory_limit resolved global memory 
limit in bytes + * @param cached_memtable_bytes cached total memtable + cache memory (updated by reaper) + * @param sstable_aux_memory_bytes running total of bloom filter + block index + * memory across every sstable currently in a + * level, maintained at level add and remove so + * the reaper does not rescan every sstable + * @param memory_pressure_level cached pressure level 0=normal 1=elevated 2=high 3=critical + * @param txn_memory_bytes bytes held by in-flight transactions + * @param flush_pending_count number of pending flush operations (queued + in-flight) + * @param active_flushes global semaphore counter for in-flight flushes across all column + * families. capped by config.max_concurrent_flushes. + * @param flush_heartbeat monotonic counter bumped by flush workers as they make progress; + * backpressure reads it to distinguish a slow flush from a wedged one + * @param os_check_counter counter for periodic os-level memory checks + * @param cf_list_lock rwlock for cf list modifications + * @param deferred_free_list lock-free singly-linked list of deferred free nodes for retired arrays + * @param lock_fd file descriptor for lock file + * @param log_file file descriptor for log file + * @param read_stats read profiling statistics (only when TDB_ENABLE_READ_PROFILING is defined) + * @param object_store active object store connector (NULL = local only) + * @param local_cache local file cache manager for object store mode + * @param upload_threads background upload thread pool for async sstable uploads + * @param num_upload_threads number of upload threads + * @param upload_queue queue of upload jobs (tdb_upload_job_t) + * @param last_uploaded_gen highest WAL generation confirmed uploaded to object store + * @param total_uploads lifetime count of objects uploaded to object store + * @param total_upload_failures lifetime count of permanently failed uploads (after all retries) + * @param replica_mode 1 if running as read-only replica, 0 if primary + * @param 
replica_sync_thread_active 1 while the dedicated replica sync thread runs + * @param cancel_compaction set by tidesdb_cancel_background_work; when non-zero, in-flight + * compactions bail at their next checkpoint and queued compaction work + * items are skipped (flushes are unaffected); reset to 0 on open + * @param compaction_helper_budget budget of ephemeral sub-compaction helper threads a compaction + * round may spawn, initialized to num_compaction_threads at open + * @param last_open_fail_log_sec last-emit timestamp (seconds) for the throttled open-failure + * (EMFILE) diagnostic + * @param unified_mt unified memtable mode state (single skip_list + single WAL for all CFs) + * @param last_wal_sync_size WAL file size at last object store sync + * @param replica_sync_thread dedicated replica MANIFEST/WAL sync thread + * @param compaction_gate_lock compaction pause gate; tidesdb_backup holds it across its file copy + * @param compaction_paused pause flag guarded by compaction_gate_lock + * @param active_compactions count of compactions past the gate, in flight + */ +struct tidesdb_t +{ + char *db_path; + tidesdb_config_t config; + tidesdb_column_family_t **column_families; + /* _Atomic -- written under cf_list_lock on cf create/drop but read + * lock-free by tdb_cf_effective_stall on the backpressure hot path */ + _Atomic(int) num_column_families; + int cf_capacity; + _Atomic(int) is_open; + _Atomic(int) is_recovering; + /* set by tidesdb_cancel_background_work -- when non-zero, in-flight compactions + * bail at their next checkpoint and queued compaction work items are skipped. + * compaction-only: flushes are unaffected so durability is preserved. sticky for + * the db session, reset to 0 on open. */ + _Atomic(int) cancel_compaction; + _Atomic(tidesdb_comparator_entry_t *) comparators; + _Atomic(int) num_comparators; + _Atomic(int) comparators_capacity; + pthread_t *flush_threads; + queue_t *flush_queue; + pthread_t *compaction_threads; + queue_t *compaction_queue; + /* budget of ephemeral sub-compaction helper threads a compaction round may spawn, + * initialized to num_compaction_threads at open. bounds total concurrent sub-merge + * threads across all CFs so parallel compaction never oversubscribes the pool. 
*/ + _Atomic(int) compaction_helper_budget; + pthread_t sync_thread; + _Atomic(int) sync_thread_active; + pthread_mutex_t sync_thread_mutex; + pthread_cond_t sync_thread_cond; + pthread_t reaper_thread; + _Atomic(int) reaper_active; + pthread_mutex_t reaper_thread_mutex; + pthread_cond_t reaper_thread_cond; + clock_cache_t *clock_cache; + /* created lazily after worker threads are running, so the pointer is + * _Atomic -- btree_cache_lock still serializes the one-time creation */ + _Atomic(clock_cache_t *) btree_node_cache; + pthread_mutex_t btree_cache_lock; + size_t resolved_block_cache_size; + _Atomic(int) num_open_sstables; + /* last-emit timestamp (seconds) for the throttled open-failure (EMFILE) diagnostic, so a + * descriptor-exhaustion storm logs one legible line per second instead of flooding */ + _Atomic(time_t) last_open_fail_log_sec; + _Atomic(uint64_t) next_txn_id; + _Atomic(uint64_t) global_seq; + tidesdb_commit_status_t *commit_status; + pthread_rwlock_t active_txns_lock; + tidesdb_txn_t **active_txns; + int num_active_txns; + int active_txns_capacity; + _Atomic(uint64_t) cached_available_disk_space; + _Atomic(time_t) last_disk_space_check; + _Atomic(time_t) cached_current_time; + uint64_t available_memory; + uint64_t total_memory; + _Atomic(size_t) resolved_memory_limit; + _Atomic(int64_t) cached_memtable_bytes; + _Atomic(int64_t) sstable_aux_memory_bytes; + _Atomic(int64_t) txn_memory_bytes; + _Atomic(int) memory_pressure_level; + _Atomic(int) flush_pending_count; + _Atomic(int) active_flushes; + _Atomic(uint64_t) flush_heartbeat; + int os_check_counter; + pthread_rwlock_t cf_list_lock; + _Atomic(tidesdb_deferred_free_node_t *) deferred_free_list; + int lock_fd; + FILE *log_file; +#ifdef TDB_ENABLE_READ_PROFILING + tidesdb_read_stats_t read_stats; +#endif + + /* unified memtable mode -- single skip_list + single WAL for all CFs */ + struct + { + int enabled; + _Atomic(tidesdb_memtable_t *) active; + /* read-side epoch for the unified active slot. 
see the analogous + * cf->active_mt_readers field for the protocol */ + _Atomic(int) active_mt_readers; + queue_t *immutables; + _Atomic(int) is_flushing; + _Atomic(int) immutable_cleanup_counter; + size_t write_buffer_size; + _Atomic(uint32_t) next_cf_index; + _Atomic(uint64_t) wal_generation; + tidesdb_unified_cf_index_entry_t *cf_index_map; /* name -> index, mirrors UNIMAP file */ + int cf_index_map_count; + int cf_index_map_capacity; + pthread_mutex_t cf_index_map_lock; + pthread_mutex_t wal_group_sync_lock; /* coordinates group-commit fsync on the unified WAL */ + pthread_cond_t wal_group_sync_cond; + /* last-emit timestamp (seconds) for the throttled unified ceiling-stall warning */ + _Atomic(time_t) last_ceiling_stall_log_sec; + } unified_mt; + + /* object store mode runtime state */ + tidesdb_objstore_t *object_store; /* active connector (NULL = local only) */ + tdb_local_cache_t *local_cache; /* local file cache manager */ + pthread_t *upload_threads; /* background upload thread pool */ + int num_upload_threads; /* number of upload threads */ + queue_t *upload_queue; /* queue of tdb_upload_job_t */ + _Atomic(uint64_t) last_uploaded_gen; /* highest WAL gen confirmed uploaded */ + _Atomic(uint64_t) total_uploads; /* lifetime upload count */ + _Atomic(uint64_t) total_upload_failures; /* lifetime failed upload count */ + _Atomic(uint64_t) last_wal_sync_size; /* WAL file size at last object store sync; + * _Atomic -- reaper writes it, open seeds it */ + + /* replica mode runtime state */ + _Atomic(int) replica_mode; /* 1 = read-only replica, 0 = primary */ + pthread_t replica_sync_thread; /* dedicated replica MANIFEST/WAL sync thread */ + _Atomic(int) replica_sync_thread_active; /* 1 while the replica sync thread runs */ + + /* compaction pause gate -- tidesdb_backup holds this across its file copy + * so the copy cannot race a compaction rewriting the manifest + sstable set */ + pthread_mutex_t compaction_gate_lock; + int compaction_paused; /* guarded by 
compaction_gate_lock */ + _Atomic(int) active_compactions; /* compactions past the gate, in flight */ +}; + +/** + * tidesdb_txn_t + * transaction handle for batched operations with acid guarantees + * + * supports multiple isolation levels: + * -- read_uncommitted sees all versions including uncommitted (dirty reads allowed) + * -- read_committed refreshes snapshot on each read (prevents dirty reads) + * -- repeatable_read consistent snapshot, read-write conflict detection + * -- snapshot consistent snapshot, write-write conflict detection only + * -- serializable full ssi with dangerous structure detection (prevents all anomalies) + * + * snapshot isolation semantics: + * -- snapshot captured at begin (all committed txns with seq <= snapshot_seq are visible) + * -- conflict detection at commit (isolation level dependent) + * -- commit sequence acquired after conflict detection + * -- no retries -- conflicts cause immediate abort + * -- works across multiple column families + * + * @param db database handle + * @param txn_id transaction id + * @param snapshot_seq snapshot sequence captured at begin + * @param commit_seq commit sequence (0 until commit) + * @param ops array of operations + * @param num_ops number of operations + * @param ops_capacity capacity of operations array + * @param read_keys array of read keys for conflict detection + * @param read_key_sizes array of read key sizes + * @param read_seqs array of read sequence numbers + * @param read_cfs array of column families for each read key + * @param read_set_count number of read keys + * @param read_set_capacity capacity of read keys array + * @param read_key_arenas array of read key arenas + * @param read_key_arena_count number of read key arenas + * @param read_key_arena_used bytes used in current read key arena + * @param write_set_hash hash table for O(1) write set lookup (NULL if num_ops < + * TDB_TXN_WRITE_HASH_THRESHOLD) + * @param read_set_hash hash table for O(1) read set lookup (NULL if 
read_set_count < + * TDB_TXN_READ_HASH_THRESHOLD) + * @param cfs array of column families involved in transaction + * @param num_cfs number of column families + * @param cf_capacity capacity of column families array + * @param last_cf cached last-used column family for O(1) single-CF lookup + * @param last_cf_index cached index of last-used column family + * @param savepoint_op_counts per-savepoint int array (presumably num_ops as recorded when each + * savepoint was created -- TODO confirm) + * @param savepoint_cf_counts per-savepoint int array (presumably num_cfs as recorded when each + * savepoint was created -- TODO confirm) + * @param savepoint_names array of savepoint names + * @param num_savepoints number of savepoints + * @param savepoints_capacity capacity of savepoints array + * @param is_committed flag indicating if transaction is committed + * @param is_aborted flag indicating if transaction is aborted + * @param isolation_level isolation level for this transaction + * @param has_rw_conflict_in flag indicating rw-conflict-in (another txn read our writes) + * @param has_rw_conflict_out flag indicating rw-conflict-out (we read another txn's writes) + * @param mem_bytes running total of this txn's op buffer + read-key arena bytes (owned by the + * committing thread, so plain non-atomic accounting) + * @param mem_published amount of mem_bytes already reflected in db->txn_memory_bytes; the delta + * is flushed to the global counter in threshold-sized batches + */ +struct tidesdb_txn_t +{ + tidesdb_t *db; + uint64_t txn_id; + uint64_t snapshot_seq; + uint64_t commit_seq; + tidesdb_txn_op_t *ops; + int num_ops; + int ops_capacity; + uint8_t **read_keys; + size_t *read_key_sizes; + uint64_t *read_seqs; + tidesdb_column_family_t **read_cfs; + int read_set_count; + int read_set_capacity; + uint8_t **read_key_arenas; + int read_key_arena_count; + size_t read_key_arena_used; + void *write_set_hash; + void *read_set_hash; + tidesdb_column_family_t **cfs; + int num_cfs; + int cf_capacity; + tidesdb_column_family_t *last_cf; + int last_cf_index; + int *savepoint_op_counts; + int *savepoint_cf_counts; + char **savepoint_names; + int num_savepoints; + int savepoints_capacity; + /* 
these flags are read cross-txn by tidesdb_txn_check_ssi_conflicts while + * the owning txn writes them on commit/abort, so they are _Atomic */ + _Atomic(int) is_committed; + _Atomic(int) is_aborted; + tidesdb_isolation_level_t isolation_level; + _Atomic(int) has_rw_conflict_in; + _Atomic(int) has_rw_conflict_out; + int64_t mem_bytes; + int64_t mem_published; +}; + +/** + * tidesdb_iter_t + * iterator for database + * @param cf column family (for single-cf iteration) + * @param txn transaction (for isolation and multi-cf iteration) + * @param heap merge heap + * @param current current key-value pair + * @param valid validity flag + * @param direction direction of iteration (1=forward, -n=backward) + * @param snapshot_time snapshot time for ttl checks + * @param cf_snapshot snapshot sequence for visibility checks + * @param cached_sources cached sst sources for reuse across seeks + * @param num_cached_sources number of cached sources + * @param cached_sources_capacity capacity of cached sources array + * @param cached_mt_sources cached memtable sources for reuse across seeks + * @param num_cached_mt_sources number of cached memtable sources + * @param temp_sources pre-allocated temporary source array for seek operations + * @param temp_sources_capacity capacity of temp_sources array + */ +struct tidesdb_iter_t +{ + tidesdb_column_family_t *cf; + tidesdb_txn_t *txn; + tidesdb_merge_heap_t *heap; + tidesdb_kv_pair_t *current; + int valid; + int direction; + time_t snapshot_time; + uint64_t cf_snapshot; + void **cached_sources; + int num_cached_sources; + int cached_sources_capacity; + void **cached_mt_sources; + int num_cached_mt_sources; + void **temp_sources; + int temp_sources_capacity; +}; + +/** + * tidesdb_stats_t + * statistics for database column family + * @param num_levels number of levels + * @param memtable_size size of memtable + * @param level_sizes sizes of each level + * @param level_num_sstables number of sstables in each level + * @param config column 
family configuration + * @param total_keys total number of keys across memtable and all sstables + * @param total_data_size total data size (klog + vlog) across all sstables + * @param avg_key_size average key size in bytes + * @param avg_value_size average value size in bytes + * @param level_key_counts number of keys per level + * @param read_amp read amplification (point lookup cost multiplier) + * @param hit_rate cache hit rate (0.0 if cache disabled) + * @param use_btree whether column family uses b+tree klog format + * @param btree_total_nodes total b+tree nodes across all sstables + * @param btree_max_height maximum tree height across all sstables + * @param btree_avg_height average tree height across all sstables + * @param total_tombstones sum of tombstone_count across every sstable in the cf + * @param tombstone_ratio total_tombstones / total_keys (0.0 if total_keys is 0) + * @param level_tombstone_counts tombstone count per level (parallels level_key_counts) + * @param max_sst_density worst per-sstable tombstone density observed in the cf + * @param max_sst_density_level 1-based level where max_sst_density was observed (0 if none) + */ +struct tidesdb_stats_t +{ + int num_levels; + size_t memtable_size; + size_t *level_sizes; + int *level_num_sstables; + tidesdb_column_family_config_t *config; + uint64_t total_keys; + uint64_t total_data_size; + double avg_key_size; + double avg_value_size; + uint64_t *level_key_counts; + double read_amp; + double hit_rate; + /* btree stats (only populated if use_btree=1) */ + int use_btree; + uint64_t btree_total_nodes; + uint32_t btree_max_height; + double btree_avg_height; + /* tombstone observability */ + uint64_t total_tombstones; + double tombstone_ratio; + uint64_t *level_tombstone_counts; + double max_sst_density; + int max_sst_density_level; +}; + +/** + * tidesdb_cache_stats_t + * statistics for database block cache + * @param enabled whether block cache is enabled + * @param total_entries total number of 
cached entries + * @param total_bytes total bytes used by cache + * @param hits cache hits + * @param misses cache misses + * @param hit_rate hit rate (hits / (hits + misses)) + * @param num_partitions number of cache partitions + */ +typedef struct tidesdb_cache_stats_t +{ + int enabled; + size_t total_entries; + size_t total_bytes; + uint64_t hits; + uint64_t misses; + double hit_rate; + size_t num_partitions; +} tidesdb_cache_stats_t; + +/** + * tidesdb_db_stats_t + * database-level statistics + * @param num_column_families number of column families + * @param total_memory system total memory + * @param available_memory system available memory at open + * @param resolved_memory_limit resolved memory limit + * @param memory_pressure_level current memory pressure level (0=normal, 1=elevated, 2=high, + * 3=critical) + * @param flush_pending_count number of pending flush operations (queued + in-flight) + * @param total_memtable_bytes total bytes in active memtables across all CFs + * @param total_immutable_count total immutable memtables across all CFs + * @param total_sstable_count total sstables across all CFs and levels + * @param total_data_size_bytes total data size across all CFs + * @param num_open_sstables number of currently open sstable file handles + * @param global_seq current global sequence number + * @param txn_memory_bytes bytes held by in-flight transactions + * @param compaction_queue_size number of pending compaction tasks + * @param flush_queue_size number of pending flush tasks in queue + * @param unified_memtable_enabled whether unified memtable mode is active + * @param unified_memtable_bytes bytes in unified active memtable + * @param unified_immutable_count number of unified immutable memtables + * @param unified_is_flushing whether unified memtable is currently flushing/rotating + * @param unified_next_cf_index next CF index to be assigned in unified mode + * @param unified_wal_generation current unified WAL generation counter + * @param 
object_store_enabled whether object store mode is active + * @param object_store_connector connector name ("s3", "gcs", "fs", etc.) + * @param local_cache_bytes_used current local file cache usage in bytes + * @param local_cache_bytes_max configured maximum local cache size in bytes + * @param local_cache_num_files number of files tracked in local cache + * @param last_uploaded_generation highest WAL generation confirmed uploaded + * @param upload_queue_depth number of pending upload jobs in the queue + * @param total_uploads lifetime count of objects uploaded to object store + * @param total_upload_failures lifetime count of permanently failed uploads (after all retries) + * @param replica_mode whether running in read-only replica mode + */ +typedef struct tidesdb_db_stats_t +{ + int num_column_families; + uint64_t total_memory; + uint64_t available_memory; + size_t resolved_memory_limit; + int memory_pressure_level; + int flush_pending_count; + int64_t total_memtable_bytes; + int total_immutable_count; + int total_sstable_count; + uint64_t total_data_size_bytes; + int num_open_sstables; + uint64_t global_seq; + int64_t txn_memory_bytes; + size_t compaction_queue_size; + size_t flush_queue_size; + int unified_memtable_enabled; + int64_t unified_memtable_bytes; + int unified_immutable_count; + int unified_is_flushing; + uint32_t unified_next_cf_index; + uint64_t unified_wal_generation; + int object_store_enabled; + const char *object_store_connector; + size_t local_cache_bytes_used; + size_t local_cache_bytes_max; + int local_cache_num_files; + uint64_t last_uploaded_generation; + size_t upload_queue_depth; + uint64_t total_uploads; + uint64_t total_upload_failures; + int replica_mode; +} tidesdb_db_stats_t; + +/** + * tidesdb_default_column_family_config + * @return default configuration for column family + */ +tidesdb_column_family_config_t tidesdb_default_column_family_config(void); + +/** + * tidesdb_default_config + * @return default configuration for a 
database + */ +tidesdb_config_t tidesdb_default_config(void); + +/** + * tidesdb_open + * opens an existing database or creates a new one + * @param config database configuration + * @param db output parameter for database handle + * @return 0 on success, -n on failure + */ +int tidesdb_open(const tidesdb_config_t *config, tidesdb_t **db); + +/** + * tidesdb_raise_open_file_limit + * raise this process's open-file ceiling toward `desired` descriptors so a database can keep more + * sstables open -- the engine sizes max_open_sstables to fit this at open time, so call it BEFORE + * tidesdb_open. an explicit, opt-in operator action: tidesdb never raises the limit itself. POSIX + * raises the RLIMIT_NOFILE soft limit toward the hard limit; Windows raises the CRT stdio cap + * (max 8192). a failed or partial raise is non-fatal -- the prior ceiling stands. + * @param desired target descriptor count; <= 0 just reports the current ceiling + * @return the open-file ceiling in effect after the attempt + */ +long tidesdb_raise_open_file_limit(long desired); + +/** + * tidesdb_register_comparator + * registers a custom comparator function + * @param db database handle + * @param name unique name for the comparator (max 63 chars) + * @param fn comparator function pointer + * @param ctx_str optional context string for serialization (can be NULL) + * @param ctx optional runtime context pointer (can be NULL) + * @return 0 on success, -n on failure (duplicate name, invalid args, etc.) 
+ */ +int tidesdb_register_comparator(tidesdb_t *db, const char *name, skip_list_comparator_fn fn, + const char *ctx_str, void *ctx); + +/** + * tidesdb_get_comparator + * retrieves a registered comparator by name + * @param db database handle + * @param name comparator name + * @param fn output parameter for comparator function (can be NULL) + * @param ctx output parameter for runtime context pointer (can be NULL) + * @return 0 on success, -n if not found + */ +int tidesdb_get_comparator(tidesdb_t *db, const char *name, skip_list_comparator_fn *fn, + void **ctx); + +/** + * tidesdb_close + * closes a database + * @param db database handle + * @return 0 on success, -n on failure + */ +int tidesdb_close(tidesdb_t *db); + +/** + * tidesdb_promote_to_primary + * switch a read-only replica to primary mode. performs a final WAL replay + * and MANIFEST sync, then enables write acceptance. + * @param db database handle in replica mode + * @return TDB_SUCCESS on success, TDB_ERR_INVALID_ARGS if not a replica + */ +int tidesdb_promote_to_primary(tidesdb_t *db); + +#ifdef TDB_ENABLE_READ_PROFILING +/** + * tidesdb_get_read_stats + * gets read profiling statistics + * @param db the database + * @param stats output statistics structure + * @return TDB_SUCCESS on success, error code on failure + */ +int tidesdb_get_read_stats(tidesdb_t *db, tidesdb_read_stats_t *stats); + +/** + * tidesdb_print_read_stats + * prints read profiling statistics to stdout + * @param db the database + */ +void tidesdb_print_read_stats(tidesdb_t *db); + +/** + * tidesdb_reset_read_stats + * resets read profiling statistics + * @param db the database + */ +void tidesdb_reset_read_stats(tidesdb_t *db); +#endif + +/** + * tidesdb_create_column_family + * creates a new column family with specified configuration + * @param db database handle + * @param name name of column family + * @param config configuration for column family + * @return 0 on success, -n on failure + */ +int 
tidesdb_create_column_family(tidesdb_t *db, const char *name, + const tidesdb_column_family_config_t *config); + +/** + * tidesdb_drop_column_family + * drops a column family + * @param db database handle + * @param name name of column family + * @return 0 on success, -n on failure + */ +int tidesdb_drop_column_family(tidesdb_t *db, const char *name); + +/** + * tidesdb_delete_column_family + * drops a column family passing pointer instead of string + * @param db database handle + * @param cf column family to drop + * @return 0 on success, -n on failure + */ +int tidesdb_delete_column_family(tidesdb_t *db, tidesdb_column_family_t *cf); + +/** + * tidesdb_rename_column_family + * renames a column family safely (flushes pending data first) + * @param db database handle + * @param old_name current name of column family + * @param new_name new name for column family + * @return 0 on success, -n on failure + */ +int tidesdb_rename_column_family(tidesdb_t *db, const char *old_name, const char *new_name); + +/** + * tidesdb_get_column_family + * gets a column family from a database + * @param db database handle + * @param name name of column family + * @return pointer to column family, NULL on failure + */ +tidesdb_column_family_t *tidesdb_get_column_family(tidesdb_t *db, const char *name); + +/** + * tidesdb_list_column_families + * lists all column families in requested database + * @param db database handle + * @param names pointer to array of column family names (caller must free each name and the array) + * @param count pointer to store the number of column families + * @return 0 on success, -n on failure + */ +int tidesdb_list_column_families(tidesdb_t *db, char ***names, int *count); + +/** + * tidesdb_txn_begin + * begins a transaction with default isolation level (READ_COMMITTED) + * @param db database handle + * @param txn pointer to transaction handle + * @return 0 on success, -n on failure + */ +int tidesdb_txn_begin(tidesdb_t *db, tidesdb_txn_t **txn); + +/** 
+ * tidesdb_txn_begin_with_isolation + * begins a transaction with specified isolation level + * @param db database handle + * @param isolation isolation level + * @param txn pointer to transaction handle + * @return 0 on success, -n on failure + */ +int tidesdb_txn_begin_with_isolation(tidesdb_t *db, tidesdb_isolation_level_t isolation, + tidesdb_txn_t **txn); + +/** + * tidesdb_txn_put + * adds a write operation to a transaction + * @param txn transaction handle + * @param cf column family to put into + * @param key key to put + * @param key_size size of key + * @param value value to put + * @param value_size size of value + * @param ttl time-to-live for key-value pair + * @return 0 on success, -n on failure + */ +int tidesdb_txn_put(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, const uint8_t *key, + size_t key_size, const uint8_t *value, size_t value_size, time_t ttl); + +/** + * tidesdb_txn_get + * gets a value from a transaction + * @param txn transaction handle + * @param cf column family to get from + * @param key key to get + * @param key_size size of key + * @param value pointer to value + * @param value_size pointer to size of value + * @return 0 on success, -n on failure + */ +int tidesdb_txn_get(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, const uint8_t *key, + size_t key_size, uint8_t **value, size_t *value_size); + +/** + * tidesdb_txn_delete + * adds a delete operation to a transaction + * @param txn transaction handle + * @param cf column family to delete from + * @param key key to delete + * @param key_size size of key + * @return 0 on success, -n on failure + */ +int tidesdb_txn_delete(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, const uint8_t *key, + size_t key_size); + +/** + * tidesdb_txn_single_delete + * adds a single-delete operation to a transaction + * + * the caller promises that for this key there is at most one put between this + * single-delete and the previous single-delete (or the beginning). 
with that + * promise compaction is free to drop the put and the single-delete together + * in the first merge that sees both, instead of carrying the tombstone forward + * until the largest level. this dramatically reduces tombstone accumulation + * for insert-once delete-once workloads and for secondary index maintenance. + * + * calling single-delete on a key that has been put more than once since the + * last single-delete is a contract violation and may expose older values. + * when in doubt, use tidesdb_txn_delete. + * + * for visibility and normal read semantics a single-delete behaves exactly + * like tidesdb_txn_delete. + * + * @param txn transaction handle + * @param cf column family to delete from + * @param key key to delete + * @param key_size size of key + * @return 0 on success, -n on failure + */ +int tidesdb_txn_single_delete(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, const uint8_t *key, + size_t key_size); + +/** + * tidesdb_txn_rollback + * rolls back a transaction + * @param txn transaction handle + * @return 0 on success, -n on failure + */ +int tidesdb_txn_rollback(tidesdb_txn_t *txn); + +/** + * tidesdb_txn_commit + * commits a transaction to the database + * + * multi-CF atomicity: at runtime a transaction is all-or-nothing across all its column + * families -- a single commit sequence gates visibility, so nothing is visible until the one + * commit point. crash/failure atomicity differs by memtable mode: UNIFIED mode is crash-atomic + * across CFs (the whole transaction is one atomic WAL batch), whereas per-CF mode writes a + * separate WAL per CF, so a crash or IO/OOM failure mid-commit can leave a partially-applied + * prefix (the CFs written before the failure) that recovery treats as committed. use unified + * memtable mode when you need crash-atomic multi-CF transactions.
+ * + * @param txn transaction handle + * @return 0 on success, -n on failure + */ +int tidesdb_txn_commit(tidesdb_txn_t *txn); + +/** + * tidesdb_txn_free + * frees the transaction + * @param txn transaction handle + */ +void tidesdb_txn_free(tidesdb_txn_t *txn); + +/** + * tidesdb_txn_reset + * resets a committed or aborted transaction for reuse without freeing/reallocating buffers + * keeps the ops array, read set arrays, arenas, cfs array, and savepoints array allocated + * frees op key/value data, resets read set counts, clears hash tables, frees savepoint children + * assigns a fresh txn_id and snapshot_seq based on the new isolation level + * @param txn transaction handle (must be committed or aborted) + * @param isolation new isolation level for the reset transaction + * @return 0 on success, -n on failure + */ +int tidesdb_txn_reset(tidesdb_txn_t *txn, tidesdb_isolation_level_t isolation); + +/** + * tidesdb_txn_savepoint + * creates a savepoint in the transaction + * @param txn transaction handle + * @param name name of savepoint + * @return 0 on success, -n on failure + */ +int tidesdb_txn_savepoint(tidesdb_txn_t *txn, const char *name); + +/** + * tidesdb_txn_rollback_to_savepoint + * rolls back transaction to a savepoint + * @param txn transaction handle + * @param name name of savepoint + * @return 0 on success, -n on failure + */ +int tidesdb_txn_rollback_to_savepoint(tidesdb_txn_t *txn, const char *name); + +/** + * tidesdb_txn_release_savepoint + * releases a savepoint without rolling back + * @param txn transaction handle + * @param name name of savepoint + * @return 0 on success, -n on failure + */ +int tidesdb_txn_release_savepoint(tidesdb_txn_t *txn, const char *name); + +/** + * tidesdb_iter_new + * creates a new iterator for a specific cf in the transaction + * @param txn transaction handle + * @param cf column family to iterate + * @param iter pointer to iterator handle + * @return 0 on success, -n on failure + */ +int 
tidesdb_iter_new(tidesdb_txn_t *txn, tidesdb_column_family_t *cf, tidesdb_iter_t **iter); + +/** + * tidesdb_iter_seek + * seeks to a key in the iterator + * @param iter iterator handle + * @param key key to seek to + * @param key_size size of key + * @return 0 on success, -n on failure + */ +int tidesdb_iter_seek(tidesdb_iter_t *iter, const uint8_t *key, size_t key_size); + +/** + * tidesdb_iter_seek_for_prev + * seeks to a previous key in the iterator + * @param iter iterator handle + * @param key key to seek to + * @param key_size size of key + * @return 0 on success, -n on failure + */ +int tidesdb_iter_seek_for_prev(tidesdb_iter_t *iter, const uint8_t *key, size_t key_size); + +/** + * tidesdb_iter_seek_to_first + * seeks to the first key in the iterator + * @param iter iterator handle + * @return 0 on success, -n on failure + */ +int tidesdb_iter_seek_to_first(tidesdb_iter_t *iter); + +/** + * tidesdb_iter_seek_to_last + * seeks to the last key in the iterator + * @param iter iterator handle + * @return 0 on success, -n on failure + */ +int tidesdb_iter_seek_to_last(tidesdb_iter_t *iter); + +/** + * tidesdb_iter_next + * advances the iterator to the next key + * @param iter iterator handle + * @return 0 on success, -n on failure + */ +int tidesdb_iter_next(tidesdb_iter_t *iter); + +/** + * tidesdb_iter_prev + * moves the iterator back to the previous key + * @param iter iterator handle + * @return 0 on success, -n on failure + */ +int tidesdb_iter_prev(tidesdb_iter_t *iter); + +/** + * tidesdb_iter_valid + * checks if an iterator is valid + * @param iter iterator handle + * @return non-zero if valid, 0 if invalid + */ +int tidesdb_iter_valid(tidesdb_iter_t *iter); + +/** + * tidesdb_iter_key + * gets a key from an iterator + * @param iter iterator handle + * @param key pointer to key + * @param key_size pointer to size of key + * @return 0 on success, -n on failure + */ +int tidesdb_iter_key(tidesdb_iter_t *iter, uint8_t **key, size_t *key_size); + +/** + *
tidesdb_iter_value + * gets a value from an iterator + * @param iter iterator handle + * @param value pointer to value + * @param value_size pointer to size of value + * @return 0 on success, -n on failure + */ +int tidesdb_iter_value(tidesdb_iter_t *iter, uint8_t **value, size_t *value_size); + +/** + * tidesdb_iter_key_value + * gets both key and value from an iterator in a single call + * @param iter iterator handle + * @param key pointer to key + * @param key_size pointer to size of key + * @param value pointer to value + * @param value_size pointer to size of value + * @return 0 on success, -n on failure + */ +int tidesdb_iter_key_value(tidesdb_iter_t *iter, uint8_t **key, size_t *key_size, uint8_t **value, + size_t *value_size); + +/** + * tidesdb_iter_free + * frees an iterator + * @param iter iterator handle + */ +void tidesdb_iter_free(tidesdb_iter_t *iter); + +/** + * tidesdb_comparator_memcmp + * binary comparison using memcmp (default) + * compares keys byte-by-byte + * @param key1 first key + * @param key1_size size of first key + * @param key2 second key + * @param key2_size size of second key + * @param ctx unused context + * @return <0 if key1 < key2, 0 if equal, >0 if key1 > key2 + */ +int tidesdb_comparator_memcmp(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/** + * tidesdb_comparator_lexicographic + * lexicographic string comparison + * treats keys as null-terminated strings + * @param key1 first key + * @param key1_size size of first key + * @param key2 second key + * @param key2_size size of second key + * @param ctx unused context + * @return <0 if key1 < key2, 0 if equal, >0 if key1 > key2 + */ +int tidesdb_comparator_lexicographic(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/** + * tidesdb_comparator_uint64 + * compares keys as 64-bit unsigned integers (little-endian) + * keys must be exactly 8 bytes + * @param key1 first key (8 bytes) + * 
@param key1_size size of first key (must be 8) + * @param key2 second key (8 bytes) + * @param key2_size size of second key (must be 8) + * @param ctx unused context + * @return <0 if key1 < key2, 0 if equal, >0 if key1 > key2 + */ +int tidesdb_comparator_uint64(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/** + * tidesdb_comparator_int64 + * compares keys as 64-bit signed integers (little-endian) + * keys must be exactly 8 bytes + * @param key1 first key (8 bytes) + * @param key1_size size of first key (must be 8) + * @param key2 second key (8 bytes) + * @param key2_size size of second key (must be 8) + * @param ctx unused context + * @return <0 if key1 < key2, 0 if equal, >0 if key1 > key2 + */ +int tidesdb_comparator_int64(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/** + * tidesdb_comparator_reverse_memcmp + * reverse binary comparison (descending order) + * useful for reverse-sorted indexes + * @param key1 first key + * @param key1_size size of first key + * @param key2 second key + * @param key2_size size of second key + * @param ctx unused context + * @return >0 if key1 < key2, 0 if equal, <0 if key1 > key2 + */ +int tidesdb_comparator_reverse_memcmp(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/** + * tidesdb_comparator_case_insensitive + * case-insensitive string comparison + * treats keys as ASCII strings + * @param key1 first key + * @param key1_size size of first key + * @param key2 second key + * @param key2_size size of second key + * @param ctx unused context + * @return <0 if key1 < key2, 0 if equal, >0 if key1 > key2 + */ +int tidesdb_comparator_case_insensitive(const uint8_t *key1, size_t key1_size, const uint8_t *key2, + size_t key2_size, void *ctx); + +/** + * tidesdb_cf_set_commit_hook + * sets or clears the commit hook for a column family at runtime + * pass NULL for fn to disable the hook + * 
@param cf column family handle + * @param fn commit hook callback (or NULL to disable) + * @param ctx user-provided context passed to the callback + * @return TDB_SUCCESS on success, TDB_ERR_INVALID_ARGS if cf is NULL + */ +int tidesdb_cf_set_commit_hook(tidesdb_column_family_t *cf, tidesdb_commit_hook_fn fn, void *ctx); + +/** + * tidesdb_compact + * runs a full compaction on a column family. every active level is merged + * into the largest so all garbage (tombstones, single-delete pairs, + * superseded puts) is reclaimed; with a single disk level the merge is a + * self-rewrite of that level. blocks until the work item has been + * serviced, including any compaction already in flight on this cf + * @param cf column family handle + * @return 0 on success, -n on failure + */ +int tidesdb_compact(tidesdb_column_family_t *cf); + +/** + * tidesdb_compact_range + * synchronously compacts every sstable in the column family whose [min_key, max_key] + * overlaps the caller supplied [start_key, end_key) range. output is merged toward the + * largest level affected by the input set, so any tombstones in the range that meet + * their dead puts are dropped during this pass. the caller blocks until the merge + * completes. intended for bulk reclaim after large range deletes -- emit point + * tombstones with tidesdb_txn_delete, then call this to physically merge them out. + * + * NULL start_key means unbounded low, NULL end_key means unbounded high. both NULL + * is rejected with TDB_ERR_INVALID_ARGS so callers go through tidesdb_compact for + * full cf compaction. 
+ * + * @param cf column family handle + * @param start_key inclusive range start (NULL = unbounded low) + * @param start_key_size size of start_key in bytes (0 if start_key is NULL) + * @param end_key exclusive range end (NULL = unbounded high) + * @param end_key_size size of end_key in bytes (0 if end_key is NULL) + * @return TDB_SUCCESS on success, TDB_ERR_INVALID_ARGS for bad args, TDB_ERR_LOCKED + * if another compaction is already running, or other error codes from the + * underlying merge + */ +int tidesdb_compact_range(tidesdb_column_family_t *cf, const uint8_t *start_key, + size_t start_key_size, const uint8_t *end_key, size_t end_key_size); + +/** + * tidesdb_flush_memtable + * flushes a column family's memtable to disk (sorted run to level 1) + * @param cf column family handle + * @return 0 on success, -n on failure + */ +int tidesdb_flush_memtable(tidesdb_column_family_t *cf); + +/** + * tidesdb_is_flushing + * checks if a column family is currently flushing + * @param cf column family handle + * @return 1 if flushing, 0 if not flushing + */ +int tidesdb_is_flushing(tidesdb_column_family_t *cf); + +/** + * tidesdb_is_compacting + * checks if a column family is currently compacting + * @param cf column family handle + * @return 1 if compacting, 0 if not compacting + */ +int tidesdb_is_compacting(tidesdb_column_family_t *cf); + +/** + * tidesdb_cf_config_load_from_ini + * loads the column family configuration from an INI file + * @param ini_file INI file path + * @param section_name section name in INI file + * @param config pointer to column family configuration + * @return 0 on success, -n on failure + */ +int tidesdb_cf_config_load_from_ini(const char *ini_file, const char *section_name, + tidesdb_column_family_config_t *config); + +/** + * tidesdb_cf_config_save_to_ini + * saves a column family configuration to an INI file (column family config) + * @param ini_file INI file path + * @param section_name section name in INI file + * @param config 
pointer to column family configuration + * @return 0 on success, -n on failure + */ +int tidesdb_cf_config_save_to_ini(const char *ini_file, const char *section_name, + const tidesdb_column_family_config_t *config); + +/** + * tidesdb_cf_update_runtime_config + * updates the runtime configuration of a column family + * @param cf column family handle + * @param new_config new configuration + * @param persist_to_disk whether to persist the configuration to disk + * @return 0 on success, -n on failure + */ +int tidesdb_cf_update_runtime_config(tidesdb_column_family_t *cf, + const tidesdb_column_family_config_t *new_config, + int persist_to_disk); + +/** + * tidesdb_get_stats + * gets the statistics of a column family + * @param cf column family handle + * @param stats pointer to statistics + * @return 0 on success, -n on failure + */ +int tidesdb_get_stats(tidesdb_column_family_t *cf, tidesdb_stats_t **stats); + +/** + * tidesdb_free_stats + * frees the statistics of the column family + * @param stats statistics + */ +void tidesdb_free_stats(tidesdb_stats_t *stats); + +/** + * tidesdb_get_db_stats + * gets database-level statistics (memory, pressure, queues, totals across all CFs) + * @param db database handle + * @param stats output parameter for database statistics (caller provides pointer to struct) + * @return 0 on success, -n on failure + */ +int tidesdb_get_db_stats(tidesdb_t *db, tidesdb_db_stats_t *stats); + +/** + * tidesdb_get_cache_stats + * gets block cache statistics for the database + * @param db database handle + * @param stats output parameter for cache statistics + * @return 0 on success, -n on failure + * @note if block cache is disabled, stats->enabled will be 0 and other fields will be zero + */ +int tidesdb_get_cache_stats(tidesdb_t *db, tidesdb_cache_stats_t *stats); + +/** + * tidesdb_backup + * backup current database to a directory. 
this is a best effort backup that copies immutable files + * first, then forces a sorted run, waits for the flush/compaction queues to drain, and performs a + * final copy to pick up wal's and the manifest while skipping already copied sstable files. + * @param db database handle + * @param dir destination directory for the backup + * @return 0 on success, -n on failure + */ +int tidesdb_backup(tidesdb_t *db, char *dir); + +/** + * tidesdb_checkpoint + * creates a lightweight checkpoint of the database using hard links for sstable files. + * this is much faster than a full backup since sstable files (which are immutable) are + * hard-linked rather than copied. only small metadata files (manifest, config) are copied. + * + * the checkpoint is a fully openable tidesdb database directory. + * + * algorithm: + * 1. for each column family -- we flush memtable, halt compactions + * 2. hard link all live sstable files into the checkpoint directory + * 3. copy manifest and config files + * 4. resume compactions + * + * if hard linking fails (e.g., cross-filesystem), falls back to file copy. + * + * @param db database handle + * @param checkpoint_dir destination directory for the checkpoint (must not exist or be empty) + * @return 0 on success, -n on failure + */ +int tidesdb_checkpoint(tidesdb_t *db, const char *checkpoint_dir); + +/** + * tidesdb_clone_column_family + * clones an existing column family to a new column family with a different name. + * flushes the source memtable, waits for background operations, copies all sstable files, + * and creates a new column family structure with the copied data. 
+ * @param db database handle + * @param src_name name of the source column family to clone + * @param dst_name name for the new cloned column family + * @return TDB_SUCCESS on success, TDB_ERR_NOT_FOUND if source doesn't exist, + * TDB_ERR_EXISTS if destination already exists, or other error codes on failure + */ +int tidesdb_clone_column_family(tidesdb_t *db, const char *src_name, const char *dst_name); + +/** + * tidesdb_purge_cf + * forces a full flush of the active memtable and triggers aggressive compaction for a column + * family. waits for all flush and compaction I/O to complete before returning. this is useful for + * manual maintenance, pre-backup preparation, or reclaiming space after bulk deletes. + * @param cf column family handle + * @return 0 on success, -n on failure + */ +int tidesdb_purge_cf(tidesdb_column_family_t *cf); + +/** + * tidesdb_purge + * forces a full flush and aggressive compaction for all column families. + * waits for all flush and compaction queues to fully drain before returning. + * @param db database handle + * @return 0 on success, first non-zero error code on failure (continues processing remaining CFs) + */ +int tidesdb_purge(tidesdb_t *db); + +/** + * tidesdb_cancel_background_work + * cancels background compaction db-wide: in-flight merges bail at their next + * checkpoint (uncommitted output is discarded, inputs left intact -- recovery-safe) + * and queued compaction work is skipped. flushes are unaffected so durability is + * preserved. blocks (bounded) until compaction is idle. the cancel is sticky for + * this database session and is reset on the next tidesdb_open, so it is intended to + * be called immediately before tidesdb_close for a fast shutdown when a large + * compaction backlog would otherwise make close wait seconds to minutes.
+ * @param db database handle + * @return TDB_SUCCESS, or TDB_ERR_INVALID_ARGS if db is NULL + */ +int tidesdb_cancel_background_work(tidesdb_t *db); + +/** + * tidesdb_range_cost + * estimate the computational cost of iterating between two keys in a column family. + * the returned cost is an opaque double -- meaningful only for comparison with other + * values from the same function. uses only in-memory metadata (block indexes, sstable + * min/max keys, entry counts); performs no disk I/O and no iteration. + * + * when block indexes are enabled, cost is estimated via O(log B) binary search per + * overlapping sstable. when block indexes are disabled, a byte-level key interpolation + * fallback is used instead. + * + * @param cf column family + * @param key_a first key (bound of range) + * @param key_a_size size of first key + * @param key_b second key (bound of range) + * @param key_b_size size of second key + * @param cost output -- estimated traversal cost (higher = more expensive) + * @return TDB_SUCCESS on success, TDB_ERR_INVALID_ARGS on bad input + */ +int tidesdb_range_cost(tidesdb_column_family_t *cf, const uint8_t *key_a, size_t key_a_size, + const uint8_t *key_b, size_t key_b_size, double *cost); + +/** + * tidesdb_sync_wal + * forces an fsync of the active WAL for a column family. + * useful for explicit durability control when using TDB_SYNC_NONE or TDB_SYNC_INTERVAL modes. + * @param cf column family handle + * @return 0 on success, -n on failure + */ +int tidesdb_sync_wal(tidesdb_column_family_t *cf); + +/** + * tidesdb_free + * frees a pointer allocated by TidesDB + * @param ptr pointer to free + */ +void tidesdb_free(void *ptr); + +#endif /* __TIDESDB_H__ */