Skip to content

Commit 63c0817

Browse files
feat(kernel): wire CUJ-gap fixes — staging fail-loud, error context, sync cancel
Connector half of the API+CUJ audit fixes (kernel half: databricks-sql-kernel PR #121). Bumps KERNEL_REV to pick up the kernel surface. Staging fail-loud (kernel/client.py): - Volume/staging PUT/GET/REMOVE silently no-op'd on the kernel path (KernelResultSet.is_staging_operation is always False, so the connector's _handle_staging_operation never fired and no file was transferred). Detect the leading verb in execute_command and raise NotSupportedError so ETL fails loud instead of ingesting stale data. Error context (_errors.py): - Forward display_message / diagnostic_info / error_details_json (now exposed across the pyo3 boundary in #121) onto the re-raised PEP-249 exception, and populate ServerOperationError.context with "diagnostic-info" (Spark stack trace) + "operation-id" — matching the Thrift backend so callers reading err.context work identically. Sync cancel wiring (client.py, kernel/client.py): - cursor.cancel() was a silent no-op for the default blocking execute() (active_command_id is None until execute returns). The kernel backend now registers a detached StatementCanceller (keyed by the cursor) before the blocking execute and exposes cancel_running_cursor(cursor). Cursor.cancel() routes to that hook via getattr when there's no command id yet — opt-in, so Thrift/SEA backends are unaffected. Tests: unit (staging fail-loud, _is_staging_statement, cancel registry + routing, error context/diagnostic-info forwarding) and e2e (tz-aware TIMESTAMP, scientific DECIMAL, staging NotSupportedError, diagnostic-info context, cross-thread sync cancel interrupts a running query). All e2e verified live against dogfood with use_kernel=True. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala <vikrant.puppala@databricks.com>
1 parent 614dd91 commit 63c0817

7 files changed

Lines changed: 436 additions & 10 deletions

File tree

KERNEL_REV

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
101aa465e71991eec98102bba77aad2f7ad8faed
1+
f17a302de06fcee3dea02089474ee2d71725a136

src/databricks/sql/backend/kernel/_errors.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,23 @@ def reraise_kernel_error(exc: "_kernel.KernelError") -> "Error":
9797
"""
9898
code = getattr(exc, "code", "Unknown")
9999
cls = _CODE_TO_EXCEPTION.get(code, DatabaseError)
100-
new = cls(getattr(exc, "message", str(exc)))
100+
101+
# For ServerOperationError, reproduce the Thrift backend's
102+
# ``context`` dict so callers that read
103+
# ``err.context["diagnostic-info"]`` (the Spark stack trace) /
104+
# ``err.context["operation-id"]`` get the same shape on the kernel
105+
# path. ``diagnostic_info`` is forwarded from the kernel error (it
106+
# now crosses the PyO3 boundary; older wheels return ``None`` via
107+
# ``getattr``, so this degrades gracefully). Matches
108+
# thrift_backend.py's ServerOperationError construction.
109+
context = None
110+
if cls is ServerOperationError:
111+
context = {
112+
"operation-id": getattr(exc, "query_id", None),
113+
"diagnostic-info": getattr(exc, "diagnostic_info", None),
114+
}
115+
new = cls(getattr(exc, "message", str(exc)), context)
116+
101117
for attr in (
102118
"code",
103119
"sql_state",
@@ -106,6 +122,12 @@ def reraise_kernel_error(exc: "_kernel.KernelError") -> "Error":
106122
"http_status",
107123
"retryable",
108124
"query_id",
125+
# Extended server status now forwarded across the PyO3 boundary
126+
# (kernel #121). ``getattr(..., None)`` keeps this forward-safe
127+
# against an older wheel that doesn't set these attrs.
128+
"display_message",
129+
"diagnostic_info",
130+
"error_details_json",
109131
):
110132
setattr(new, attr, getattr(exc, attr, None))
111133
return new

src/databricks/sql/backend/kernel/client.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,24 @@
7171
# per-request skip-and-warn.
7272
_KERNEL_MANAGED_HEADERS = frozenset({"authorization", "x-databricks-org-id"})
7373

74+
# Leading verbs of SQL volume/staging statements. Detected by the
75+
# leading token (case-insensitive) so the kernel backend can fail loud
76+
# on staging ops it can't service — see ``execute_command``.
77+
_STAGING_VERBS = ("PUT", "GET", "REMOVE")
78+
79+
80+
def _is_staging_statement(operation: str) -> bool:
81+
"""True iff ``operation`` is a volume/staging statement (PUT / GET /
82+
REMOVE).
83+
84+
Matches the leading token only, so a normal query that merely
85+
*contains* the word (e.g. ``SELECT 'GET' AS x``) isn't misflagged.
86+
"""
87+
stripped = operation.lstrip()
88+
# First whitespace-delimited token, uppercased.
89+
verb = stripped.split(None, 1)[0].upper() if stripped else ""
90+
return verb in _STAGING_VERBS
91+
7492

7593
# ─── Client ─────────────────────────────────────────────────────────────────
7694

@@ -172,6 +190,17 @@ def __init__(
172190
# path. Same lock as ``_async_handles``.
173191
self._closed_commands: Set[str] = set()
174192
self._async_handles_lock = threading.RLock()
193+
# Sync-execute cancellers keyed by ``id(cursor)``. A blocking
194+
# ``execute()`` sets ``cursor.active_command_id`` only AFTER it
195+
# returns, so a concurrent ``cursor.cancel()`` (the documented
196+
# cross-thread PEP-249 shape) has no command id to target while
197+
# the query runs. We register a detached kernel
198+
# ``StatementCanceller`` here just before the blocking call and
199+
# drop it after; ``cancel_running_cursor`` (invoked by
200+
# ``Cursor.cancel`` when there's no command id yet) fires it.
201+
# Guarded by its own lock — cancel can race execute teardown.
202+
self._sync_cancellers: Dict[int, Any] = {}
203+
self._sync_cancellers_lock = threading.RLock()
175204

176205
# ── Session lifecycle ──────────────────────────────────────────
177206

@@ -354,6 +383,24 @@ def execute_command(
354383
# ``_async_statements`` and closed by ``close_command``); the sync
355384
# path drops it in finally. ``close_stmt`` is the post-success
356385
# decision flag — it stays True on sync, flips to False on async.
386+
# Volume/staging (PUT/GET/REMOVE) is not supported on the kernel
387+
# path: the kernel returns the staging control row as a normal
388+
# result set (``KernelResultSet.is_staging_operation`` is always
389+
# False), so the connector's ``_handle_staging_operation`` never
390+
# fires and NO file is transferred. Rather than silently no-op
391+
# (the Thrift path performs the presigned-URL upload/download),
392+
# fail loud at the call site so ETL scripts don't ingest
393+
# stale/missing data. Detected by the leading SQL verb — the
394+
# only signal available pre-execute, since the kernel exposes no
395+
# staging marker today.
396+
if _is_staging_statement(operation):
397+
raise NotSupportedError(
398+
"Volume / staging operations (PUT / GET / REMOVE) are not "
399+
"supported on the kernel backend (use_kernel=True); the file "
400+
"transfer would silently not happen. Use the Thrift backend "
401+
"for staging operations."
402+
)
403+
357404
close_stmt = True
358405
try:
359406
try:
@@ -382,10 +429,23 @@ def execute_command(
382429
self._async_statements[command_id.guid] = stmt
383430
close_stmt = False
384431
return None
432+
# Register a detached canceller BEFORE the blocking
433+
# execute so a concurrent ``cursor.cancel()`` can reach
434+
# the running statement (its server id is populated mid-
435+
# execute). Keyed by ``id(cursor)`` since no command id
436+
# exists yet. Dropped in the finally.
437+
try:
438+
with self._sync_cancellers_lock:
439+
self._sync_cancellers[id(cursor)] = stmt.canceller()
440+
except Exception:
441+
# Canceller is best-effort; never block execute on it.
442+
pass
385443
executed = stmt.execute()
386444
except Exception as exc:
387445
raise _wrap_kernel_exception("execute_command", exc) from exc
388446
finally:
447+
with self._sync_cancellers_lock:
448+
self._sync_cancellers.pop(id(cursor), None)
389449
if close_stmt:
390450
# Sync path: ``Statement`` is a lifecycle owner separate
391451
# from the executed handle. Drop it here so the parent
@@ -422,6 +482,30 @@ def cancel_command(self, command_id: CommandId) -> None:
422482
except Exception as exc:
423483
raise _wrap_kernel_exception("cancel_command", exc) from exc
424484

485+
def cancel_running_cursor(self, cursor: "Cursor") -> bool:
486+
"""Cancel an in-flight SYNC ``execute()`` on ``cursor``.
487+
488+
Invoked by ``Cursor.cancel()`` when ``active_command_id`` is
489+
still ``None`` — i.e. a blocking ``execute()`` hasn't returned,
490+
so the command id isn't set yet but the server statement may be
491+
running. Fires the detached ``StatementCanceller`` registered in
492+
``execute_command`` before the blocking call.
493+
494+
Returns ``True`` if a canceller was found and fired (the
495+
statement was in flight), ``False`` otherwise so ``Cursor`` can
496+
emit its "no executing command" warning. Safe to call from
497+
another thread.
498+
"""
499+
with self._sync_cancellers_lock:
500+
canceller = self._sync_cancellers.get(id(cursor))
501+
if canceller is None:
502+
return False
503+
try:
504+
canceller.cancel()
505+
except Exception as exc:
506+
raise _wrap_kernel_exception("cancel_running_cursor", exc) from exc
507+
return True
508+
425509
def close_command(self, command_id: CommandId) -> None:
426510
with self._async_handles_lock:
427511
handle = self._async_handles.pop(command_id.guid, None)

src/databricks/sql/client.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1704,11 +1704,22 @@ def cancel(self) -> None:
17041704
"""
17051705
if self.active_command_id is not None:
17061706
self.backend.cancel_command(self.active_command_id)
1707-
else:
1708-
logger.warning(
1709-
"Attempting to cancel a command, but there is no "
1710-
"currently executing command"
1711-
)
1707+
return
1708+
# No command id yet. A backend whose synchronous ``execute()``
1709+
# blocks without first publishing a command id (the kernel
1710+
# backend) may still have a server statement in flight. Such a
1711+
# backend exposes ``cancel_running_cursor(cursor)`` -> bool to
1712+
# cancel it; it returns True if something was actually
1713+
# cancelled. Opt-in via getattr so the Thrift / SEA backends
1714+
# (which set ``active_command_id`` before blocking) are
1715+
# unaffected.
1716+
cancel_running_cursor = getattr(self.backend, "cancel_running_cursor", None)
1717+
if cancel_running_cursor is not None and cancel_running_cursor(self):
1718+
return
1719+
logger.warning(
1720+
"Attempting to cancel a command, but there is no "
1721+
"currently executing command"
1722+
)
17121723

17131724
def close(self) -> None:
17141725
"""Close cursor"""

tests/e2e/test_kernel_backend.py

Lines changed: 106 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import pytest
2525

2626
import databricks.sql as sql
27-
from databricks.sql.exc import DatabaseError
27+
from databricks.sql.exc import DatabaseError, NotSupportedError, ServerOperationError
2828

2929
# Skip the whole module unless the kernel wheel is genuinely installed.
3030
# ``pytest.importorskip`` alone isn't enough: the kernel unit tests inject a
@@ -361,3 +361,108 @@ def test_user_agent_entry_and_http_headers_round_trip(kernel_conn_params):
361361
with c.cursor() as cur:
362362
cur.execute("SELECT 1 AS n")
363363
assert cur.fetchall()[0][0] == 1
364+
365+
366+
# ── Parameter parity (tz-aware timestamp, scientific decimal) ──────
367+
368+
369+
def test_tz_aware_timestamp_parameter_binds(conn):
370+
"""A tz-aware datetime parameter (datetime with tzinfo) binds on
371+
the kernel path and resolves to the correct UTC instant. Previously
372+
rejected at bind on kernel; works on Thrift. (kernel #121)"""
373+
import datetime
374+
375+
tzdt = datetime.datetime(
376+
2026,
377+
5,
378+
15,
379+
18,
380+
0,
381+
0,
382+
tzinfo=datetime.timezone(datetime.timedelta(hours=5, minutes=30)),
383+
)
384+
with conn.cursor() as cur:
385+
cur.execute("SELECT ? AS ts", [tzdt])
386+
ts = cur.fetchall()[0][0]
387+
# 18:00 +05:30 == 12:30 UTC.
388+
assert (ts.hour, ts.minute) == (12, 30)
389+
390+
391+
def test_scientific_notation_decimal_parameter_binds(conn):
392+
"""A Decimal whose str() is exponential (e.g. 1E-7) binds on the
393+
kernel path. Previously rejected at bind; the server/Thrift accept
394+
scientific-notation decimal literals. (kernel #121)"""
395+
import decimal
396+
from databricks.sql.parameters.native import DecimalParameter
397+
398+
with conn.cursor() as cur:
399+
# 1E+2 == 100, round-trips cleanly at scale 0.
400+
cur.execute("SELECT ? AS d", [DecimalParameter(decimal.Decimal("1E+2"))])
401+
assert int(cur.fetchall()[0][0]) == 100
402+
403+
404+
# ── Staging / volume — fail loud, not silent no-op ────────────────
405+
406+
407+
def test_staging_put_raises_not_supported(conn):
408+
"""A PUT (volume/staging) statement fails loud on the kernel path
409+
rather than silently no-opping (which would make ETL ingest
410+
stale/missing data). (CUJ-gap audit)"""
411+
with conn.cursor() as cur:
412+
with pytest.raises(NotSupportedError, match="staging"):
413+
cur.execute("PUT '/tmp/x.csv' INTO '/Volumes/main/default/v/x.csv'")
414+
415+
416+
# ── Error fidelity — diagnostic-info reaches .context ─────────────
417+
418+
419+
def test_server_error_exposes_diagnostic_info_context(conn):
420+
"""A server-side query failure surfaces as ServerOperationError
421+
with the Spark diagnostic context in ``.context['diagnostic-info']``
422+
— Thrift parity (kernel #121 forwards diagnostic_info across pyo3;
423+
the connector populates .context)."""
424+
with conn.cursor() as cur:
425+
with pytest.raises(ServerOperationError) as exc_info:
426+
cur.execute("SELECT * FROM definitely_not_a_table_xyz_kernel_e2e")
427+
err = exc_info.value
428+
# context shape matches Thrift; diagnostic-info may be None if
429+
# the server didn't attach one, but the KEY must exist.
430+
assert "diagnostic-info" in err.context
431+
assert "operation-id" in err.context
432+
433+
434+
# ── Sync cancel (cursor.cancel() from another thread) ─────────────
435+
436+
437+
def test_sync_cancel_interrupts_blocking_execute(conn):
438+
"""cursor.cancel() from another thread cancels a long-running
439+
blocking execute() on the kernel path. Previously a silent no-op
440+
(active_command_id was None until execute returned). (kernel #121
441+
StatementCanceller + connector cancel_running_cursor wiring)"""
442+
import threading
443+
import time
444+
445+
cur = conn.cursor()
446+
447+
# The kernel publishes the server statement id once the initial
448+
# POST returns — within the server's default wait window (~10s).
449+
# Cancel after that so the canceller has an id to target; a cancel
450+
# before then is a no-op by design (id not yet known). Pick a query
451+
# that runs well past this so the cancel lands mid-flight.
452+
def cancel_after_delay():
453+
time.sleep(15.0)
454+
cur.cancel()
455+
456+
t = threading.Thread(target=cancel_after_delay)
457+
t.start()
458+
try:
459+
# Cancel should make execute() raise rather than run to
460+
# completion — proving the server-side statement was cancelled.
461+
with pytest.raises(Exception):
462+
cur.execute(
463+
"SELECT count(*) FROM range(0, 1000000000000) "
464+
"WHERE pow(rand(), 2) < 0.5 AND sqrt(id) > 1"
465+
)
466+
finally:
467+
t.join()
468+
cur.close()

tests/unit/test_client.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -371,13 +371,46 @@ def test_cancel_command_calls_the_backend(self):
371371
def test_cancel_command_will_issue_warning_for_cancel_with_no_executing_command(
372372
self, logger_instance
373373
):
374-
mock_thrift_backend = Mock()
374+
# Backends like Thrift/SEA set active_command_id before blocking
375+
# and do NOT define ``cancel_running_cursor``. Use ``spec`` so the
376+
# mock doesn't auto-advertise that opt-in hook (a bare ``Mock()``
377+
# returns a truthy Mock for any attribute).
378+
mock_thrift_backend = Mock(spec=["cancel_command"])
375379
cursor = client.Cursor(Mock(), mock_thrift_backend)
376380
cursor.cancel()
377381

378382
self.assertTrue(logger_instance.warning.called)
379383
self.assertFalse(mock_thrift_backend.cancel_command.called)
380384

385+
@patch("databricks.sql.client.logger")
386+
def test_cancel_routes_to_cancel_running_cursor_when_no_command_id(
387+
self, logger_instance
388+
):
389+
"""When there's no active_command_id (a blocking sync execute()
390+
hasn't published one), cancel() routes to the backend's opt-in
391+
``cancel_running_cursor`` hook (the kernel backend). If the hook
392+
cancels something (returns True), no warning is emitted."""
393+
backend = Mock(spec=["cancel_command", "cancel_running_cursor"])
394+
backend.cancel_running_cursor.return_value = True
395+
cursor = client.Cursor(Mock(), backend)
396+
cursor.cancel()
397+
398+
backend.cancel_running_cursor.assert_called_once_with(cursor)
399+
self.assertFalse(backend.cancel_command.called)
400+
self.assertFalse(logger_instance.warning.called)
401+
402+
@patch("databricks.sql.client.logger")
403+
def test_cancel_warns_when_hook_finds_nothing_in_flight(self, logger_instance):
404+
"""If the hook returns False (nothing was in flight), the
405+
existing 'no executing command' warning still fires."""
406+
backend = Mock(spec=["cancel_command", "cancel_running_cursor"])
407+
backend.cancel_running_cursor.return_value = False
408+
cursor = client.Cursor(Mock(), backend)
409+
cursor.cancel()
410+
411+
backend.cancel_running_cursor.assert_called_once_with(cursor)
412+
self.assertTrue(logger_instance.warning.called)
413+
381414
def test_version_is_canonical(self):
382415
version = databricks.sql.__version__
383416
canonical_version_re = (
@@ -510,7 +543,7 @@ def test_column_name_api(self):
510543

511544
expected_values = [["val1", 321, 52.32], ["val2", 2321, 252.32]]
512545

513-
for (row, expected) in zip(data, expected_values):
546+
for row, expected in zip(data, expected_values):
514547
self.assertEqual(row.first_col, expected[0])
515548
self.assertEqual(row.second_col, expected[1])
516549
self.assertEqual(row.third_col, expected[2])

0 commit comments

Comments
 (0)