Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
da46c22
Keep vendor cell_id in spatial_unlabelled as a segmentation prior
EliHei2 May 18, 2026
a36fb57
Add segger method component
EliHei2 May 18, 2026
c2a3268
Apply suggestions from code review
rcannood May 18, 2026
b9c4803
Merge remote-tracking branch 'origin/main' into segger
rcannood May 18, 2026
3bc10ee
Segger - use pytorch 25.04 instead of 26.02 as a base image (#33)
rcannood May 19, 2026
5f74b6a
Fix segger CI: replace cellpose min_size=-1 with 0 for NumPy 2.x
EliHei2 May 19, 2026
1c10f12
Filter -1 / UNASSIGNED cell_id sentinels before rasterization
EliHei2 May 19, 2026
11d203c
Align segger CLI flags with dpeerlab/segger@main
EliHei2 May 19, 2026
770ca1b
Skip cudf driver validation on GPU-less runners
EliHei2 May 19, 2026
b9ae3e5
CPU stub-and-exit for segger + restore cuspatial install
EliHei2 May 19, 2026
d8765d3
Warn early when no CUDA GPU is detected for segger
EliHei2 May 19, 2026
29d16d1
Hard-fail segger when no CUDA GPU is available
EliHei2 May 19, 2026
6e0ccd7
Merge remote-tracking branch 'origin/main' into segger
rcannood May 19, 2026
bbb5f30
bypass run test for segger
rcannood May 19, 2026
c520074
add segger to wf
rcannood May 19, 2026
8c8560a
use different label
rcannood May 19, 2026
b5da903
Merge remote-tracking branch 'origin/main' into segger
EliHei2 May 19, 2026
3c2e808
Export segger's full per-cell footprint, boundaries, and per-cell table
EliHei2 May 19, 2026
48aed73
Minimal label image for segger: pixel-paint per assigned transcript
EliHei2 May 19, 2026
4b3ddc8
Set production defaults for --init_segmentation and --cellpose_diameter
EliHei2 May 19, 2026
49b7938
Switch segger raster to cKDTree-Voronoi for filled cell masks
EliHei2 May 19, 2026
113079d
Replace segger's opencv-python with headless wheel
EliHei2 May 19, 2026
a809166
add missing packages
rcannood May 19, 2026
8bdad1f
fix config
rcannood May 19, 2026
e4d6a33
removed unneeded deps?
rcannood May 19, 2026
8782a00
Merge branch 'main' into segger
rcannood May 19, 2026
9dd0a20
revert cv2 issues
rcannood May 19, 2026
c22dc4b
Write boundary cell_id as string for segger's preprocessor
EliHei2 May 19, 2026
2baf179
Trust per-transcript cell_id prior directly; fail loud on empty signal
EliHei2 May 19, 2026
f76c8ab
Drop transcripts whose cell has no boundary polygon
EliHei2 May 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions src/methods/segger/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ engines:
- procps
- git
- libxcb1
- libgl1
- type: python
packages:
- anndata~=0.12.0
Expand Down Expand Up @@ -112,10 +113,6 @@ engines:
# need the spatialdata-loader branch — segger@main is enough.
- type: python
github: dpeerlab/segger
# - type: docker
# run: |
# pip install --no-cache-dir \
# "git+https://github.com/dpeerlab/segger.git@main#egg=segger"
# __merge__: /src/base/setup_spatialdata_partial.yaml
- type: native

Expand Down
90 changes: 83 additions & 7 deletions src/methods/segger/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,13 +150,19 @@ def _rasterize_polygons(

def _shapes_to_xenium_vertices(shapes_gdf: gpd.GeoDataFrame) -> pl.DataFrame:
"""Xenium boundary parquet schema: one row per polygon vertex with
columns (cell_id, vertex_x, vertex_y)."""
columns (cell_id, vertex_x, vertex_y).

`cell_id` is written as a string to match the native Xenium schema —
segger's preprocessor composes a per-row index like
`cell_id + '_' + boundary_type` and crashes with a numpy `add` ufunc
error if `cell_id` arrives as int64."""
rows = []
for cid, geom in zip(shapes_gdf["cell_id"].to_numpy(), shapes_gdf.geometry.to_numpy()):
if geom is None or geom.is_empty or geom.geom_type != "Polygon":
continue
cid_str = str(cid)
for vx, vy in np.asarray(geom.exterior.coords):
rows.append((int(cid), float(vx), float(vy)))
rows.append((cid_str, float(vx), float(vy)))
return pl.DataFrame(rows, schema=["cell_id", "vertex_x", "vertex_y"], orient="row")


Expand All @@ -179,16 +185,39 @@ def _build_xenium_layout(
initial_labels: np.ndarray,
image_element,
out_dir: Path,
tx_cell_id_override: np.ndarray | None = None,
) -> Path:
"""Materialize a directory that segger's `10x_xenium` preprocessor can
consume: `transcripts.parquet` + matching `cell_boundaries.parquet` /
`nucleus_boundaries.parquet`. We don't have a real cell vs nucleus split,
so the same polygon set is written to both — segger then drives the
prediction graph off whichever one `--prediction-mode` selects."""
prediction graph off whichever one `--prediction-mode` selects.

`tx_cell_id_override` is the per-transcript training assignment as a
string array ("UNASSIGNED" sentinel for unknown). When supplied (the
`transcript_cell_id` init mode, which carries a vendor cell_id prior
on the transcripts) we skip the rasterize-then-pixel-lookup roundtrip
and trust the prior directly — otherwise tiny convex hulls would
rasterize to zero pixels and silently mark every transcript as
UNASSIGNED, which made segger's anndata builder explode on empty input.
When omitted (the Cellpose path) we derive assignments by looking
each transcript up in the rasterized nucleus mask as before."""
out_dir.mkdir(parents=True, exist_ok=True)

tx_init = _initial_cell_id_per_transcript(tx_pd, initial_labels, image_element)
cell_id_str = np.where(tx_init > 0, tx_init.astype(str), "UNASSIGNED")
if tx_cell_id_override is not None:
cell_id_str = np.asarray(tx_cell_id_override, dtype=object)
cell_id_str = np.where(
cell_id_str == None, # noqa: E711 — np.where needs ==, not `is`
"UNASSIGNED",
cell_id_str.astype(str),
)
overlaps_nucleus = (cell_id_str != "UNASSIGNED").astype(np.int8)
else:
tx_init = _initial_cell_id_per_transcript(
tx_pd, initial_labels, image_element
)
cell_id_str = np.where(tx_init > 0, tx_init.astype(str), "UNASSIGNED")
overlaps_nucleus = (tx_init > 0).astype(np.int8)

tx_out = pd.DataFrame({
"transcript_id": (
Expand All @@ -208,7 +237,7 @@ def _build_xenium_layout(
# Treat the entire mask as nucleus (overlaps_nucleus=1); transcripts
# off the mask get 0. segger only uses this when prediction_mode
# distinguishes cell vs nucleus.
"overlaps_nucleus": (tx_init > 0).astype(np.int8),
"overlaps_nucleus": overlaps_nucleus,
})
if "z" in tx_pd.columns:
tx_out.insert(3, "z_location", tx_pd["z"].to_numpy().astype(np.float32))
Expand Down Expand Up @@ -340,9 +369,28 @@ def _rasterize_via_transcript_voronoi(

initial_labels: np.ndarray
shapes_gdf: gpd.GeoDataFrame
tx_cell_id_override: np.ndarray | None = None
if mode == "transcript_cell_id":
shapes_gdf = _polygons_from_cell_ids(tx_pd)
initial_labels = _rasterize_polygons(shapes_gdf, image_el, label_col="cell_id")
# Trust the input's per-transcript cell_id prior directly for segger's
# training signal — converting via rasterize-then-pixel-lookup loses
# transcripts whose cell's convex hull rasterizes to zero pixels.
# Critically: a transcript's cell_id is only valid if that cell ALSO
# has a boundary polygon in shapes_gdf. _polygons_from_cell_ids skips
# cells with < 3 transcripts (can't make a hull), and segger's
# setup_anndata left-joins obs against boundaries by cell_id — any
# cell present in transcripts but absent from boundaries produces
# NaN obs index and trips the `assert ~obs.index.isna().any()`.
cell_ids_with_boundary = set(shapes_gdf["cell_id"].astype(str).to_numpy())
valid = _valid_cell_id_mask(tx_pd["cell_id"])
tx_cid_str = tx_pd["cell_id"].astype(str).to_numpy()
has_boundary = np.array([c in cell_ids_with_boundary for c in tx_cid_str])
tx_cell_id_override = np.where(
valid.to_numpy() & has_boundary,
tx_cid_str,
"UNASSIGNED",
)
elif mode == "cellpose":
initial_labels, shapes_gdf = _polygons_from_cellpose(image_el, par["cellpose_diameter"])
else:
Expand All @@ -356,7 +404,35 @@ def _rasterize_via_transcript_voronoi(
# initial mask id as `cell_id` (segger's training signal), and the polygons
# are written to both cell_boundaries.parquet and nucleus_boundaries.parquet
# since segger expects both Xenium files to exist.
_build_xenium_layout(tx_pd, shapes_gdf, initial_labels, image_el, xenium_dir)
_build_xenium_layout(
tx_pd,
shapes_gdf,
initial_labels,
image_el,
xenium_dir,
tx_cell_id_override=tx_cell_id_override,
)

# Fail loudly with diagnostics if no transcript carries a valid cell
# assignment — segger's anndata builder will crash with an opaque
# `cannot infer dimensions from zero sized index arrays` otherwise.
_tx_written = pd.read_parquet(xenium_dir / "transcripts.parquet", columns=["cell_id"])
_n_assigned = (_tx_written["cell_id"].astype(str) != "UNASSIGNED").sum()
_n_total = len(_tx_written)
_n_cells = _tx_written.loc[
_tx_written["cell_id"].astype(str) != "UNASSIGNED", "cell_id"
].nunique()
print(
f"transcripts.parquet: {_n_assigned}/{_n_total} assigned across "
f"{_n_cells} cells",
flush=True,
)
if _n_assigned == 0 or _n_cells == 0:
raise RuntimeError(
"No transcripts carry a valid cell_id assignment — segger has no "
"training signal. Check that the input transcripts have a usable "
"cell_id prior, or rerun with --init_segmentation cellpose."
)
seg_pq = _run_segger(xenium_dir, segger_out_dir)
seg = pl.read_parquet(seg_pq)
print(f"segger emitted {seg.height} rows", flush=True)
Expand Down
Loading