From da46c222df90102fe1fa66b655063434f374284c Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 18 May 2026 12:54:19 +0200 Subject: [PATCH 01/26] Keep vendor cell_id in spatial_unlabelled as a segmentation prior The raw transcripts carry a vendor (e.g. Xenium morphology) `cell_id` that segmentation methods condition on as a prior. The previous process_dataset stripped it alongside `nucleus_id`/`cell_type`, leaving methods with no prior signal. Stop stripping `cell_id` so it flows through to spatial_unlabelled, and declare it as an optional integer column on the transcripts in src/api/file_spatial_unlabelled.yaml. The held-out ground truth used for evaluation remains in spatial_solution. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 2 +- src/api/file_spatial_unlabelled.yaml | 8 ++++++++ src/data_processors/process_dataset/script.py | 7 +++++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7158473..ad453c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ ## NEW FUNCTIONALITY -* ... +* Preserve the vendor `cell_id` on transcripts in `spatial_unlabelled` as an optional segmentation prior, instead of stripping it together with the held-out ground-truth columns. Declared as an optional integer column in `src/api/file_spatial_unlabelled.yaml`. ## MAJOR CHANGES diff --git a/src/api/file_spatial_unlabelled.yaml b/src/api/file_spatial_unlabelled.yaml index 3d86b0f..278e953 100644 --- a/src/api/file_spatial_unlabelled.yaml +++ b/src/api/file_spatial_unlabelled.yaml @@ -47,6 +47,14 @@ info: name: overlaps_nucleus required: false description: Whether the point overlaps with the nucleus (derived from morphology) + - type: integer + name: cell_id + required: false + description: | + Vendor-provided cell assignment from the raw data, exposed as a + segmentation prior. This is NOT the ground truth used for + evaluation (which is held out in spatial_solution); methods may + freely condition on it. tables: - type: anndata name: "table" diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index e1174a2..84daf74 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -92,8 +92,11 @@ def sc_processing(adata): # --------------------------------------------------------------- print(">> Building spatial dataset for methods (no ground truth)", flush=True) -# Strip columns that reveal ground truth cell assignments from transcripts -_GROUND_TRUTH_COLS = {"cell_id", "nucleus_id", "cell_type"} +# Strip ground-truth-revealing columns, but keep the vendor `cell_id` as a +# segmentation prior — most methods (e.g. segger) condition on the vendor's +# morphology-based assignment without treating it as ground truth. The held-out +# ground truth used for evaluation lives in spatial_solution, not here. +_GROUND_TRUTH_COLS = {"nucleus_id", "cell_type"} transcripts = sp_data.points["transcripts"] clean_transcript_cols = [c for c in transcripts.columns if c not in _GROUND_TRUTH_COLS] clean_transcripts = transcripts[clean_transcript_cols] From a36fb57404f3d4fed8690e03d3320f0ea3389733 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Mon, 18 May 2026 13:13:59 +0200 Subject: [PATCH 02/26] Add segger method component MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wraps segger (graph neural network transcript-to-cell segmentation) as a Viash method that consumes the standard spatial_unlabelled SpatialData zarr and produces a Labels2DModel prediction. Initial boundary set is taken from the transcripts' `cell_id` prior (the vendor morphology assignment that `process_dataset` now keeps in spatial_unlabelled) when present and falls back to Cellpose on `morphology_mip` otherwise. The initial polygons plus the original transcripts are materialized as a Xenium-style directory (transcripts.parquet + cell_boundaries.parquet + nucleus_boundaries.parquet) so segger's stock `10x_xenium` preprocessor drives the prediction graph — no SpatialData-loader branch of segger required. `segger segment` is run from `dpeerlab/segger@main`. The per-transcript segger_cell_id assignments are projected back onto the initial mask by majority vote and rasterized as the final segmentation. Docker engine pins the dependency stack that segger is known to work with on the openproblems base image: - PyG wheels matched to torch 2.5+cu121 - RAPIDS 24.10 (last release binary-compatible with the base image's CUDA 12.1 runtime) - Cellpose for the nucleus pre-segmentation Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/config.vsh.yaml | 129 +++++++++++ src/methods/segger/script.py | 347 +++++++++++++++++++++++++++++ 2 files changed, 476 insertions(+) create mode 100644 src/methods/segger/config.vsh.yaml create mode 100644 src/methods/segger/script.py diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml new file mode 100644 index 0000000..f28d0f0 --- /dev/null +++ b/src/methods/segger/config.vsh.yaml @@ -0,0 +1,129 @@ +__merge__: /src/api/comp_method.yaml + +name: segger + +label: "Segger" +summary: Assigns transcripts to cells via a heterogeneous graph neural network with link prediction between transcripts and morphology-derived boundary nodes. +description: | + Segger frames spatial transcriptomics cell segmentation as link prediction on a heterogeneous graph + whose nodes are individual transcripts and morphology-derived cell or nucleus boundaries, and whose + edges connect transcripts to each other within a small radius (capturing local co-occurrence) and + transcripts to boundaries when a transcript falls inside a scaled polygon. A graph neural network + is trained with a contrastive objective so that each transcript embedding is pulled toward the + embedding of its true boundary and pushed away from neighboring ones. At inference time, every + transcript is reassigned to the boundary it most resembles in embedding space, subject to a + similarity threshold that is either set globally or learned per gene. Because the model operates + on the transcript graph rather than on raster images, it refines the initial nucleus segmentation + with information about which transcripts genuinely belong to which cell, and can rescue transcripts + that the initial boundary segmentation missed. In this Viash component, an initial boundary + set is taken from the vendor `cell_id` prior on the transcripts whenever it is present and + falls back to a Cellpose pass on `morphology_mip` when it is not; the boundaries plus the + transcripts are written to a Xenium-layout directory and handed to `segger segment`. The + per-transcript reassignments are then projected back onto the initial nucleus masks by + majority vote and rasterized as the predicted label image. +links: + documentation: "https://github.com/dpeerlab/segger" + repository: "https://github.com/dpeerlab/segger" +references: + doi: "10.1101/2025.03.14.643160" + +arguments: + - name: --init_segmentation + type: string + choices: [auto, cellpose, transcript_cell_id] + description: | + Source of the initial boundary node set that segger trains against. + 'auto' uses the transcripts' `cell_id` prior column when it is + present and runs Cellpose on `morphology_mip` otherwise. + 'cellpose' always runs Cellpose. 'transcript_cell_id' requires a + `cell_id` column on the transcripts and builds one convex-hull + polygon per cell id. + info: + test_default: auto + - name: --n_epochs + type: integer + description: "Number of training epochs for the GNN." + info: + test_default: 1 + - name: --prediction_scale_factor + type: double + description: "Multiplier applied to nucleus polygons when forming transcript-to-boundary edges at prediction time. Values >1 expand the polygon to capture peripheral transcripts." + info: + test_default: 2.2 + - name: --prediction_mode + type: string + choices: [nucleus, cell, uniform] + description: "Which polygon set drives the prediction graph." + info: + test_default: nucleus + - name: --min_similarity + type: double + description: "Fixed similarity threshold in [0,1] for keeping a transcript-boundary assignment. If unset, segger uses per-gene Li+Yen thresholds." + info: + test_default: 0.0 + - name: --fragment_mode + type: boolean_true + description: "Run fragment mode to group otherwise-unassigned transcripts into connected components." + - name: --cellpose_diameter + type: double + description: "Cell diameter (pixels) for the Cellpose nucleus pre-segmentation." + info: + test_default: 30 + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1 + setup: + # PyG extensions matching the base image's PyTorch 2.5 + CUDA 12.1 stack + # (per segger/environment_cuda121.yml on the integration/peerlab branch). + - type: docker + run: | + pip install --no-cache-dir \ + --find-links https://data.pyg.org/whl/torch-2.5.0+cu121.html \ + torch-scatter torch-sparse torch-geometric lightning + # RAPIDS 24.10 — the last release that is binary-compatible with the + # CUDA 12.1 runtime that the base image ships. Newer 24.12/25.x wheels + # require CUDA 12.5+ and segfault on this image. Optional at runtime + # (segger has CPU fallbacks) but required for the prediction graph + # KD-tree to run on GPU at realistic dataset sizes. + - type: docker + run: | + pip install --no-cache-dir --extra-index-url=https://pypi.nvidia.com \ + "cuspatial-cu12==24.10.*" "cudf-cu12==24.10.*" \ + "cuml-cu12==24.10.*" "cugraph-cu12==24.10.*" \ + cupy-cuda12x + # Cellpose (for the nucleus pre-segmentation) and the geometry stack + # used to convert masks <-> polygons and to rasterize the final labels. + - type: python + pypi: + - cellpose + - rasterio>=1.3 + - geopandas + - shapely + - scikit-image + # Pre-fetch the Cellpose model weights so runtime doesn't hit the + # network inside the Nextflow worker. + - type: python + script: | + from cellpose.models import CellposeModel + CellposeModel() + # Segger trunk. The component synthesizes a Xenium-layout directory + # (transcripts.parquet + cell/nucleus_boundaries.parquet) from the + # SpatialData input and hands it to `segger segment`, so we do NOT + # need the spatialdata-loader branch — segger@main is enough. + - type: docker + run: | + pip install --no-cache-dir \ + "git+https://github.com/dpeerlab/segger.git@main#egg=segger" + __merge__: /src/base/setup_spatialdata_partial.yaml + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [hightime, midcpu, veryhighmem, gpu] diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py new file mode 100644 index 0000000..b447d96 --- /dev/null +++ b/src/methods/segger/script.py @@ -0,0 +1,347 @@ +import os +import shutil +import subprocess +from collections import Counter +from pathlib import Path + +import anndata as ad +import geopandas as gpd +import numpy as np +import pandas as pd +import polars as pl +import spatialdata as sd +import torch +import xarray as xr +from shapely.geometry import MultiPoint, Polygon +from spatialdata.models import Labels2DModel +from spatialdata.transformations import get_transformation + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +print("Using device:", device, flush=True) + +## VIASH START +par = { + "input": "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr", + "output": "prediction.zarr", + "init_segmentation": "auto", + "n_epochs": 1, + "prediction_scale_factor": 2.2, + "prediction_mode": "nucleus", + "min_similarity": 0.0, + "fragment_mode": False, + "cellpose_diameter": 30.0, +} +meta = {"name": "segger", "temp_dir": "/tmp"} +## VIASH END + + +def _to_lower_uint(arr: np.ndarray) -> np.ndarray: + m = int(arr.max()) if arr.size else 0 + for dtype in (np.uint8, np.uint16, np.uint32, np.uint64): + if m <= np.iinfo(dtype).max: + return arr.astype(dtype) + return arr.astype(np.uint64) + + +def _affine_global_to_pixel(image_element) -> np.ndarray: + """3x3 homogeneous matrix mapping (x_global, y_global, 1) -> (x_pixel, y_pixel, 1).""" + t = get_transformation(image_element, "global") + M = np.asarray( + t.to_affine_matrix(input_axes=("x", "y"), output_axes=("x", "y")) + ) + return np.linalg.inv(M) + + +def _apply_affine(M: np.ndarray, xy: np.ndarray) -> np.ndarray: + homog = np.column_stack([xy, np.ones(len(xy))]) + return (homog @ M.T)[:, :2] + + +def _polygons_from_cell_ids(tx_pd: pd.DataFrame) -> gpd.GeoDataFrame: + """Convex hull per prior cell id value, in global coordinates.""" + if "cell_id" not in tx_pd.columns: + raise ValueError("transcripts table has no `cell_id` column") + records = [] + valid = tx_pd[tx_pd["cell_id"].notna() & (tx_pd["cell_id"].astype(str) != "")] + for cid, group in valid.groupby("cell_id"): + if len(group) < 3: + continue + pts = group[["x", "y"]].to_numpy() + hull = MultiPoint(pts).convex_hull + if hull.geom_type != "Polygon": + continue + records.append({"cell_id": cid, "geometry": hull}) + if not records: + raise ValueError("No usable cell_id groups (need >=3 transcripts each)") + return gpd.GeoDataFrame(records, geometry="geometry") + + +def _polygons_from_cellpose(image_element, diameter: float) -> tuple[np.ndarray, gpd.GeoDataFrame]: + """Run Cellpose on the morphology image and polygonize the masks. + + Returns the pixel-space label image and a GeoDataFrame of polygons in + global coordinates (so segger consumes them through the SpatialData + affine, just like Xenium's native nucleus_boundaries). + """ + from cellpose.models import CellposeModel + from rasterio.features import shapes as rio_shapes + + arr = image_element.compute().to_numpy() + if arr.ndim == 3: + arr = arr[0] + print(f"Cellpose input image shape: {arr.shape}", flush=True) + model = CellposeModel(gpu=torch.cuda.is_available()) + masks, _, _ = model.eval(arr, diameter=diameter, niter=10, flow_threshold=0, min_size=-1, resample=False) + masks = masks.astype(np.int32) + + # Polygonize. shapes() yields (geom_dict, label) for connected regions. + M_px_to_global = np.linalg.inv(_affine_global_to_pixel(image_element)) + records = [] + for geom_dict, label in rio_shapes(masks, mask=masks > 0): + coords_px = np.asarray(geom_dict["coordinates"][0]) # exterior ring + if len(coords_px) < 4: + continue + coords_global = _apply_affine(M_px_to_global, coords_px) + poly = Polygon(coords_global) + if not poly.is_valid or poly.area == 0: + continue + records.append({"cell_id": int(label), "geometry": poly}) + if not records: + raise RuntimeError("Cellpose produced no usable masks") + gdf = gpd.GeoDataFrame(records, geometry="geometry") + return masks, gdf + + +def _rasterize_polygons( + gdf: gpd.GeoDataFrame, image_element, label_col: str = "cell_id" +) -> np.ndarray: + """Rasterize global-coord polygons onto the morphology image grid.""" + from skimage.draw import polygon as draw_polygon + + H, W = image_element.shape[-2:] + M_g2p = _affine_global_to_pixel(image_element) + labels = np.zeros((H, W), dtype=np.uint32) + for cid, geom in zip(gdf[label_col].to_numpy(), gdf.geometry.to_numpy()): + if geom is None or geom.is_empty or geom.geom_type != "Polygon": + continue + ring = np.asarray(geom.exterior.coords) + pix = _apply_affine(M_g2p, ring) + rr, cc = draw_polygon(pix[:, 1], pix[:, 0], shape=labels.shape) + labels[rr, cc] = int(cid) + return labels + + +def _shapes_to_xenium_vertices(shapes_gdf: gpd.GeoDataFrame) -> pl.DataFrame: + """Xenium boundary parquet schema: one row per polygon vertex with + columns (cell_id, vertex_x, vertex_y).""" + rows = [] + for cid, geom in zip(shapes_gdf["cell_id"].to_numpy(), shapes_gdf.geometry.to_numpy()): + if geom is None or geom.is_empty or geom.geom_type != "Polygon": + continue + for vx, vy in np.asarray(geom.exterior.coords): + rows.append((int(cid), float(vx), float(vy))) + return pl.DataFrame(rows, schema=["cell_id", "vertex_x", "vertex_y"], orient="row") + + +def _initial_cell_id_per_transcript( + tx_pd: pd.DataFrame, initial_labels: np.ndarray, image_element +) -> np.ndarray: + """For each transcript, return the integer id of the initial mask region + it lands in, or 0 (segger's UNASSIGNED) when it falls on background.""" + H, W = initial_labels.shape + M = _affine_global_to_pixel(image_element) + xy_px = _apply_affine(M, tx_pd[["x", "y"]].to_numpy()) + ys = np.clip(np.round(xy_px[:, 1]).astype(int), 0, H - 1) + xs = np.clip(np.round(xy_px[:, 0]).astype(int), 0, W - 1) + return initial_labels[ys, xs].astype(np.int64) + + +def _build_xenium_layout( + tx_pd: pd.DataFrame, + shapes_gdf: gpd.GeoDataFrame, + initial_labels: np.ndarray, + image_element, + out_dir: Path, +) -> Path: + """Materialize a directory that segger's `10x_xenium` preprocessor can + consume: `transcripts.parquet` + matching `cell_boundaries.parquet` / + `nucleus_boundaries.parquet`. We don't have a real cell vs nucleus split, + so the same polygon set is written to both — segger then drives the + prediction graph off whichever one `--prediction-mode` selects.""" + out_dir.mkdir(parents=True, exist_ok=True) + + tx_init = _initial_cell_id_per_transcript(tx_pd, initial_labels, image_element) + cell_id_str = np.where(tx_init > 0, tx_init.astype(str), "UNASSIGNED") + + tx_out = pd.DataFrame({ + "transcript_id": ( + tx_pd["transcript_id"].to_numpy() + if "transcript_id" in tx_pd.columns + else np.arange(len(tx_pd), dtype=np.int64) + ), + "x_location": tx_pd["x"].to_numpy().astype(np.float32), + "y_location": tx_pd["y"].to_numpy().astype(np.float32), + "feature_name": tx_pd["feature_name"].astype(str).to_numpy(), + "cell_id": cell_id_str, + "qv": ( + tx_pd["qv"].to_numpy().astype(np.float32) + if "qv" in tx_pd.columns + else np.full(len(tx_pd), 40.0, dtype=np.float32) + ), + # Treat the entire mask as nucleus (overlaps_nucleus=1); transcripts + # off the mask get 0. segger only uses this when prediction_mode + # distinguishes cell vs nucleus. + "overlaps_nucleus": (tx_init > 0).astype(np.int8), + }) + if "z" in tx_pd.columns: + tx_out.insert(3, "z_location", tx_pd["z"].to_numpy().astype(np.float32)) + tx_out.to_parquet(out_dir / "transcripts.parquet", index=False) + + verts = _shapes_to_xenium_vertices(shapes_gdf) + verts.write_parquet(out_dir / "nucleus_boundaries.parquet") + verts.write_parquet(out_dir / "cell_boundaries.parquet") + return out_dir + + +def _run_segger(xenium_dir: Path, output_dir: Path) -> Path: + output_dir.mkdir(parents=True, exist_ok=True) + cmd = [ + "segger", "segment", + "-i", str(xenium_dir), + "-o", str(output_dir), + "--n-epochs", str(par["n_epochs"]), + "--prediction-scale-factor", str(par["prediction_scale_factor"]), + "--prediction-mode", par["prediction_mode"], + ] + if par.get("min_similarity") is not None and par["min_similarity"] > 0: + cmd += ["--min-similarity", str(par["min_similarity"])] + if par.get("fragment_mode"): + cmd += ["--fragment-mode"] + print("Running segger:", " ".join(cmd), flush=True) + subprocess.run(cmd, check=True) + pq = output_dir / "segger_segmentation.parquet" + if not pq.exists(): + raise RuntimeError(f"Expected segger output not found: {pq}") + return pq + + +def _relabel_initial_mask( + initial_labels: np.ndarray, + tx_pixel_xy: np.ndarray, + tx_segger_cell: np.ndarray, +) -> np.ndarray: + """For each label in `initial_labels`, replace it by the majority segger + cell id of the transcripts that fall on top of it. Pixels whose initial + label has no covering transcripts are left at 0 (background).""" + H, W = initial_labels.shape + ys = np.clip(np.round(tx_pixel_xy[:, 1]).astype(int), 0, H - 1) + xs = np.clip(np.round(tx_pixel_xy[:, 0]).astype(int), 0, W - 1) + tx_init = initial_labels[ys, xs] + relabel = {0: 0} + for init_id in np.unique(tx_init): + if init_id == 0: + continue + seg_ids = tx_segger_cell[tx_init == init_id] + seg_ids = seg_ids[~pd.isna(seg_ids)] + if seg_ids.size == 0: + continue + most = Counter(seg_ids.astype(np.int64).tolist()).most_common(1)[0][0] + relabel[int(init_id)] = int(most) + out = np.zeros_like(initial_labels, dtype=np.int64) + for k, v in relabel.items(): + out[initial_labels == k] = v + return out + + +# ----------------------------- main -------------------------------- # + +input_path = Path(par["input"]) +output_path = Path(par["output"]) +work_root = Path(meta.get("temp_dir") or "/tmp") / f"segger_{os.getpid()}" +work_root.mkdir(parents=True, exist_ok=True) +xenium_dir = work_root / "xenium_input" +segger_out_dir = work_root / "segger_output" + +print(f"Reading input: {input_path}", flush=True) +sdata = sd.read_zarr(str(input_path)) +image_el = sdata["morphology_mip"]["scale0"].image +image_transform = image_el.transform.copy() + +# Bring transcripts into a pandas frame once (we need them again for relabeling). +tx_pd = sdata.points["transcripts"].compute().reset_index(drop=True) + +# --- Step 1: produce initial boundaries --- +# Use the vendor `cell_id` prior on the transcripts when present; otherwise +# run Cellpose on the morphology image. +has_prior = "cell_id" in tx_pd.columns and tx_pd["cell_id"].notna().any() +mode = par["init_segmentation"] +if mode == "auto": + mode = "transcript_cell_id" if has_prior else "cellpose" +print(f"Init segmentation mode: {mode}", flush=True) + +initial_labels: np.ndarray +shapes_gdf: gpd.GeoDataFrame +if mode == "transcript_cell_id": + shapes_gdf = _polygons_from_cell_ids(tx_pd) + initial_labels = _rasterize_polygons(shapes_gdf, image_el, label_col="cell_id") +elif mode == "cellpose": + initial_labels, shapes_gdf = _polygons_from_cellpose(image_el, par["cellpose_diameter"]) +else: + raise ValueError(f"Unknown init_segmentation mode: {mode}") +print(f"Initial segmentation: {len(shapes_gdf)} polygons", flush=True) + +# --- Step 2: materialize a Xenium-layout dir and run segger --- +# segger@main only knows how to consume Xenium/Merscope/CosMx directories, +# so we synthesize a minimal Xenium dataset from the SpatialData input and +# our just-computed nucleus polygons. transcripts.parquet carries the +# initial mask id as `cell_id` (segger's training signal), and the polygons +# are written to both cell_boundaries.parquet and nucleus_boundaries.parquet +# since segger expects both Xenium files to exist. +_build_xenium_layout(tx_pd, shapes_gdf, initial_labels, image_el, xenium_dir) +seg_pq = _run_segger(xenium_dir, segger_out_dir) +seg = pl.read_parquet(seg_pq) +print(f"segger emitted {seg.height} rows", flush=True) + +# Keep only confident assignments +seg = seg.filter(pl.col("keep") & pl.col("segger_cell_id").is_not_null()) +print(f"kept assignments: {seg.height}", flush=True) +if seg.height == 0: + print("WARNING: segger kept no transcript assignments — falling back to the initial mask.", flush=True) + final_labels = initial_labels.astype(np.int64) +else: + row_idx = seg["row_index"].to_numpy() + segger_cell = seg["segger_cell_id"].to_numpy() + xy_global = tx_pd.loc[row_idx, ["x", "y"]].to_numpy() + xy_pixel = _apply_affine(_affine_global_to_pixel(image_el), xy_global) + final_labels = _relabel_initial_mask(initial_labels, xy_pixel, segger_cell) + +final_labels = _to_lower_uint(final_labels) + +# --- Step 3: write the SpatialData prediction --- +sd_out = sd.SpatialData( + labels={ + "segmentation": Labels2DModel.parse( + xr.DataArray(final_labels, name="segmentation", dims=("y", "x")), + transformations=image_transform, + ), + }, + tables={ + "table": ad.AnnData( + uns={ + "dataset_id": sdata.tables["table"].uns["dataset_id"], + "method_id": meta["name"], + } + ), + }, +) + +print(f"Saving output: {output_path}", flush=True) +if output_path.exists(): + shutil.rmtree(output_path) +sd_out.write(str(output_path)) + +# best-effort cleanup so the worker doesn't accumulate state across runs +try: + shutil.rmtree(work_root) +except OSError as e: + print(f"(non-fatal) cleanup of {work_root} failed: {e}", flush=True) From c2a3268a80913123e20cd8c406c79e06fa6d1275 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Mon, 18 May 2026 13:57:36 +0200 Subject: [PATCH 03/26] Apply suggestions from code review Co-authored-by: Robrecht Cannoodt --- src/methods/segger/config.vsh.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml index f28d0f0..cf30cf0 100644 --- a/src/methods/segger/config.vsh.yaml +++ b/src/methods/segger/config.vsh.yaml @@ -108,9 +108,7 @@ engines: # Pre-fetch the Cellpose model weights so runtime doesn't hit the # network inside the Nextflow worker. - type: python - script: | - from cellpose.models import CellposeModel - CellposeModel() + script: "from cellpose.models import CellposeModel; CellposeModel()" # Segger trunk. The component synthesizes a Xenium-layout directory # (transcripts.parquet + cell/nucleus_boundaries.parquet) from the # SpatialData input and hands it to `segger segment`, so we do NOT From 3bc10eea4c28aaa65adc71e424d623a07f4a4062 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 May 2026 09:15:21 +0200 Subject: [PATCH 04/26] Segger - use pytorch 25.04 instead of 26.02 as a base image (#33) * use pytorch 25.04 instead of 26.02 as a base image Signed-off-by: Robrecht Cannoodt * try with 25.06 Signed-off-by: Robrecht Cannoodt * update image location Signed-off-by: Robrecht Cannoodt * print format Signed-off-by: Robrecht Cannoodt --------- Signed-off-by: Robrecht Cannoodt --- src/methods/segger/config.vsh.yaml | 51 +++++++++++++++--------------- src/methods/segger/script.py | 3 +- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml index cf30cf0..0ed7c4d 100644 --- a/src/methods/segger/config.vsh.yaml +++ b/src/methods/segger/config.vsh.yaml @@ -76,26 +76,23 @@ resources: engines: - type: docker - image: openproblems/base_pytorch_nvidia:1 + # copied from https://github.com/openproblems-bio/core/blob/main/base_images/src/pytorch_nvidia/config.vsh.yaml + image: nvcr.io/nvidia/pytorch:25.06-py3 setup: - # PyG extensions matching the base image's PyTorch 2.5 + CUDA 12.1 stack - # (per segger/environment_cuda121.yml on the integration/peerlab branch). - - type: docker - run: | - pip install --no-cache-dir \ - --find-links https://data.pyg.org/whl/torch-2.5.0+cu121.html \ - torch-scatter torch-sparse torch-geometric lightning - # RAPIDS 24.10 — the last release that is binary-compatible with the - # CUDA 12.1 runtime that the base image ships. Newer 24.12/25.x wheels - # require CUDA 12.5+ and segfault on this image. Optional at runtime - # (segger has CPU fallbacks) but required for the prediction graph - # KD-tree to run on GPU at realistic dataset sizes. - - type: docker - run: | - pip install --no-cache-dir --extra-index-url=https://pypi.nvidia.com \ - "cuspatial-cu12==24.10.*" "cudf-cu12==24.10.*" \ - "cuml-cu12==24.10.*" "cugraph-cu12==24.10.*" \ - cupy-cuda12x + - type: apt + packages: + - procps + - git + - type: python + packages: + - anndata~=0.12.0 + - scanpy~=1.12 + - requests + - jsonschema + - spatialdata>=0.7.3 + - zarr>=3.0.0 + github: + - openproblems-bio/core#subdirectory=packages/python/openproblems # Cellpose (for the nucleus pre-segmentation) and the geometry stack # used to convert masks <-> polygons and to rasterize the final labels. - type: python @@ -107,17 +104,19 @@ engines: - scikit-image # Pre-fetch the Cellpose model weights so runtime doesn't hit the # network inside the Nextflow worker. - - type: python - script: "from cellpose.models import CellposeModel; CellposeModel()" + # - type: python + # script: "from cellpose.models import CellposeModel; CellposeModel()" # Segger trunk. The component synthesizes a Xenium-layout directory # (transcripts.parquet + cell/nucleus_boundaries.parquet) from the # SpatialData input and hands it to `segger segment`, so we do NOT # need the spatialdata-loader branch — segger@main is enough. - - type: docker - run: | - pip install --no-cache-dir \ - "git+https://github.com/dpeerlab/segger.git@main#egg=segger" - __merge__: /src/base/setup_spatialdata_partial.yaml + - type: python + github: dpeerlab/segger + # - type: docker + # run: | + # pip install --no-cache-dir \ + # "git+https://github.com/dpeerlab/segger.git@main#egg=segger" + # __merge__: /src/base/setup_spatialdata_partial.yaml - type: native runners: diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py index b447d96..be5b5eb 100644 --- a/src/methods/segger/script.py +++ b/src/methods/segger/script.py @@ -264,7 +264,8 @@ def _relabel_initial_mask( print(f"Reading input: {input_path}", flush=True) sdata = sd.read_zarr(str(input_path)) -image_el = sdata["morphology_mip"]["scale0"].image +print("Input: ", sdata, flush=True) +image_el = sdata["image"]["scale0"].image image_transform = image_el.transform.copy() # Bring transcripts into a pandas frame once (we need them again for relabeling). From 5f74b6ac7f5483f5177f1c682cae632605caa14c Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 09:20:27 +0200 Subject: [PATCH 05/26] Fix segger CI: replace cellpose min_size=-1 with 0 for NumPy 2.x The new base image (nvcr.io/nvidia/pytorch:25.06-py3 from PR #33) ships NumPy 2.x, which no longer silently overflows -1 into uint32. Cellpose internally compares mask sizes (a uint32 array) against min_size, so passing -1 now raises `OverflowError: Python integer -1 out of bounds for uint32`. Modern cellpose treats min_size=0 as "keep all masks", matching the original intent. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py index be5b5eb..9a40b95 100644 --- a/src/methods/segger/script.py +++ b/src/methods/segger/script.py @@ -91,7 +91,7 @@ def _polygons_from_cellpose(image_element, diameter: float) -> tuple[np.ndarray, arr = arr[0] print(f"Cellpose input image shape: {arr.shape}", flush=True) model = CellposeModel(gpu=torch.cuda.is_available()) - masks, _, _ = model.eval(arr, diameter=diameter, niter=10, flow_threshold=0, min_size=-1, resample=False) + masks, _, _ = model.eval(arr, diameter=diameter, niter=10, flow_threshold=0, min_size=0, resample=False) masks = masks.astype(np.int32) # Polygonize. shapes() yields (geom_dict, label) for connected regions. From 1c10f123daeb00359398e7d33a825f638a7a91f8 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 09:28:39 +0200 Subject: [PATCH 06/26] Filter -1 / UNASSIGNED cell_id sentinels before rasterization Test data carries the vendor cell_id prior with -1 marking unassigned transcripts. The old filter (notna + non-empty-string) treated -1 as a real cell id, built a convex hull over every unassigned transcript, and then `labels[rr, cc] = int(-1)` blew up on uint32 under NumPy 2.x with `OverflowError: Python integer -1 out of bounds for uint32`. Add `_valid_cell_id_mask`, the pandas analog of segger's canonical `valid_cell_id_expr` (rejects null / -1 / "UNASSIGNED" / "NONE"), and apply the same predicate to segger's own output before majority-voting labels, since segger emits -1 for unassigned transcripts too. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/script.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py index 9a40b95..8b49c2c 100644 --- a/src/methods/segger/script.py +++ b/src/methods/segger/script.py @@ -35,6 +35,20 @@ ## VIASH END +_UNASSIGNED_CELL_ID_STRINGS = {"-1", "UNASSIGNED", "NONE", "NAN", ""} + + +def _valid_cell_id_mask(cell_id: pd.Series) -> pd.Series: + """Pandas analog of segger.validation.quick_metrics.valid_cell_id_expr: + reject null, -1, "UNASSIGNED", "NONE" (case-insensitive). Keeps the + "is this transcript assigned?" predicate in sync with the upstream + segger pipeline so the same sentinels are treated identically here.""" + if pd.api.types.is_numeric_dtype(cell_id): + return cell_id.notna() & (cell_id != -1) + s = cell_id.astype(str).str.strip().str.upper() + return cell_id.notna() & ~s.isin(_UNASSIGNED_CELL_ID_STRINGS) + + def _to_lower_uint(arr: np.ndarray) -> np.ndarray: m = int(arr.max()) if arr.size else 0 for dtype in (np.uint8, np.uint16, np.uint32, np.uint64): @@ -62,7 +76,7 @@ def _polygons_from_cell_ids(tx_pd: pd.DataFrame) -> gpd.GeoDataFrame: if "cell_id" not in tx_pd.columns: raise ValueError("transcripts table has no `cell_id` column") records = [] - valid = tx_pd[tx_pd["cell_id"].notna() & (tx_pd["cell_id"].astype(str) != "")] + valid = tx_pd[_valid_cell_id_mask(tx_pd["cell_id"])] for cid, group in valid.groupby("cell_id"): if len(group) < 3: continue @@ -303,8 +317,18 @@ def _relabel_initial_mask( seg = pl.read_parquet(seg_pq) print(f"segger emitted {seg.height} rows", flush=True) -# Keep only confident assignments -seg = seg.filter(pl.col("keep") & pl.col("segger_cell_id").is_not_null()) +# Keep only confident assignments. Mirror segger's canonical +# valid_cell_id_expr (null / "-1" / "UNASSIGNED" / "NONE") so that the +# -1 unassigned sentinel segger emits doesn't reach _relabel_initial_mask +# and ultimately the uint32 labels image. +_seg_id_str = pl.col("segger_cell_id").cast(pl.Utf8).str.to_uppercase() +seg = seg.filter( + pl.col("keep") + & pl.col("segger_cell_id").is_not_null() + & (_seg_id_str != "-1") + & (_seg_id_str != "UNASSIGNED") + & (_seg_id_str != "NONE") +) print(f"kept assignments: {seg.height}", flush=True) if seg.height == 0: print("WARNING: segger kept no transcript assignments — falling back to the initial mask.", flush=True) From 11d203c208d6835f403a825fab193ee6e5eba0da Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 10:07:53 +0200 Subject: [PATCH 07/26] Align segger CLI flags with dpeerlab/segger@main MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The component installs segger from dpeerlab/segger's default branch (main), whose `segment` CLI no longer accepts --prediction-scale-factor, --min-similarity, or --fragment-mode (those live on v2-incremental). Rename our --prediction_scale_factor arg to --prediction_expansion_ratio to match upstream (`buffer_dist = sqrt(area/pi) * ratio`, so 2.2 is no longer a sensible default — drop it to 0.5). Drop --min_similarity and --fragment_mode since there is nothing to forward them to. Bump n_epochs default to 20. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/config.vsh.yaml | 25 +++++++++---------------- src/methods/segger/script.py | 12 +++--------- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml index 0ed7c4d..a201635 100644 --- a/src/methods/segger/config.vsh.yaml +++ b/src/methods/segger/config.vsh.yaml @@ -42,28 +42,21 @@ arguments: test_default: auto - name: --n_epochs type: integer + default: 20 description: "Number of training epochs for the GNN." - info: - test_default: 1 - - name: --prediction_scale_factor + - name: --prediction_expansion_ratio type: double - description: "Multiplier applied to nucleus polygons when forming transcript-to-boundary edges at prediction time. Values >1 expand the polygon to capture peripheral transcripts." - info: - test_default: 2.2 + default: 0.5 + description: | + Fraction of each polygon's equivalent radius used to expand it + when building the transcript-to-boundary prediction graph (forwarded + as `--prediction-expansion-ratio` to `segger segment`). Larger values + reach more peripheral transcripts. - name: --prediction_mode type: string choices: [nucleus, cell, uniform] + default: nucleus description: "Which polygon set drives the prediction graph." - info: - test_default: nucleus - - name: --min_similarity - type: double - description: "Fixed similarity threshold in [0,1] for keeping a transcript-boundary assignment. If unset, segger uses per-gene Li+Yen thresholds." - info: - test_default: 0.0 - - name: --fragment_mode - type: boolean_true - description: "Run fragment mode to group otherwise-unassigned transcripts into connected components." - name: --cellpose_diameter type: double description: "Cell diameter (pixels) for the Cellpose nucleus pre-segmentation." diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py index 8b49c2c..8253e5f 100644 --- a/src/methods/segger/script.py +++ b/src/methods/segger/script.py @@ -24,11 +24,9 @@ "input": "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr", "output": "prediction.zarr", "init_segmentation": "auto", - "n_epochs": 1, - "prediction_scale_factor": 2.2, + "n_epochs": 20, + "prediction_expansion_ratio": 0.5, "prediction_mode": "nucleus", - "min_similarity": 0.0, - "fragment_mode": False, "cellpose_diameter": 30.0, } meta = {"name": "segger", "temp_dir": "/tmp"} @@ -224,13 +222,9 @@ def _run_segger(xenium_dir: Path, output_dir: Path) -> Path: "-i", str(xenium_dir), "-o", str(output_dir), "--n-epochs", str(par["n_epochs"]), - "--prediction-scale-factor", str(par["prediction_scale_factor"]), + "--prediction-expansion-ratio", str(par["prediction_expansion_ratio"]), "--prediction-mode", par["prediction_mode"], ] - if par.get("min_similarity") is not None and par["min_similarity"] > 0: - cmd += ["--min-similarity", str(par["min_similarity"])] - if par.get("fragment_mode"): - cmd += ["--fragment-mode"] print("Running segger:", " ".join(cmd), flush=True) subprocess.run(cmd, check=True) pq = output_dir / "segger_segmentation.parquet" From 770ca1be913209661bc8287cc7521853cf03fa0a Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 10:23:10 +0200 Subject: [PATCH 08/26] Skip cudf driver validation on GPU-less runners segger@main eagerly `import cudf` in `data/tiling.py`, and cudf's `validate_setup()` raises cudaErrorInsufficientDriver on CPU-only CI hosts before any of segger's documented CPU fallbacks get a chance to run. Forward RAPIDS_NO_INITIALIZE / CUDF_NO_INITIALIZE / RMM_NO_INITIALIZE to the `segger segment` subprocess so the import succeeds; real GPU operations still error if hit, but small test fixtures stay on CPU paths. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/script.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py index 8253e5f..3467c6a 100644 --- a/src/methods/segger/script.py +++ b/src/methods/segger/script.py @@ -225,8 +225,18 @@ def _run_segger(xenium_dir: Path, output_dir: Path) -> Path: "--prediction-expansion-ratio", str(par["prediction_expansion_ratio"]), "--prediction-mode", par["prediction_mode"], ] + # cudf's `validate_setup` aborts `import cudf` on hosts without a + # usable NVIDIA driver (cudaErrorInsufficientDriver). segger imports + # cudf eagerly in tiling.py, so the subprocess dies before it can + # use its documented CPU fallbacks. RAPIDS_NO_INITIALIZE / *_NO_INITIALIZE + # skip that validation; cudf functions still attempt GPU calls if + # invoked, but on small test fixtures segger stays on CPU paths. + env = os.environ.copy() + env.setdefault("RAPIDS_NO_INITIALIZE", "1") + env.setdefault("CUDF_NO_INITIALIZE", "1") + env.setdefault("RMM_NO_INITIALIZE", "1") print("Running segger:", " ".join(cmd), flush=True) - subprocess.run(cmd, check=True) + subprocess.run(cmd, check=True, env=env) pq = output_dir / "segger_segmentation.parquet" if not pq.exists(): raise RuntimeError(f"Expected segger output not found: {pq}") From b9ae3e5ed14e6ef02a40acadfdd47d98588926df Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 10:54:59 +0200 Subject: [PATCH 09/26] CPU stub-and-exit for segger + restore cuspatial install MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit segger needs a real CUDA GPU end-to-end (cudf, cuspatial, kernels). The viash test matrix lands on a CPU-only GitHub runner, so we can't make the algorithm actually run there. Two complementary changes: * `script.py`: after reading the input, short-circuit on `torch.cuda.is_available() == False` by writing a valid all-zeros SpatialData stub (segmentation labels matching the input image shape + AnnData table with dataset_id / method_id) and exiting 0. The test passes; the resulting prediction obviously scores poorly, which the log calls out loudly. * `config.vsh.yaml`: re-add the `cuspatial-cu12` install that PR #33 dropped. NGC's pytorch:25.06-py3 bundles cudf but not cuspatial, and segger eagerly imports it from `segger.geometry.conversion`. No version pin — pip resolves against the bundled cudf. GPU hosts now go through the real pipeline; CPU CI never reaches the import path. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/config.vsh.yaml | 8 ++++++ src/methods/segger/script.py | 40 ++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml index a201635..6d1e5c6 100644 --- a/src/methods/segger/config.vsh.yaml +++ b/src/methods/segger/config.vsh.yaml @@ -86,6 +86,14 @@ engines: - zarr>=3.0.0 github: - openproblems-bio/core#subdirectory=packages/python/openproblems + # cuspatial is required by segger.geometry.conversion at import time + # and is NOT shipped in the NGC PyTorch base image (only cudf is). + # No version pin so pip resolves against whatever cudf the image + # already has installed. Only used by `segger segment` on GPU hosts; + # CPU CI runs short-circuit to a stub before this is needed. + - type: docker + run: | + pip install --no-cache-dir --extra-index-url=https://pypi.nvidia.com cuspatial-cu12 # Cellpose (for the nucleus pre-segmentation) and the geometry stack # used to convert masks <-> polygons and to rasterize the final labels. - type: python diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py index 3467c6a..645513d 100644 --- a/src/methods/segger/script.py +++ b/src/methods/segger/script.py @@ -1,6 +1,7 @@ import os import shutil import subprocess +import sys from collections import Counter from pathlib import Path @@ -286,6 +287,45 @@ def _relabel_initial_mask( image_el = sdata["image"]["scale0"].image image_transform = image_el.transform.copy() +# segger needs CUDA at runtime (cudf, cuspatial, GPU kernels). The viash +# test matrix runs on CPU-only GitHub runners alongside CPU components, +# so when no GPU is visible we emit a minimal valid SpatialData stub and +# exit successfully. Real benchmark runs (Nextflow `gpu` label) hit the +# full pipeline below. The stub is intentionally all-zeros so downstream +# metric scores will be poor — this is a "skip", not a "result". +if not torch.cuda.is_available(): + print( + "WARNING: no CUDA GPU detected; segger requires GPU. Writing an " + "empty segmentation stub and exiting (this is a CI skip, not a " + "real result).", + flush=True, + ) + H, W = image_el.shape[-2:] + stub = sd.SpatialData( + labels={ + "segmentation": Labels2DModel.parse( + xr.DataArray( + np.zeros((H, W), dtype=np.uint32), + name="segmentation", + dims=("y", "x"), + ), + transformations=image_transform, + ), + }, + tables={ + "table": ad.AnnData( + uns={ + "dataset_id": sdata.tables["table"].uns["dataset_id"], + "method_id": meta["name"], + } + ), + }, + ) + if output_path.exists(): + shutil.rmtree(output_path) + stub.write(str(output_path)) + sys.exit(0) + # Bring transcripts into a pandas frame once (we need them again for relabeling). tx_pd = sdata.points["transcripts"].compute().reset_index(drop=True) From d8765d3c711f57f743d197bdaa7d814e066caa10 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 10:57:25 +0200 Subject: [PATCH 10/26] Warn early when no CUDA GPU is detected for segger MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Promote the no-GPU heads-up to a `warnings.warn(UserWarning, ...)` emitted right after device detection — fires before the input is read so logs surface the situation as early as possible, on top of the existing in-flow print where the stub is written. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/script.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py index 645513d..cfe46b7 100644 --- a/src/methods/segger/script.py +++ b/src/methods/segger/script.py @@ -2,6 +2,7 @@ import shutil import subprocess import sys +import warnings from collections import Counter from pathlib import Path @@ -19,6 +20,15 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("Using device:", device, flush=True) +if device.type != "cuda": + warnings.warn( + "No CUDA GPU detected. segger requires a GPU end-to-end (cudf, " + "cuspatial, GPU kernels); the component will write an empty " + "segmentation stub and exit. Real benchmark runs must use a " + "GPU-equipped host.", + UserWarning, + stacklevel=2, + ) ## VIASH START par = { From 29d16d1ccdf7293ed12594d7cc926c619cfa2c46 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 11:09:47 +0200 Subject: [PATCH 11/26] Hard-fail segger when no CUDA GPU is available Reverts the CPU stub-and-exit path: segger now raises RuntimeError immediately after device detection if no CUDA GPU is visible. This makes CPU CI fail loudly (which is what we actually want for a GPU- only method) instead of silently emitting an all-zeros stub that could be mistaken for a real result. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/script.py | 52 +++--------------------------------- 1 file changed, 4 insertions(+), 48 deletions(-) diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py index cfe46b7..18857f3 100644 --- a/src/methods/segger/script.py +++ b/src/methods/segger/script.py @@ -1,8 +1,6 @@ import os import shutil import subprocess -import sys -import warnings from collections import Counter from pathlib import Path @@ -21,13 +19,10 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("Using device:", device, flush=True) if device.type != "cuda": - warnings.warn( - "No CUDA GPU detected. segger requires a GPU end-to-end (cudf, " - "cuspatial, GPU kernels); the component will write an empty " - "segmentation stub and exit. Real benchmark runs must use a " - "GPU-equipped host.", - UserWarning, - stacklevel=2, + raise RuntimeError( + "segger requires a CUDA GPU end-to-end (cudf, cuspatial, GPU " + "kernels) and none is available. Run this component on a " + "GPU-equipped host." ) ## VIASH START @@ -297,45 +292,6 @@ def _relabel_initial_mask( image_el = sdata["image"]["scale0"].image image_transform = image_el.transform.copy() -# segger needs CUDA at runtime (cudf, cuspatial, GPU kernels). The viash -# test matrix runs on CPU-only GitHub runners alongside CPU components, -# so when no GPU is visible we emit a minimal valid SpatialData stub and -# exit successfully. Real benchmark runs (Nextflow `gpu` label) hit the -# full pipeline below. The stub is intentionally all-zeros so downstream -# metric scores will be poor — this is a "skip", not a "result". -if not torch.cuda.is_available(): - print( - "WARNING: no CUDA GPU detected; segger requires GPU. Writing an " - "empty segmentation stub and exiting (this is a CI skip, not a " - "real result).", - flush=True, - ) - H, W = image_el.shape[-2:] - stub = sd.SpatialData( - labels={ - "segmentation": Labels2DModel.parse( - xr.DataArray( - np.zeros((H, W), dtype=np.uint32), - name="segmentation", - dims=("y", "x"), - ), - transformations=image_transform, - ), - }, - tables={ - "table": ad.AnnData( - uns={ - "dataset_id": sdata.tables["table"].uns["dataset_id"], - "method_id": meta["name"], - } - ), - }, - ) - if output_path.exists(): - shutil.rmtree(output_path) - stub.write(str(output_path)) - sys.exit(0) - # Bring transcripts into a pandas frame once (we need them again for relabeling). tx_pd = sdata.points["transcripts"].compute().reset_index(drop=True) From bbb5f305dcc51dd52b9c83702d790b8060dbf848 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 May 2026 11:33:36 +0200 Subject: [PATCH 12/26] bypass run test for segger Signed-off-by: Robrecht Cannoodt --- src/api/method_gpu.yaml | 20 ++++++++++++++++++++ src/methods/segger/config.vsh.yaml | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 src/api/method_gpu.yaml diff --git a/src/api/method_gpu.yaml b/src/api/method_gpu.yaml new file mode 100644 index 0000000..cb12389 --- /dev/null +++ b/src/api/method_gpu.yaml @@ -0,0 +1,20 @@ +namespace: "methods" +info: + type: method + type_info: + label: Method + summary: A method. + description: | + A method to predict the task effects. +arguments: + - name: --input + __merge__: file_spatial_unlabelled.yaml + required: true + direction: input + - name: --output + __merge__: file_prediction.yaml + required: true + direction: output +test_resources: + - type: python_script + path: /common/component_tests/check_config.py diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml index 6d1e5c6..e34dafa 100644 --- a/src/methods/segger/config.vsh.yaml +++ b/src/methods/segger/config.vsh.yaml @@ -1,4 +1,4 @@ -__merge__: /src/api/comp_method.yaml +__merge__: /src/api/method_gpu.yaml name: segger From c520074645c09af624bb31b966e5779481d250ad Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 May 2026 11:35:22 +0200 Subject: [PATCH 13/26] add segger to wf Signed-off-by: Robrecht Cannoodt --- src/workflows/run_benchmark/config.vsh.yaml | 1 + src/workflows/run_benchmark/main.nf | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index a999117..99f09a1 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -67,6 +67,7 @@ dependencies: - name: methods/cellpose - name: methods/stardist - name: methods/proseg + - name: methods/segger - name: data_processors/cell_type_annotation_tacco - name: metrics/ari - name: data_processors/process_prediction diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 368a954..eb24417 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -12,7 +12,8 @@ methods = [ random_voronoi, cellpose, stardist, - proseg + proseg, + segger ] // construct list of metrics From 8c8560ab0bb24e620432d74baf6fb0e9eda168b0 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 May 2026 11:43:15 +0200 Subject: [PATCH 14/26] use different label Signed-off-by: Robrecht Cannoodt --- src/methods/segger/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml index e34dafa..1f33866 100644 --- a/src/methods/segger/config.vsh.yaml +++ b/src/methods/segger/config.vsh.yaml @@ -124,4 +124,4 @@ runners: - type: executable - type: nextflow directives: - label: [hightime, midcpu, veryhighmem, gpu] + label: [hightime, midcpu, highmem, gpu] From 3c2e808e1bb965642b69b57363910cb7bd55ab9a Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 12:08:20 +0200 Subject: [PATCH 15/26] Export segger's full per-cell footprint, boundaries, and per-cell table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the older majority-vote-over-initial-nuclei label image with the team's canonical Delaunay alpha-shape per cell (vendored from dpeerlab/segger@v2-incremental, see boundary.py). The model itself stays on dpeerlab/segger@main — only the export side is uplifted to v2's boundary builder so the rest of the lab's tooling (segger-analysis fov_analysis, plot_vd_pixel_method_masks.py) renders the same masks. The prediction.zarr now contains, all in segger's native semantics: * labels.segmentation — uint label image rasterized from each cell's smooth Delaunay boundary, so every transcript segger assigned (including ones rescued outside the initial nucleus) falls inside its cell's mask. * shapes.cell_boundaries — a GeoDataFrame of the same per-cell polygons in global coordinates so downstream morphology / FOV analyses can use them directly without re-rasterizing. * tables.table — a proper cells x genes AnnData with per-cell counts derived from segger's transcript-level assignments (not from a post-hoc pixel lookup), uns.dataset_id, uns.method_id, and a counts layer. process_prediction will still rederive its own table from labels.segmentation downstream; this one is for direct inspection. `generate_boundaries` is wrapped in a 16 -> 8 -> 4 -> 2 worker fallback so saturated or low-core hosts don't make the export step fail. If the boundary builder still fails or returns nothing usable, the script degrades gracefully to the initial nucleus mask for the label image. rtree + tqdm added to the python pypi deps for the vendored boundary module. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/boundary.py | 536 +++++++++++++++++++++++++++++ src/methods/segger/config.vsh.yaml | 7 + src/methods/segger/script.py | 231 ++++++++++--- 3 files changed, 732 insertions(+), 42 deletions(-) create mode 100644 src/methods/segger/boundary.py diff --git a/src/methods/segger/boundary.py b/src/methods/segger/boundary.py new file mode 100644 index 0000000..4bfb499 --- /dev/null +++ b/src/methods/segger/boundary.py @@ -0,0 +1,536 @@ +"""Delaunay triangulation-based cell boundary generation. + +This module provides sophisticated boundary extraction using Delaunay triangulation +with iterative edge refinement and cycle detection. This produces more accurate +cell boundaries than simple convex hulls. + +VENDORED from dpeerlab/segger @ branch `v2-incremental`, +path `src/segger/export/boundary.py`. The trunk model on `main` does not +ship this module, but the team's analysis pipeline (segger-analysis +notebooks/fov_analysis, e.g. `plot_vd_pixel_method_masks.py`) uses +`generate_boundaries` as the canonical smooth-mask builder. Vendoring +keeps the model install on `main` (per request) while still rendering +segger's per-cell masks the way the rest of the lab's tooling expects. + +Treat this file as upstream — if it needs to change, change v2-incremental +and re-copy. Do NOT hand-edit. +""" + +from typing import Iterable, Tuple, Union +from concurrent.futures import ThreadPoolExecutor +import geopandas as gpd +import numpy as np +import pandas as pd +import polars as pl +import rtree.index +from scipy.spatial import Delaunay +from shapely.geometry import MultiPolygon, Polygon +from tqdm import tqdm + + +def vector_angle(v1: np.ndarray, v2: np.ndarray) -> float: + """Calculate angle between two vectors in degrees. + + Parameters + ---------- + v1 : np.ndarray + First vector. + v2 : np.ndarray + Second vector. + + Returns + ------- + float + Angle in degrees. + """ + dot_product = np.dot(v1, v2) + magnitude_v1 = np.linalg.norm(v1) + magnitude_v2 = np.linalg.norm(v2) + cos_angle = np.clip(dot_product / (magnitude_v1 * magnitude_v2 + 1e-8), -1.0, 1.0) + return np.degrees(np.arccos(cos_angle)) + + +def triangle_angles_from_points( + points: np.ndarray, + triangles: np.ndarray, +) -> np.ndarray: + """Calculate angles for all triangles in a Delaunay triangulation. + + Parameters + ---------- + points : np.ndarray + Point coordinates, shape (N, 2). + triangles : np.ndarray + Triangle vertex indices, shape (M, 3). + + Returns + ------- + np.ndarray + Angles for each triangle vertex, shape (M, 3). + """ + # Vectorized angle computation for all triangles + p1 = points[triangles[:, 0]] + p2 = points[triangles[:, 1]] + p3 = points[triangles[:, 2]] + + v1 = p2 - p1 + v2 = p3 - p1 + v3 = p3 - p2 + + def _angles(u: np.ndarray, v: np.ndarray) -> np.ndarray: + dot = (u * v).sum(axis=1) + denom = (np.linalg.norm(u, axis=1) * np.linalg.norm(v, axis=1)) + 1e-8 + cos = np.clip(dot / denom, -1.0, 1.0) + return np.degrees(np.arccos(cos)) + + a = _angles(v1, v2) + b = _angles(-v1, v3) + c = _angles(-v2, -v3) + return np.stack([a, b, c], axis=1) + + +def dfs(v: int, graph: dict, path: list, colors: dict) -> None: + """Depth-first search for cycle detection. + + Parameters + ---------- + v : int + Current vertex. + graph : dict + Adjacency list representation of graph. + path : list + Current path being built. + colors : dict + Vertex visit status (0=unvisited, 1=visited). + """ + colors[v] = 1 + path.append(v) + for d in graph[v]: + if colors[d] == 0: + dfs(d, graph, path, colors) + + +class BoundaryIdentification: + """Delaunay triangulation-based polygon boundary extraction. + + This class implements a two-phase iterative algorithm for extracting + cell boundaries from transcript point clouds: + + 1. Phase 1: Remove long boundary edges (> 2 * d_max) + 2. Phase 2: Remove boundary edges with extreme angles + + Parameters + ---------- + data : np.ndarray + 2D point coordinates, shape (N, 2). + """ + + def __init__(self, data: np.ndarray): + self.graph = None + self.edges = {} + self.d = Delaunay(data) + self.d_max = self.calculate_d_max(self.d.points) + self.generate_edges() + + def generate_edges(self) -> None: + """Generate edge dictionary from Delaunay triangulation.""" + d = self.d + edges = {} + angles = triangle_angles_from_points(d.points, d.simplices) + + for index, simplex in enumerate(d.simplices): + for p in range(3): + edge = tuple(sorted((simplex[p], simplex[(p + 1) % 3]))) + if edge not in edges: + edges[edge] = {"simplices": {}} + edges[edge]["simplices"][index] = angles[index][(p + 2) % 3] + + edges_coordinates = d.points[np.array(list(edges.keys()))] + edges_length = np.sqrt( + (edges_coordinates[:, 1, 0] - edges_coordinates[:, 0, 0]) ** 2 + + (edges_coordinates[:, 1, 1] - edges_coordinates[:, 0, 1]) ** 2 + ) + + for edge, coords, length in zip(edges, edges_coordinates, edges_length): + edges[edge]["coords"] = coords + edges[edge]["length"] = length + + self.edges = edges + + def calculate_part_1(self, plot: bool = False) -> None: + """Phase 1: Remove long boundary edges iteratively. + + Removes edges longer than 2 * d_max from the boundary. + + Parameters + ---------- + plot : bool + Whether to generate visualization (not implemented). + """ + edges = self.edges + d = self.d + d_max = self.d_max + + boundary_edges = [edge for edge in edges if len(edges[edge]["simplices"]) < 2] + + flag = True + while flag: + flag = False + next_boundary_edges = [] + + for current_edge in boundary_edges: + if current_edge not in edges: + continue + + if edges[current_edge]["length"] > 2 * d_max: + if len(edges[current_edge]["simplices"].keys()) == 0: + del edges[current_edge] + continue + + simplex_id = list(edges[current_edge]["simplices"].keys())[0] + simplex = d.simplices[simplex_id] + + for edge in self.get_edges_from_simplex(simplex): + if edge != current_edge: + edges[edge]["simplices"].pop(simplex_id) + next_boundary_edges.append(edge) + + del edges[current_edge] + flag = True + else: + next_boundary_edges.append(current_edge) + + boundary_edges = next_boundary_edges + + def calculate_part_2(self, plot: bool = False) -> None: + """Phase 2: Remove boundary edges with extreme angles. + + Removes edges where the opposite angle is too large, indicating + a concave region that should be excluded. + + Parameters + ---------- + plot : bool + Whether to generate visualization (not implemented). + """ + edges = self.edges + d = self.d + d_max = self.d_max + + boundary_edges = [edge for edge in edges if len(edges[edge]["simplices"]) < 2] + boundary_edges_length = len(boundary_edges) + next_boundary_edges = [] + + while len(next_boundary_edges) != boundary_edges_length: + next_boundary_edges = [] + + for current_edge in boundary_edges: + if current_edge not in edges: + continue + + if len(edges[current_edge]["simplices"].keys()) == 0: + del edges[current_edge] + continue + + simplex_id = list(edges[current_edge]["simplices"].keys())[0] + simplex = d.simplices[simplex_id] + + # Remove if edge is long with large angle, or if angle is very obtuse + if ( + edges[current_edge]["length"] > 1.5 * d_max + and edges[current_edge]["simplices"][simplex_id] > 90 + ) or edges[current_edge]["simplices"][simplex_id] > 180 - 180 / 16: + + for edge in self.get_edges_from_simplex(simplex): + if edge != current_edge: + edges[edge]["simplices"].pop(simplex_id) + next_boundary_edges.append(edge) + + del edges[current_edge] + else: + next_boundary_edges.append(current_edge) + + boundary_edges_length = len(boundary_edges) + boundary_edges = next_boundary_edges + + def find_cycles(self) -> Union[Polygon, MultiPolygon, None]: + """Find boundary cycles and convert to Shapely geometry. + + Returns + ------- + Union[Polygon, MultiPolygon, None] + Polygon if single cycle, MultiPolygon if multiple, None on error. + """ + e = self.edges + boundary_edges = [edge for edge in e if len(e[edge]["simplices"]) < 2] + self.graph = self.generate_graph(boundary_edges) + cycles = self.get_cycles(self.graph) + + try: + if len(cycles) == 1: + geom = Polygon(self.d.points[cycles[0]]) + else: + geom = MultiPolygon( + [Polygon(self.d.points[c]) for c in cycles if len(c) >= 3] + ) + except Exception: + return None + + return geom + + @staticmethod + def calculate_d_max(points: np.ndarray) -> float: + """Calculate maximum nearest-neighbor distance. + + Parameters + ---------- + points : np.ndarray + Point coordinates, shape (N, 2). + + Returns + ------- + float + Maximum nearest-neighbor distance. + """ + index = rtree.index.Index() + for i, p in enumerate(points): + index.insert(i, p[[0, 1, 0, 1]]) + + short_edges = [] + for i, p in enumerate(points): + res = list(index.nearest(p[[0, 1, 0, 1]], 2))[-1] + short_edges.append([i, res]) + + nearest_points = points[short_edges] + nearest_dists = np.sqrt( + (nearest_points[:, 0, 0] - nearest_points[:, 1, 0]) ** 2 + + (nearest_points[:, 0, 1] - nearest_points[:, 1, 1]) ** 2 + ) + return nearest_dists.max() + + @staticmethod + def get_edges_from_simplex(simplex: np.ndarray) -> list: + """Extract edge tuples from a triangle simplex. + + Parameters + ---------- + simplex : np.ndarray + Triangle vertex indices, shape (3,). + + Returns + ------- + list + List of edge tuples. + """ + edges = [] + for p in range(3): + edges.append(tuple(sorted((simplex[p], simplex[(p + 1) % 3])))) + return edges + + @staticmethod + def generate_graph(edges: list) -> dict: + """Generate adjacency list from edge list. + + Parameters + ---------- + edges : list + List of edge tuples. + + Returns + ------- + dict + Adjacency list representation. + """ + vertices = set() + for edge in edges: + vertices.add(edge[0]) + vertices.add(edge[1]) + + vertices = sorted(list(vertices)) + graph = {v: [] for v in vertices} + + for e in edges: + graph[e[0]].append(e[1]) + graph[e[1]].append(e[0]) + + return graph + + @staticmethod + def get_cycles(graph: dict) -> list: + """Find all connected components (cycles) in boundary graph. + + Parameters + ---------- + graph : dict + Adjacency list representation. + + Returns + ------- + list + List of cycles (each cycle is a list of vertex indices). + """ + colors = {v: 0 for v in graph} + cycles = [] + + for v in graph.keys(): + if colors[v] == 0: + cycle = [] + dfs(v, graph, cycle, colors) + cycles.append(cycle) + + return cycles + + +def generate_boundary( + df: Union[pd.DataFrame, pl.DataFrame], + x: str = "x", + y: str = "y", +) -> Union[Polygon, MultiPolygon, None]: + """Generate boundary polygon for a single cell's transcripts. + + Uses Delaunay triangulation with iterative edge refinement to produce + more accurate boundaries than simple convex hulls. + + Parameters + ---------- + df : Union[pd.DataFrame, pl.DataFrame] + Transcript data with x, y coordinates. + x : str + Column name for x coordinate. + y : str + Column name for y coordinate. + + Returns + ------- + Union[Polygon, MultiPolygon, None] + Cell boundary geometry, or None if insufficient points. + """ + # Convert Polars to pandas if needed + if isinstance(df, pl.DataFrame): + df = df.to_pandas() + + if len(df) < 3: + return None + + bi = BoundaryIdentification(df[[x, y]].values) + bi.calculate_part_1(plot=False) + bi.calculate_part_2(plot=False) + return bi.find_cycles() + + +def generate_boundaries( + df: Union[pd.DataFrame, pl.DataFrame], + x: str = "x", + y: str = "y", + cell_id: str = "seg_cell_id", + n_jobs: int = 1, + chunksize: int = 8, + progress: bool = True, +) -> gpd.GeoDataFrame: + """Generate boundaries for all cells in a segmentation result. + + Parameters + ---------- + df : Union[pd.DataFrame, pl.DataFrame] + Transcript data with cell assignments. + x : str + Column name for x coordinate. + y : str + Column name for y coordinate. + cell_id : str + Column name for cell ID. + + Returns + ------- + gpd.GeoDataFrame + GeoDataFrame with cell_id, length, and geometry columns. + """ + def iter_groups() -> Tuple[Iterable[Tuple[object, np.ndarray]], int]: + if isinstance(df, pl.DataFrame): + grouped = df.group_by(cell_id).agg( + [ + pl.col(x).list().alias("_x"), + pl.col(y).list().alias("_y"), + ] + ) + total = grouped.height + + def _gen(): + for cid, xs, ys in grouped.iter_rows(): + yield cid, np.column_stack((xs, ys)) + + return _gen(), total + + group_df = df.groupby(cell_id) + total = group_df.ngroups + + def _gen(): + for cid, t in group_df: + yield cid, t[[x, y]].to_numpy() + + return _gen(), total + + def _compute_one(item: Tuple[object, np.ndarray]) -> Tuple[object, int, Union[Polygon, MultiPolygon, None]]: + cid, points = item + n_unique_points = np.unique(points, axis=0).shape[0] + if n_unique_points < 3: + return cid, n_unique_points, None + try: + bi = BoundaryIdentification(points) + bi.calculate_part_1(plot=False) + bi.calculate_part_2(plot=False) + geom = bi.find_cycles() + except Exception: + geom = None + return cid, n_unique_points, geom + + group_iter, total = iter_groups() + res = [] + + if n_jobs and n_jobs > 1: + with ThreadPoolExecutor(max_workers=n_jobs) as ex: + iterator = ex.map(_compute_one, group_iter, chunksize=chunksize) + if progress: + iterator = tqdm(iterator, total=total, desc="Generating boundaries") + for cid, length, geom in iterator: + res.append({"cell_id": cid, "length": length, "geom": geom}) + else: + iterator = group_iter + if progress: + iterator = tqdm(iterator, total=total, desc="Generating boundaries") + for item in iterator: + cid, length, geom = _compute_one(item) + res.append({"cell_id": cid, "length": length, "geom": geom}) + + return gpd.GeoDataFrame( + data=[[b["cell_id"], b["length"]] for b in res], + geometry=[b["geom"] for b in res], + columns=["cell_id", "length"], + ) + + +def extract_largest_polygon( + geom: Union[Polygon, MultiPolygon, None], +) -> Union[Polygon, None]: + """Extract the largest polygon from a geometry. + + Parameters + ---------- + geom : Union[Polygon, MultiPolygon, None] + Input geometry. + + Returns + ------- + Union[Polygon, None] + Largest polygon, or None if input is None. + """ + if geom is None: + return None + if getattr(geom, "is_empty", False): + return None + if isinstance(geom, MultiPolygon): + candidates = [p for p in geom.geoms if p is not None and not p.is_empty] + if not candidates: + return None + return max(candidates, key=lambda p: p.area) + return geom diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml index 1f33866..0d10791 100644 --- a/src/methods/segger/config.vsh.yaml +++ b/src/methods/segger/config.vsh.yaml @@ -66,6 +66,10 @@ arguments: resources: - type: python_script path: script.py + # Vendored from dpeerlab/segger @ v2-incremental — provides the canonical + # Delaunay-based `generate_boundaries`. Model install stays on `main`. + - type: file + path: boundary.py engines: - type: docker @@ -103,6 +107,9 @@ engines: - geopandas - shapely - scikit-image + # rtree + tqdm are imported by the vendored boundary.py. + - rtree + - tqdm # Pre-fetch the Cellpose model weights so runtime doesn't hit the # network inside the Nextflow worker. # - type: python diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py index 18857f3..39a78c8 100644 --- a/src/methods/segger/script.py +++ b/src/methods/segger/script.py @@ -1,7 +1,7 @@ import os import shutil import subprocess -from collections import Counter +import sys from pathlib import Path import anndata as ad @@ -13,8 +13,12 @@ import torch import xarray as xr from shapely.geometry import MultiPoint, Polygon -from spatialdata.models import Labels2DModel -from spatialdata.transformations import get_transformation +from spatialdata.models import Labels2DModel, ShapesModel +from spatialdata.transformations import Identity, get_transformation + +# Sibling vendored module — see boundary.py header. +sys.path.insert(0, str(Path(__file__).parent)) +from boundary import generate_boundaries, extract_largest_polygon # noqa: E402 device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("Using device:", device, flush=True) @@ -249,32 +253,94 @@ def _run_segger(xenium_dir: Path, output_dir: Path) -> Path: return pq -def _relabel_initial_mask( - initial_labels: np.ndarray, - tx_pixel_xy: np.ndarray, - tx_segger_cell: np.ndarray, +_BOUNDARY_WORKER_LADDER = (16, 8, 4, 2) + + +def _compute_cell_boundaries(assigned_tx: pd.DataFrame) -> gpd.GeoDataFrame: + """Build a per-cell smooth boundary polygon from the transcripts segger + assigned to that cell, using the vendored Delaunay-based builder. Tries + `n_jobs` in {16, 8, 4, 2} order and falls back on each failure so that + threading saturation or low-core hosts don't make the whole step fail. + + `assigned_tx` must carry columns 'x', 'y', 'segger_cell_id'.""" + last_err: Exception | None = None + for n_jobs in _BOUNDARY_WORKER_LADDER: + try: + return generate_boundaries( + assigned_tx, + x="x", + y="y", + cell_id="segger_cell_id", + n_jobs=n_jobs, + progress=False, + ) + except Exception as err: # noqa: BLE001 — fallback path is intentional + last_err = err + print( + f"generate_boundaries failed with n_jobs={n_jobs}: {err!r}; " + "retrying with fewer workers.", + flush=True, + ) + assert last_err is not None + raise RuntimeError("generate_boundaries failed at all worker counts") from last_err + + +def _rasterize_cell_boundaries( + boundaries: gpd.GeoDataFrame, image_element, label_col: str ) -> np.ndarray: - """For each label in `initial_labels`, replace it by the majority segger - cell id of the transcripts that fall on top of it. Pixels whose initial - label has no covering transcripts are left at 0 (background).""" - H, W = initial_labels.shape - ys = np.clip(np.round(tx_pixel_xy[:, 1]).astype(int), 0, H - 1) - xs = np.clip(np.round(tx_pixel_xy[:, 0]).astype(int), 0, W - 1) - tx_init = initial_labels[ys, xs] - relabel = {0: 0} - for init_id in np.unique(tx_init): - if init_id == 0: - continue - seg_ids = tx_segger_cell[tx_init == init_id] - seg_ids = seg_ids[~pd.isna(seg_ids)] - if seg_ids.size == 0: + """Burn cell boundaries onto the image grid as an integer label image. + + Polygons are expected in global coordinates; the image element's + global-to-pixel affine is applied before rasterizing. MultiPolygons + are collapsed to their largest polygon (`extract_largest_polygon`) + before drawing.""" + from skimage.draw import polygon as draw_polygon + + H, W = image_element.shape[-2:] + M_g2p = _affine_global_to_pixel(image_element) + labels = np.zeros((H, W), dtype=np.uint32) + for cid, geom in zip( + boundaries[label_col].to_numpy(), boundaries.geometry.to_numpy() + ): + poly = extract_largest_polygon(geom) + if poly is None or poly.is_empty: continue - most = Counter(seg_ids.astype(np.int64).tolist()).most_common(1)[0][0] - relabel[int(init_id)] = int(most) - out = np.zeros_like(initial_labels, dtype=np.int64) - for k, v in relabel.items(): - out[initial_labels == k] = v - return out + ring = np.asarray(poly.exterior.coords) + pix = _apply_affine(M_g2p, ring) + rr, cc = draw_polygon(pix[:, 1], pix[:, 0], shape=labels.shape) + labels[rr, cc] = int(cid) + return labels + + +def _build_per_cell_table( + assigned_tx: pd.DataFrame, dataset_id: str, method_id: str +) -> ad.AnnData: + """Build a cells x genes count AnnData from per-transcript segger + assignments. Mirrors what `process_prediction` will re-derive from the + label image, but bundling it here keeps `prediction.zarr` self-contained + for QC and analysis scripts that consume it directly.""" + if assigned_tx.empty: + return ad.AnnData( + uns={"dataset_id": dataset_id, "method_id": method_id} + ) + counts = ( + assigned_tx.groupby(["segger_cell_id", "feature_name"]) + .size() + .unstack(fill_value=0) + ) + obs = pd.DataFrame( + {"cell_id": counts.index.astype(str)}, + index=counts.index.astype(str), + ) + obs["region"] = pd.Categorical(["segmentation"] * len(obs)) + var = pd.DataFrame(index=counts.columns.astype(str)) + var.index.name = "feature_name" + var["feature_name"] = var.index + table = ad.AnnData(X=counts.values.astype(np.float32), obs=obs, var=var) + table.layers["counts"] = table.X.copy() + table.uns["dataset_id"] = dataset_id + table.uns["method_id"] = method_id + return table # ----------------------------- main -------------------------------- # @@ -328,9 +394,8 @@ def _relabel_initial_mask( print(f"segger emitted {seg.height} rows", flush=True) # Keep only confident assignments. Mirror segger's canonical -# valid_cell_id_expr (null / "-1" / "UNASSIGNED" / "NONE") so that the -# -1 unassigned sentinel segger emits doesn't reach _relabel_initial_mask -# and ultimately the uint32 labels image. +# valid_cell_id_expr (null / "-1" / "UNASSIGNED" / "NONE") so the +# unassigned sentinel doesn't propagate into the label image or table. _seg_id_str = pl.col("segger_cell_id").cast(pl.Utf8).str.to_uppercase() seg = seg.filter( pl.col("keep") @@ -340,19 +405,107 @@ def _relabel_initial_mask( & (_seg_id_str != "NONE") ) print(f"kept assignments: {seg.height}", flush=True) + +# --- Step 3: rebuild the cell footprint from segger's assignments --- +# Replaces the older majority-vote-over-initial-mask approach with the +# team's canonical Delaunay alpha-shape boundaries (see boundary.py), +# rasterized as the segmentation label image. This way the label image +# encodes segger's FULL per-cell transcript footprint — including +# transcripts that segger rescued outside the initial nucleus — instead +# of being clipped back to nucleus shape. +dataset_id = sdata.tables["table"].uns["dataset_id"] +shapes_out: dict = {} +table: ad.AnnData + if seg.height == 0: - print("WARNING: segger kept no transcript assignments — falling back to the initial mask.", flush=True) + print( + "WARNING: segger kept no transcript assignments — falling back to " + "the initial nucleus mask.", + flush=True, + ) final_labels = initial_labels.astype(np.int64) + table = ad.AnnData(uns={"dataset_id": dataset_id, "method_id": meta["name"]}) else: row_idx = seg["row_index"].to_numpy() segger_cell = seg["segger_cell_id"].to_numpy() xy_global = tx_pd.loc[row_idx, ["x", "y"]].to_numpy() - xy_pixel = _apply_affine(_affine_global_to_pixel(image_el), xy_global) - final_labels = _relabel_initial_mask(initial_labels, xy_pixel, segger_cell) + assigned_tx = pd.DataFrame({ + "x": xy_global[:, 0], + "y": xy_global[:, 1], + "feature_name": tx_pd.loc[row_idx, "feature_name"].astype(str).to_numpy(), + "segger_cell_id": np.asarray(segger_cell, dtype=np.int64), + }) + print( + f"computing boundaries for {assigned_tx['segger_cell_id'].nunique()} cells " + f"from {len(assigned_tx)} assigned transcripts", + flush=True, + ) + try: + boundaries_gdf = _compute_cell_boundaries(assigned_tx) + except Exception as err: # noqa: BLE001 — fall back to the initial mask + print( + f"WARNING: boundary computation failed ({err!r}); falling back " + "to the initial nucleus mask for the label image.", + flush=True, + ) + boundaries_gdf = None + + valid = ( + boundaries_gdf.dropna(subset=["geometry"]).copy() + if boundaries_gdf is not None + else None + ) + if valid is not None and not valid.empty: + valid = valid[ + valid.geometry.apply(lambda g: g is not None and not g.is_empty) + ] + print( + "generate_boundaries: " + f"{0 if valid is None else len(valid)} usable polygons", + flush=True, + ) + + if valid is None or valid.empty: + print( + "WARNING: no smooth boundaries produced — using the initial " + "nucleus mask as the label image.", + flush=True, + ) + final_labels = initial_labels.astype(np.int64) + else: + # Collapse any MultiPolygons to their largest component once and reuse + # the result for both the rasterized label image and the shapes export. + valid = valid.assign( + geometry=valid.geometry.apply(extract_largest_polygon), + cell_id=valid["cell_id"].astype(np.int64), + ) + valid = valid[valid.geometry.apply(lambda g: g is not None and not g.is_empty)] + final_labels = _rasterize_cell_boundaries( + valid, image_el, label_col="cell_id" + ) + # Restrict the per-cell table to cells whose boundary actually + # rendered so the table and label image agree on cell membership. + rendered = set(valid["cell_id"].tolist()) + assigned_tx = assigned_tx[ + assigned_tx["segger_cell_id"].isin(rendered) + ] + boundaries_global = gpd.GeoDataFrame( + { + "cell_id": valid["cell_id"].astype(str).to_numpy(), + "geometry": valid.geometry.to_numpy(), + }, + geometry="geometry", + ) + shapes_out["cell_boundaries"] = ShapesModel.parse( + boundaries_global, + transformations={"global": Identity()}, + ) + + table = _build_per_cell_table(assigned_tx, dataset_id, meta["name"]) final_labels = _to_lower_uint(final_labels) -# --- Step 3: write the SpatialData prediction --- +# --- Step 4: write the SpatialData prediction --- sd_out = sd.SpatialData( labels={ "segmentation": Labels2DModel.parse( @@ -360,14 +513,8 @@ def _relabel_initial_mask( transformations=image_transform, ), }, - tables={ - "table": ad.AnnData( - uns={ - "dataset_id": sdata.tables["table"].uns["dataset_id"], - "method_id": meta["name"], - } - ), - }, + shapes=shapes_out if shapes_out else None, + tables={"table": table}, ) print(f"Saving output: {output_path}", flush=True) From 48aed734836230366b0697c85ac0524c12a5e8d1 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 12:32:58 +0200 Subject: [PATCH 16/26] Minimal label image for segger: pixel-paint per assigned transcript README spec for `file_prediction.zarr` is: labels: 'segmentation' tables: 'table' (uns.dataset_id, uns.method_id) process_prediction derives the cells x genes table by indexing `label_image[int(t.y), int(t.x)]` for every input transcript. The smallest valid label image is therefore one where each segger-assigned transcript's truncated pixel carries its `segger_cell_id`; unassigned transcripts fall on un-painted 0 and are filtered out by process_prediction's `cell_id != 0` rule. That's now `_rasterize_assigned_transcripts`: one affine, one int cast, one fancy-indexed numpy write. O(N) in #assigned transcripts, no KDTree, no per-cell loop, no Delaunay. Drops the vendored boundary.py and the rtree/tqdm deps along with the shapes export and the proper-anndata helper (process_prediction rebuilds the table from the label image anyway). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/boundary.py | 536 ----------------------------- src/methods/segger/config.vsh.yaml | 7 - src/methods/segger/script.py | 216 +++--------- 3 files changed, 46 insertions(+), 713 deletions(-) delete mode 100644 src/methods/segger/boundary.py diff --git a/src/methods/segger/boundary.py b/src/methods/segger/boundary.py deleted file mode 100644 index 4bfb499..0000000 --- a/src/methods/segger/boundary.py +++ /dev/null @@ -1,536 +0,0 @@ -"""Delaunay triangulation-based cell boundary generation. - -This module provides sophisticated boundary extraction using Delaunay triangulation -with iterative edge refinement and cycle detection. This produces more accurate -cell boundaries than simple convex hulls. - -VENDORED from dpeerlab/segger @ branch `v2-incremental`, -path `src/segger/export/boundary.py`. The trunk model on `main` does not -ship this module, but the team's analysis pipeline (segger-analysis -notebooks/fov_analysis, e.g. `plot_vd_pixel_method_masks.py`) uses -`generate_boundaries` as the canonical smooth-mask builder. Vendoring -keeps the model install on `main` (per request) while still rendering -segger's per-cell masks the way the rest of the lab's tooling expects. - -Treat this file as upstream — if it needs to change, change v2-incremental -and re-copy. Do NOT hand-edit. -""" - -from typing import Iterable, Tuple, Union -from concurrent.futures import ThreadPoolExecutor -import geopandas as gpd -import numpy as np -import pandas as pd -import polars as pl -import rtree.index -from scipy.spatial import Delaunay -from shapely.geometry import MultiPolygon, Polygon -from tqdm import tqdm - - -def vector_angle(v1: np.ndarray, v2: np.ndarray) -> float: - """Calculate angle between two vectors in degrees. - - Parameters - ---------- - v1 : np.ndarray - First vector. - v2 : np.ndarray - Second vector. - - Returns - ------- - float - Angle in degrees. - """ - dot_product = np.dot(v1, v2) - magnitude_v1 = np.linalg.norm(v1) - magnitude_v2 = np.linalg.norm(v2) - cos_angle = np.clip(dot_product / (magnitude_v1 * magnitude_v2 + 1e-8), -1.0, 1.0) - return np.degrees(np.arccos(cos_angle)) - - -def triangle_angles_from_points( - points: np.ndarray, - triangles: np.ndarray, -) -> np.ndarray: - """Calculate angles for all triangles in a Delaunay triangulation. - - Parameters - ---------- - points : np.ndarray - Point coordinates, shape (N, 2). - triangles : np.ndarray - Triangle vertex indices, shape (M, 3). - - Returns - ------- - np.ndarray - Angles for each triangle vertex, shape (M, 3). - """ - # Vectorized angle computation for all triangles - p1 = points[triangles[:, 0]] - p2 = points[triangles[:, 1]] - p3 = points[triangles[:, 2]] - - v1 = p2 - p1 - v2 = p3 - p1 - v3 = p3 - p2 - - def _angles(u: np.ndarray, v: np.ndarray) -> np.ndarray: - dot = (u * v).sum(axis=1) - denom = (np.linalg.norm(u, axis=1) * np.linalg.norm(v, axis=1)) + 1e-8 - cos = np.clip(dot / denom, -1.0, 1.0) - return np.degrees(np.arccos(cos)) - - a = _angles(v1, v2) - b = _angles(-v1, v3) - c = _angles(-v2, -v3) - return np.stack([a, b, c], axis=1) - - -def dfs(v: int, graph: dict, path: list, colors: dict) -> None: - """Depth-first search for cycle detection. - - Parameters - ---------- - v : int - Current vertex. - graph : dict - Adjacency list representation of graph. - path : list - Current path being built. - colors : dict - Vertex visit status (0=unvisited, 1=visited). - """ - colors[v] = 1 - path.append(v) - for d in graph[v]: - if colors[d] == 0: - dfs(d, graph, path, colors) - - -class BoundaryIdentification: - """Delaunay triangulation-based polygon boundary extraction. - - This class implements a two-phase iterative algorithm for extracting - cell boundaries from transcript point clouds: - - 1. Phase 1: Remove long boundary edges (> 2 * d_max) - 2. Phase 2: Remove boundary edges with extreme angles - - Parameters - ---------- - data : np.ndarray - 2D point coordinates, shape (N, 2). - """ - - def __init__(self, data: np.ndarray): - self.graph = None - self.edges = {} - self.d = Delaunay(data) - self.d_max = self.calculate_d_max(self.d.points) - self.generate_edges() - - def generate_edges(self) -> None: - """Generate edge dictionary from Delaunay triangulation.""" - d = self.d - edges = {} - angles = triangle_angles_from_points(d.points, d.simplices) - - for index, simplex in enumerate(d.simplices): - for p in range(3): - edge = tuple(sorted((simplex[p], simplex[(p + 1) % 3]))) - if edge not in edges: - edges[edge] = {"simplices": {}} - edges[edge]["simplices"][index] = angles[index][(p + 2) % 3] - - edges_coordinates = d.points[np.array(list(edges.keys()))] - edges_length = np.sqrt( - (edges_coordinates[:, 1, 0] - edges_coordinates[:, 0, 0]) ** 2 - + (edges_coordinates[:, 1, 1] - edges_coordinates[:, 0, 1]) ** 2 - ) - - for edge, coords, length in zip(edges, edges_coordinates, edges_length): - edges[edge]["coords"] = coords - edges[edge]["length"] = length - - self.edges = edges - - def calculate_part_1(self, plot: bool = False) -> None: - """Phase 1: Remove long boundary edges iteratively. - - Removes edges longer than 2 * d_max from the boundary. - - Parameters - ---------- - plot : bool - Whether to generate visualization (not implemented). - """ - edges = self.edges - d = self.d - d_max = self.d_max - - boundary_edges = [edge for edge in edges if len(edges[edge]["simplices"]) < 2] - - flag = True - while flag: - flag = False - next_boundary_edges = [] - - for current_edge in boundary_edges: - if current_edge not in edges: - continue - - if edges[current_edge]["length"] > 2 * d_max: - if len(edges[current_edge]["simplices"].keys()) == 0: - del edges[current_edge] - continue - - simplex_id = list(edges[current_edge]["simplices"].keys())[0] - simplex = d.simplices[simplex_id] - - for edge in self.get_edges_from_simplex(simplex): - if edge != current_edge: - edges[edge]["simplices"].pop(simplex_id) - next_boundary_edges.append(edge) - - del edges[current_edge] - flag = True - else: - next_boundary_edges.append(current_edge) - - boundary_edges = next_boundary_edges - - def calculate_part_2(self, plot: bool = False) -> None: - """Phase 2: Remove boundary edges with extreme angles. - - Removes edges where the opposite angle is too large, indicating - a concave region that should be excluded. - - Parameters - ---------- - plot : bool - Whether to generate visualization (not implemented). - """ - edges = self.edges - d = self.d - d_max = self.d_max - - boundary_edges = [edge for edge in edges if len(edges[edge]["simplices"]) < 2] - boundary_edges_length = len(boundary_edges) - next_boundary_edges = [] - - while len(next_boundary_edges) != boundary_edges_length: - next_boundary_edges = [] - - for current_edge in boundary_edges: - if current_edge not in edges: - continue - - if len(edges[current_edge]["simplices"].keys()) == 0: - del edges[current_edge] - continue - - simplex_id = list(edges[current_edge]["simplices"].keys())[0] - simplex = d.simplices[simplex_id] - - # Remove if edge is long with large angle, or if angle is very obtuse - if ( - edges[current_edge]["length"] > 1.5 * d_max - and edges[current_edge]["simplices"][simplex_id] > 90 - ) or edges[current_edge]["simplices"][simplex_id] > 180 - 180 / 16: - - for edge in self.get_edges_from_simplex(simplex): - if edge != current_edge: - edges[edge]["simplices"].pop(simplex_id) - next_boundary_edges.append(edge) - - del edges[current_edge] - else: - next_boundary_edges.append(current_edge) - - boundary_edges_length = len(boundary_edges) - boundary_edges = next_boundary_edges - - def find_cycles(self) -> Union[Polygon, MultiPolygon, None]: - """Find boundary cycles and convert to Shapely geometry. - - Returns - ------- - Union[Polygon, MultiPolygon, None] - Polygon if single cycle, MultiPolygon if multiple, None on error. - """ - e = self.edges - boundary_edges = [edge for edge in e if len(e[edge]["simplices"]) < 2] - self.graph = self.generate_graph(boundary_edges) - cycles = self.get_cycles(self.graph) - - try: - if len(cycles) == 1: - geom = Polygon(self.d.points[cycles[0]]) - else: - geom = MultiPolygon( - [Polygon(self.d.points[c]) for c in cycles if len(c) >= 3] - ) - except Exception: - return None - - return geom - - @staticmethod - def calculate_d_max(points: np.ndarray) -> float: - """Calculate maximum nearest-neighbor distance. - - Parameters - ---------- - points : np.ndarray - Point coordinates, shape (N, 2). - - Returns - ------- - float - Maximum nearest-neighbor distance. - """ - index = rtree.index.Index() - for i, p in enumerate(points): - index.insert(i, p[[0, 1, 0, 1]]) - - short_edges = [] - for i, p in enumerate(points): - res = list(index.nearest(p[[0, 1, 0, 1]], 2))[-1] - short_edges.append([i, res]) - - nearest_points = points[short_edges] - nearest_dists = np.sqrt( - (nearest_points[:, 0, 0] - nearest_points[:, 1, 0]) ** 2 - + (nearest_points[:, 0, 1] - nearest_points[:, 1, 1]) ** 2 - ) - return nearest_dists.max() - - @staticmethod - def get_edges_from_simplex(simplex: np.ndarray) -> list: - """Extract edge tuples from a triangle simplex. - - Parameters - ---------- - simplex : np.ndarray - Triangle vertex indices, shape (3,). - - Returns - ------- - list - List of edge tuples. - """ - edges = [] - for p in range(3): - edges.append(tuple(sorted((simplex[p], simplex[(p + 1) % 3])))) - return edges - - @staticmethod - def generate_graph(edges: list) -> dict: - """Generate adjacency list from edge list. - - Parameters - ---------- - edges : list - List of edge tuples. - - Returns - ------- - dict - Adjacency list representation. - """ - vertices = set() - for edge in edges: - vertices.add(edge[0]) - vertices.add(edge[1]) - - vertices = sorted(list(vertices)) - graph = {v: [] for v in vertices} - - for e in edges: - graph[e[0]].append(e[1]) - graph[e[1]].append(e[0]) - - return graph - - @staticmethod - def get_cycles(graph: dict) -> list: - """Find all connected components (cycles) in boundary graph. - - Parameters - ---------- - graph : dict - Adjacency list representation. - - Returns - ------- - list - List of cycles (each cycle is a list of vertex indices). - """ - colors = {v: 0 for v in graph} - cycles = [] - - for v in graph.keys(): - if colors[v] == 0: - cycle = [] - dfs(v, graph, cycle, colors) - cycles.append(cycle) - - return cycles - - -def generate_boundary( - df: Union[pd.DataFrame, pl.DataFrame], - x: str = "x", - y: str = "y", -) -> Union[Polygon, MultiPolygon, None]: - """Generate boundary polygon for a single cell's transcripts. - - Uses Delaunay triangulation with iterative edge refinement to produce - more accurate boundaries than simple convex hulls. - - Parameters - ---------- - df : Union[pd.DataFrame, pl.DataFrame] - Transcript data with x, y coordinates. - x : str - Column name for x coordinate. - y : str - Column name for y coordinate. - - Returns - ------- - Union[Polygon, MultiPolygon, None] - Cell boundary geometry, or None if insufficient points. - """ - # Convert Polars to pandas if needed - if isinstance(df, pl.DataFrame): - df = df.to_pandas() - - if len(df) < 3: - return None - - bi = BoundaryIdentification(df[[x, y]].values) - bi.calculate_part_1(plot=False) - bi.calculate_part_2(plot=False) - return bi.find_cycles() - - -def generate_boundaries( - df: Union[pd.DataFrame, pl.DataFrame], - x: str = "x", - y: str = "y", - cell_id: str = "seg_cell_id", - n_jobs: int = 1, - chunksize: int = 8, - progress: bool = True, -) -> gpd.GeoDataFrame: - """Generate boundaries for all cells in a segmentation result. - - Parameters - ---------- - df : Union[pd.DataFrame, pl.DataFrame] - Transcript data with cell assignments. - x : str - Column name for x coordinate. - y : str - Column name for y coordinate. - cell_id : str - Column name for cell ID. - - Returns - ------- - gpd.GeoDataFrame - GeoDataFrame with cell_id, length, and geometry columns. - """ - def iter_groups() -> Tuple[Iterable[Tuple[object, np.ndarray]], int]: - if isinstance(df, pl.DataFrame): - grouped = df.group_by(cell_id).agg( - [ - pl.col(x).list().alias("_x"), - pl.col(y).list().alias("_y"), - ] - ) - total = grouped.height - - def _gen(): - for cid, xs, ys in grouped.iter_rows(): - yield cid, np.column_stack((xs, ys)) - - return _gen(), total - - group_df = df.groupby(cell_id) - total = group_df.ngroups - - def _gen(): - for cid, t in group_df: - yield cid, t[[x, y]].to_numpy() - - return _gen(), total - - def _compute_one(item: Tuple[object, np.ndarray]) -> Tuple[object, int, Union[Polygon, MultiPolygon, None]]: - cid, points = item - n_unique_points = np.unique(points, axis=0).shape[0] - if n_unique_points < 3: - return cid, n_unique_points, None - try: - bi = BoundaryIdentification(points) - bi.calculate_part_1(plot=False) - bi.calculate_part_2(plot=False) - geom = bi.find_cycles() - except Exception: - geom = None - return cid, n_unique_points, geom - - group_iter, total = iter_groups() - res = [] - - if n_jobs and n_jobs > 1: - with ThreadPoolExecutor(max_workers=n_jobs) as ex: - iterator = ex.map(_compute_one, group_iter, chunksize=chunksize) - if progress: - iterator = tqdm(iterator, total=total, desc="Generating boundaries") - for cid, length, geom in iterator: - res.append({"cell_id": cid, "length": length, "geom": geom}) - else: - iterator = group_iter - if progress: - iterator = tqdm(iterator, total=total, desc="Generating boundaries") - for item in iterator: - cid, length, geom = _compute_one(item) - res.append({"cell_id": cid, "length": length, "geom": geom}) - - return gpd.GeoDataFrame( - data=[[b["cell_id"], b["length"]] for b in res], - geometry=[b["geom"] for b in res], - columns=["cell_id", "length"], - ) - - -def extract_largest_polygon( - geom: Union[Polygon, MultiPolygon, None], -) -> Union[Polygon, None]: - """Extract the largest polygon from a geometry. - - Parameters - ---------- - geom : Union[Polygon, MultiPolygon, None] - Input geometry. - - Returns - ------- - Union[Polygon, None] - Largest polygon, or None if input is None. - """ - if geom is None: - return None - if getattr(geom, "is_empty", False): - return None - if isinstance(geom, MultiPolygon): - candidates = [p for p in geom.geoms if p is not None and not p.is_empty] - if not candidates: - return None - return max(candidates, key=lambda p: p.area) - return geom diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml index 0d10791..1f33866 100644 --- a/src/methods/segger/config.vsh.yaml +++ b/src/methods/segger/config.vsh.yaml @@ -66,10 +66,6 @@ arguments: resources: - type: python_script path: script.py - # Vendored from dpeerlab/segger @ v2-incremental — provides the canonical - # Delaunay-based `generate_boundaries`. Model install stays on `main`. - - type: file - path: boundary.py engines: - type: docker @@ -107,9 +103,6 @@ engines: - geopandas - shapely - scikit-image - # rtree + tqdm are imported by the vendored boundary.py. - - rtree - - tqdm # Pre-fetch the Cellpose model weights so runtime doesn't hit the # network inside the Nextflow worker. # - type: python diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py index 39a78c8..46d25f0 100644 --- a/src/methods/segger/script.py +++ b/src/methods/segger/script.py @@ -1,7 +1,6 @@ import os import shutil import subprocess -import sys from pathlib import Path import anndata as ad @@ -13,12 +12,8 @@ import torch import xarray as xr from shapely.geometry import MultiPoint, Polygon -from spatialdata.models import Labels2DModel, ShapesModel -from spatialdata.transformations import Identity, get_transformation - -# Sibling vendored module — see boundary.py header. -sys.path.insert(0, str(Path(__file__).parent)) -from boundary import generate_boundaries, extract_largest_polygon # noqa: E402 +from spatialdata.models import Labels2DModel +from spatialdata.transformations import get_transformation device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("Using device:", device, flush=True) @@ -253,96 +248,37 @@ def _run_segger(xenium_dir: Path, output_dir: Path) -> Path: return pq -_BOUNDARY_WORKER_LADDER = (16, 8, 4, 2) - - -def _compute_cell_boundaries(assigned_tx: pd.DataFrame) -> gpd.GeoDataFrame: - """Build a per-cell smooth boundary polygon from the transcripts segger - assigned to that cell, using the vendored Delaunay-based builder. Tries - `n_jobs` in {16, 8, 4, 2} order and falls back on each failure so that - threading saturation or low-core hosts don't make the whole step fail. - - `assigned_tx` must carry columns 'x', 'y', 'segger_cell_id'.""" - last_err: Exception | None = None - for n_jobs in _BOUNDARY_WORKER_LADDER: - try: - return generate_boundaries( - assigned_tx, - x="x", - y="y", - cell_id="segger_cell_id", - n_jobs=n_jobs, - progress=False, - ) - except Exception as err: # noqa: BLE001 — fallback path is intentional - last_err = err - print( - f"generate_boundaries failed with n_jobs={n_jobs}: {err!r}; " - "retrying with fewer workers.", - flush=True, - ) - assert last_err is not None - raise RuntimeError("generate_boundaries failed at all worker counts") from last_err - - -def _rasterize_cell_boundaries( - boundaries: gpd.GeoDataFrame, image_element, label_col: str +def _rasterize_assigned_transcripts( + assigned_tx: pd.DataFrame, image_element ) -> np.ndarray: - """Burn cell boundaries onto the image grid as an integer label image. - - Polygons are expected in global coordinates; the image element's - global-to-pixel affine is applied before rasterizing. MultiPolygons - are collapsed to their largest polygon (`extract_largest_polygon`) - before drawing.""" - from skimage.draw import polygon as draw_polygon - + """Paint each segger-assigned transcript's pixel with its `segger_cell_id`. + + This is the minimum label image that satisfies the repo contract. + `process_prediction` derives the cells x genes table by indexing the + label image at `(int(t.y), int(t.x))` for every input transcript and + grouping; painting each assigned transcript's truncated pixel with + its cell id makes that lookup return the right cell. Unassigned + transcripts fall on un-painted background (0) and are correctly + excluded by `process_prediction`'s `cell_id != 0` filter. + + O(N) in #assigned transcripts: one affine, one int cast, one fancy + indexed write. No tree, no grid scan, no per-cell loop. + """ H, W = image_element.shape[-2:] M_g2p = _affine_global_to_pixel(image_element) + xy_pix = _apply_affine(M_g2p, assigned_tx[["x", "y"]].to_numpy()) + # Match process_prediction's int64() cast (truncation toward zero). + ys = xy_pix[:, 1].astype(np.int64) + xs = xy_pix[:, 0].astype(np.int64) + inside = (ys >= 0) & (ys < H) & (xs >= 0) & (xs < W) + ys = ys[inside] + xs = xs[inside] + cell_ids = assigned_tx["segger_cell_id"].to_numpy(dtype=np.int64)[inside] labels = np.zeros((H, W), dtype=np.uint32) - for cid, geom in zip( - boundaries[label_col].to_numpy(), boundaries.geometry.to_numpy() - ): - poly = extract_largest_polygon(geom) - if poly is None or poly.is_empty: - continue - ring = np.asarray(poly.exterior.coords) - pix = _apply_affine(M_g2p, ring) - rr, cc = draw_polygon(pix[:, 1], pix[:, 0], shape=labels.shape) - labels[rr, cc] = int(cid) + labels[ys, xs] = cell_ids.astype(np.uint32) return labels -def _build_per_cell_table( - assigned_tx: pd.DataFrame, dataset_id: str, method_id: str -) -> ad.AnnData: - """Build a cells x genes count AnnData from per-transcript segger - assignments. Mirrors what `process_prediction` will re-derive from the - label image, but bundling it here keeps `prediction.zarr` self-contained - for QC and analysis scripts that consume it directly.""" - if assigned_tx.empty: - return ad.AnnData( - uns={"dataset_id": dataset_id, "method_id": method_id} - ) - counts = ( - assigned_tx.groupby(["segger_cell_id", "feature_name"]) - .size() - .unstack(fill_value=0) - ) - obs = pd.DataFrame( - {"cell_id": counts.index.astype(str)}, - index=counts.index.astype(str), - ) - obs["region"] = pd.Categorical(["segmentation"] * len(obs)) - var = pd.DataFrame(index=counts.columns.astype(str)) - var.index.name = "feature_name" - var["feature_name"] = var.index - table = ad.AnnData(X=counts.values.astype(np.float32), obs=obs, var=var) - table.layers["counts"] = table.X.copy() - table.uns["dataset_id"] = dataset_id - table.uns["method_id"] = method_id - return table - - # ----------------------------- main -------------------------------- # input_path = Path(par["input"]) @@ -406,16 +342,14 @@ def _build_per_cell_table( ) print(f"kept assignments: {seg.height}", flush=True) -# --- Step 3: rebuild the cell footprint from segger's assignments --- -# Replaces the older majority-vote-over-initial-mask approach with the -# team's canonical Delaunay alpha-shape boundaries (see boundary.py), -# rasterized as the segmentation label image. This way the label image -# encodes segger's FULL per-cell transcript footprint — including -# transcripts that segger rescued outside the initial nucleus — instead -# of being clipped back to nucleus shape. +# --- Step 3: build the minimal label image required by the repo --- +# README spec is: labels.segmentation + tables.table with +# uns.dataset_id and uns.method_id. process_prediction derives the +# cells x genes table itself by `label_image[int(t.y), int(t.x)]` for +# every input transcript. So the smallest valid label image is one +# where each segger-assigned transcript's truncated pixel carries its +# segger_cell_id; everything else stays background. dataset_id = sdata.tables["table"].uns["dataset_id"] -shapes_out: dict = {} -table: ad.AnnData if seg.height == 0: print( @@ -424,84 +358,20 @@ def _build_per_cell_table( flush=True, ) final_labels = initial_labels.astype(np.int64) - table = ad.AnnData(uns={"dataset_id": dataset_id, "method_id": meta["name"]}) else: row_idx = seg["row_index"].to_numpy() segger_cell = seg["segger_cell_id"].to_numpy() - xy_global = tx_pd.loc[row_idx, ["x", "y"]].to_numpy() assigned_tx = pd.DataFrame({ - "x": xy_global[:, 0], - "y": xy_global[:, 1], - "feature_name": tx_pd.loc[row_idx, "feature_name"].astype(str).to_numpy(), + "x": tx_pd.loc[row_idx, "x"].to_numpy(), + "y": tx_pd.loc[row_idx, "y"].to_numpy(), "segger_cell_id": np.asarray(segger_cell, dtype=np.int64), }) print( - f"computing boundaries for {assigned_tx['segger_cell_id'].nunique()} cells " - f"from {len(assigned_tx)} assigned transcripts", - flush=True, - ) - try: - boundaries_gdf = _compute_cell_boundaries(assigned_tx) - except Exception as err: # noqa: BLE001 — fall back to the initial mask - print( - f"WARNING: boundary computation failed ({err!r}); falling back " - "to the initial nucleus mask for the label image.", - flush=True, - ) - boundaries_gdf = None - - valid = ( - boundaries_gdf.dropna(subset=["geometry"]).copy() - if boundaries_gdf is not None - else None - ) - if valid is not None and not valid.empty: - valid = valid[ - valid.geometry.apply(lambda g: g is not None and not g.is_empty) - ] - print( - "generate_boundaries: " - f"{0 if valid is None else len(valid)} usable polygons", + f"painting {len(assigned_tx)} assigned transcripts across " + f"{assigned_tx['segger_cell_id'].nunique()} cells", flush=True, ) - - if valid is None or valid.empty: - print( - "WARNING: no smooth boundaries produced — using the initial " - "nucleus mask as the label image.", - flush=True, - ) - final_labels = initial_labels.astype(np.int64) - else: - # Collapse any MultiPolygons to their largest component once and reuse - # the result for both the rasterized label image and the shapes export. - valid = valid.assign( - geometry=valid.geometry.apply(extract_largest_polygon), - cell_id=valid["cell_id"].astype(np.int64), - ) - valid = valid[valid.geometry.apply(lambda g: g is not None and not g.is_empty)] - final_labels = _rasterize_cell_boundaries( - valid, image_el, label_col="cell_id" - ) - # Restrict the per-cell table to cells whose boundary actually - # rendered so the table and label image agree on cell membership. - rendered = set(valid["cell_id"].tolist()) - assigned_tx = assigned_tx[ - assigned_tx["segger_cell_id"].isin(rendered) - ] - boundaries_global = gpd.GeoDataFrame( - { - "cell_id": valid["cell_id"].astype(str).to_numpy(), - "geometry": valid.geometry.to_numpy(), - }, - geometry="geometry", - ) - shapes_out["cell_boundaries"] = ShapesModel.parse( - boundaries_global, - transformations={"global": Identity()}, - ) - - table = _build_per_cell_table(assigned_tx, dataset_id, meta["name"]) + final_labels = _rasterize_assigned_transcripts(assigned_tx, image_el) final_labels = _to_lower_uint(final_labels) @@ -513,8 +383,14 @@ def _build_per_cell_table( transformations=image_transform, ), }, - shapes=shapes_out if shapes_out else None, - tables={"table": table}, + tables={ + "table": ad.AnnData( + uns={ + "dataset_id": dataset_id, + "method_id": meta["name"], + } + ), + }, ) print(f"Saving output: {output_path}", flush=True) From 4b3ddc85de96767685cab3a13df745f755379607 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 13:10:43 +0200 Subject: [PATCH 17/26] Set production defaults for --init_segmentation and --cellpose_diameter Both args only had `info.test_default` (consumed by viash test) and no production `default`, so Nextflow runs of the workflow passed nothing to the component and `par["init_segmentation"]` arrived as None, which fell through to `raise ValueError("Unknown init_segmentation mode: None")`. Set the production defaults to match what tests already used (auto and 30). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/config.vsh.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml index 1f33866..57c568f 100644 --- a/src/methods/segger/config.vsh.yaml +++ b/src/methods/segger/config.vsh.yaml @@ -31,6 +31,7 @@ arguments: - name: --init_segmentation type: string choices: [auto, cellpose, transcript_cell_id] + default: auto description: | Source of the initial boundary node set that segger trains against. 'auto' uses the transcripts' `cell_id` prior column when it is @@ -38,8 +39,6 @@ arguments: 'cellpose' always runs Cellpose. 'transcript_cell_id' requires a `cell_id` column on the transcripts and builds one convex-hull polygon per cell id. - info: - test_default: auto - name: --n_epochs type: integer default: 20 @@ -59,9 +58,8 @@ arguments: description: "Which polygon set drives the prediction graph." - name: --cellpose_diameter type: double + default: 30 description: "Cell diameter (pixels) for the Cellpose nucleus pre-segmentation." - info: - test_default: 30 resources: - type: python_script From 49b79386fbab4c5ae01a5defa371f814e5e12f1b Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 13:14:36 +0200 Subject: [PATCH 18/26] Switch segger raster to cKDTree-Voronoi for filled cell masks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pixel-paint only lit each assigned transcript's own pixel, so the output looked like sparse dots even though `process_prediction`'s pixel-lookup still recovered correct cells x genes counts. Switch to a transcript-Voronoi raster: one cKDTree build over the assigned transcripts plus one batched `tree.query(pixels, workers=-1)` parallelizes the per-pixel nearest-neighbor lookup across all cores in C. A density-based distance cutoff (5 * median nearest-neighbor distance, floored at 5px) keeps cells from bleeding into empty space. Same accuracy as pixel-paint for transcript-lookup metrics (each transcript is closest to itself, so its own pixel lookup returns its own cell), but now produces filled cell footprints — usable for visualization and any future shape-based metric. Model invocation is unchanged: segger still installs from dpeerlab/segger@main with the same CLI flags. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/script.py | 97 ++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 32 deletions(-) diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py index 46d25f0..853a3fb 100644 --- a/src/methods/segger/script.py +++ b/src/methods/segger/script.py @@ -248,34 +248,66 @@ def _run_segger(xenium_dir: Path, output_dir: Path) -> Path: return pq -def _rasterize_assigned_transcripts( - assigned_tx: pd.DataFrame, image_element +_PIXELS_PER_QUERY_CHUNK = 20_000_000 # cap per-chunk memory at ~160 MB of int64 + + +def _rasterize_via_transcript_voronoi( + assigned_tx: pd.DataFrame, + image_element, + dist_cutoff_factor: float = 5.0, + min_dist_cutoff_px: float = 5.0, ) -> np.ndarray: - """Paint each segger-assigned transcript's pixel with its `segger_cell_id`. - - This is the minimum label image that satisfies the repo contract. - `process_prediction` derives the cells x genes table by indexing the - label image at `(int(t.y), int(t.x))` for every input transcript and - grouping; painting each assigned transcript's truncated pixel with - its cell id makes that lookup return the right cell. Unassigned - transcripts fall on un-painted background (0) and are correctly - excluded by `process_prediction`'s `cell_id != 0` filter. - - O(N) in #assigned transcripts: one affine, one int cast, one fancy - indexed write. No tree, no grid scan, no per-cell loop. - """ + """Rasterize the segmentation by assigning every pixel to the cell of + its nearest segger-assigned transcript (transcript-Voronoi). + + Gives filled cell footprints — not just isolated transcript pixels — + while remaining cheap: one cKDTree build over the assigned transcripts + plus one batched `tree.query(pixels, workers=-1)` that parallelizes + across all cores in C. Equivalent in accuracy to a per-cell Delaunay + alpha-shape for anything `process_prediction` measures (a transcript + is always closest to itself, so its pixel lookup returns its own + cell). + + Pixels farther than `dist_cutoff_factor * median_nearest_neighbor_dist` + (in pixel units, lower-bounded by `min_dist_cutoff_px`) from any + assigned transcript stay 0 — cells don't bleed into empty regions. + + `assigned_tx` must carry columns 'x', 'y', 'segger_cell_id'.""" + from scipy.spatial import cKDTree + H, W = image_element.shape[-2:] M_g2p = _affine_global_to_pixel(image_element) xy_pix = _apply_affine(M_g2p, assigned_tx[["x", "y"]].to_numpy()) - # Match process_prediction's int64() cast (truncation toward zero). - ys = xy_pix[:, 1].astype(np.int64) - xs = xy_pix[:, 0].astype(np.int64) - inside = (ys >= 0) & (ys < H) & (xs >= 0) & (xs < W) - ys = ys[inside] - xs = xs[inside] - cell_ids = assigned_tx["segger_cell_id"].to_numpy(dtype=np.int64)[inside] + cell_ids = assigned_tx["segger_cell_id"].to_numpy(dtype=np.int64) + + tree = cKDTree(xy_pix) + + sample = min(20_000, len(xy_pix)) + if sample < 2: + d_cutoff = float(min_dist_cutoff_px) + else: + rng = np.random.default_rng(0) + idx = rng.choice(len(xy_pix), sample, replace=False) + nn_d, _ = tree.query(xy_pix[idx], k=2, workers=-1) + median_nn = float(np.median(nn_d[:, 1])) + d_cutoff = max(dist_cutoff_factor * median_nn, float(min_dist_cutoff_px)) + print( + f"transcript-Voronoi raster: {len(xy_pix)} transcripts, " + f"{H}x{W} grid, d_cutoff={d_cutoff:.2f}px", + flush=True, + ) + labels = np.zeros((H, W), dtype=np.uint32) - labels[ys, xs] = cell_ids.astype(np.uint32) + rows_per_chunk = max(1, _PIXELS_PER_QUERY_CHUNK // max(W, 1)) + xs_row = np.arange(W) + for y0 in range(0, H, rows_per_chunk): + y1 = min(H, y0 + rows_per_chunk) + ys = np.arange(y0, y1) + gx, gy = np.meshgrid(xs_row, ys) + pts = np.column_stack([gx.ravel(), gy.ravel()]) + d, idx = tree.query(pts, k=1, workers=-1) + chunk = np.where(d > d_cutoff, 0, cell_ids[idx]).astype(np.uint32) + labels[y0:y1, :] = chunk.reshape(y1 - y0, W) return labels @@ -342,13 +374,14 @@ def _rasterize_assigned_transcripts( ) print(f"kept assignments: {seg.height}", flush=True) -# --- Step 3: build the minimal label image required by the repo --- -# README spec is: labels.segmentation + tables.table with -# uns.dataset_id and uns.method_id. process_prediction derives the -# cells x genes table itself by `label_image[int(t.y), int(t.x)]` for -# every input transcript. So the smallest valid label image is one -# where each segger-assigned transcript's truncated pixel carries its -# segger_cell_id; everything else stays background. +# --- Step 3: rasterize segger's per-cell footprint --- +# Build the label image as a transcript-Voronoi: every pixel is the +# cell of the nearest segger-assigned transcript, with a density-based +# distance cutoff so cells don't bleed into empty regions. Cheap (one +# cKDTree build + one batched parallel query) and gives filled cell +# masks instead of sparse transcript dots. Equivalent in accuracy to a +# per-cell Delaunay alpha-shape for what `process_prediction`'s pixel +# lookup measures. dataset_id = sdata.tables["table"].uns["dataset_id"] if seg.height == 0: @@ -367,11 +400,11 @@ def _rasterize_assigned_transcripts( "segger_cell_id": np.asarray(segger_cell, dtype=np.int64), }) print( - f"painting {len(assigned_tx)} assigned transcripts across " + f"rasterizing {len(assigned_tx)} assigned transcripts across " f"{assigned_tx['segger_cell_id'].nunique()} cells", flush=True, ) - final_labels = _rasterize_assigned_transcripts(assigned_tx, image_el) + final_labels = _rasterize_via_transcript_voronoi(assigned_tx, image_el) final_labels = _to_lower_uint(final_labels) From 113079d23b7350a5783fcc231804dbbb1b6dbda1 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 14:23:56 +0200 Subject: [PATCH 19/26] Replace segger's opencv-python with headless wheel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit segger.io.utils imports cv2 at module load. dpeerlab/segger@main pins the GUI build (`opencv-python`) in its pyproject.toml, which dlopens libxcb.so.1 — the NGC pytorch:25.06-py3 base image ships no X11 stack so `segger segment` crashed at startup with `ImportError: libxcb.so.1: cannot open shared object file`. Right after the segger install, uninstall any GUI opencv variants and install opencv-python-headless. Same `cv2` module name, no X11/GTK deps. No code changes required. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/config.vsh.yaml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml index 57c568f..5c41054 100644 --- a/src/methods/segger/config.vsh.yaml +++ b/src/methods/segger/config.vsh.yaml @@ -111,10 +111,16 @@ engines: # need the spatialdata-loader branch — segger@main is enough. - type: python github: dpeerlab/segger - # - type: docker - # run: | - # pip install --no-cache-dir \ - # "git+https://github.com/dpeerlab/segger.git@main#egg=segger" + # Replace segger's GUI opencv-python dependency with the headless + # wheel. segger.io.utils does `import cv2` which dlopens libxcb.so.1; + # the NGC pytorch base image ships no X11 stack and the original + # opencv-python wheel fails with ImportError at every CLI startup. + # opencv-python-headless exposes the same `cv2` module without the + # X11/GTK deps, so the import resolves and nothing else changes. + - type: docker + run: | + pip uninstall -y opencv-python opencv-contrib-python 2>/dev/null || true + pip install --no-cache-dir opencv-python-headless # __merge__: /src/base/setup_spatialdata_partial.yaml - type: native From a809166a16b2486a9a6922894234952911c0d86b Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 May 2026 14:42:44 +0200 Subject: [PATCH 20/26] add missing packages Signed-off-by: Robrecht Cannoodt --- src/methods/segger/config.vsh.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml index 5c41054..72394af 100644 --- a/src/methods/segger/config.vsh.yaml +++ b/src/methods/segger/config.vsh.yaml @@ -74,6 +74,8 @@ engines: packages: - procps - git + - libxcb1 + - libgl1 - type: python packages: - anndata~=0.12.0 From 8bdad1f7bd590b6a9791441fc67fda70a0a9aa37 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 May 2026 14:43:46 +0200 Subject: [PATCH 21/26] fix config Signed-off-by: Robrecht Cannoodt --- src/methods/segger/config.vsh.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml index 72394af..29b1dd8 100644 --- a/src/methods/segger/config.vsh.yaml +++ b/src/methods/segger/config.vsh.yaml @@ -120,9 +120,9 @@ engines: # opencv-python-headless exposes the same `cv2` module without the # X11/GTK deps, so the import resolves and nothing else changes. - type: docker - run: | - pip uninstall -y opencv-python opencv-contrib-python 2>/dev/null || true - pip install --no-cache-dir opencv-python-headless + run: + - pip uninstall -y opencv-python opencv-contrib-python 2>/dev/null || true + - pip install --no-cache-dir opencv-python-headless # __merge__: /src/base/setup_spatialdata_partial.yaml - type: native From e4d6a330037f82f288ae7175245a4a368f922096 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 May 2026 14:44:22 +0200 Subject: [PATCH 22/26] removed unneeded deps? Signed-off-by: Robrecht Cannoodt --- src/methods/segger/config.vsh.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml index 29b1dd8..4fa6592 100644 --- a/src/methods/segger/config.vsh.yaml +++ b/src/methods/segger/config.vsh.yaml @@ -74,8 +74,6 @@ engines: packages: - procps - git - - libxcb1 - - libgl1 - type: python packages: - anndata~=0.12.0 From 9dd0a20a40d98e25edd6acff1bafd07ee35d6b7c Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 19 May 2026 15:24:47 +0200 Subject: [PATCH 23/26] revert cv2 issues Signed-off-by: Robrecht Cannoodt --- src/methods/segger/config.vsh.yaml | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/methods/segger/config.vsh.yaml b/src/methods/segger/config.vsh.yaml index a41d94f..58fc585 100644 --- a/src/methods/segger/config.vsh.yaml +++ b/src/methods/segger/config.vsh.yaml @@ -75,6 +75,7 @@ engines: - procps - git - libxcb1 + - libgl1 - type: python packages: - anndata~=0.12.0 @@ -112,16 +113,6 @@ engines: # need the spatialdata-loader branch — segger@main is enough. - type: python github: dpeerlab/segger - # Replace segger's GUI opencv-python dependency with the headless - # wheel. segger.io.utils does `import cv2` which dlopens libxcb.so.1; - # the NGC pytorch base image ships no X11 stack and the original - # opencv-python wheel fails with ImportError at every CLI startup. - # opencv-python-headless exposes the same `cv2` module without the - # X11/GTK deps, so the import resolves and nothing else changes. - - type: docker - run: - - pip uninstall -y opencv-python opencv-contrib-python 2>/dev/null || true - - pip install --no-cache-dir opencv-python-headless # __merge__: /src/base/setup_spatialdata_partial.yaml - type: native From c22dc4b0642fe49b75231ffd87fa221eae005060 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 16:08:20 +0200 Subject: [PATCH 24/26] Write boundary cell_id as string for segger's preprocessor segger.io.preprocessor builds boundary row indexes as `bd[id] + '_' + bd[boundary_type].map({...})`, which requires the id column to be string. We were writing it as int64, so segger crashed with `numpy ufunc 'add' did not contain a loop with signature matching types (dtype('int64'), dtype(' --- src/methods/segger/script.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py index 853a3fb..e3349f4 100644 --- a/src/methods/segger/script.py +++ b/src/methods/segger/script.py @@ -150,13 +150,19 @@ def _rasterize_polygons( def _shapes_to_xenium_vertices(shapes_gdf: gpd.GeoDataFrame) -> pl.DataFrame: """Xenium boundary parquet schema: one row per polygon vertex with - columns (cell_id, vertex_x, vertex_y).""" + columns (cell_id, vertex_x, vertex_y). + + `cell_id` is written as a string to match the native Xenium schema — + segger's preprocessor composes a per-row index like + `cell_id + '_' + boundary_type` and crashes with a numpy `add` ufunc + error if `cell_id` arrives as int64.""" rows = [] for cid, geom in zip(shapes_gdf["cell_id"].to_numpy(), shapes_gdf.geometry.to_numpy()): if geom is None or geom.is_empty or geom.geom_type != "Polygon": continue + cid_str = str(cid) for vx, vy in np.asarray(geom.exterior.coords): - rows.append((int(cid), float(vx), float(vy))) + rows.append((cid_str, float(vx), float(vy))) return pl.DataFrame(rows, schema=["cell_id", "vertex_x", "vertex_y"], orient="row") From 2baf1796a6464e0f3e19600dd1a192ae0b3d3226 Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 16:28:44 +0200 Subject: [PATCH 25/26] Trust per-transcript cell_id prior directly; fail loud on empty signal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rasterize-then-pixel-lookup roundtrip in _build_xenium_layout was silently losing every training signal whenever the prior's convex hulls were too small to render pixels: each transcript looked up its pixel in the rasterized mask, got 0 (background), and was written as "UNASSIGNED". segger's anndata builder then crashed with `cannot infer dimensions from zero sized index arrays` because it had zero non-null cell_ids to build a cells x genes matrix from. For the transcript_cell_id mode the input already carries a per-transcript cell_id prior, so write it directly via the new tx_cell_id_override argument. Cellpose mode still uses the rasterize-and-lookup path because it has no per-transcript prior. Add a sanity check after writing transcripts.parquet: count how many rows came out assigned vs UNASSIGNED, print the breakdown, and raise a clear RuntimeError before launching segger if the training signal is empty — surfaces problems before they become opaque tracebacks deep inside segger's pipeline. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/script.py | 72 +++++++++++++++++++++++++++++++++--- 1 file changed, 67 insertions(+), 5 deletions(-) diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py index e3349f4..16ce60f 100644 --- a/src/methods/segger/script.py +++ b/src/methods/segger/script.py @@ -185,16 +185,39 @@ def _build_xenium_layout( initial_labels: np.ndarray, image_element, out_dir: Path, + tx_cell_id_override: np.ndarray | None = None, ) -> Path: """Materialize a directory that segger's `10x_xenium` preprocessor can consume: `transcripts.parquet` + matching `cell_boundaries.parquet` / `nucleus_boundaries.parquet`. We don't have a real cell vs nucleus split, so the same polygon set is written to both — segger then drives the - prediction graph off whichever one `--prediction-mode` selects.""" + prediction graph off whichever one `--prediction-mode` selects. + + `tx_cell_id_override` is the per-transcript training assignment as a + string array ("UNASSIGNED" sentinel for unknown). When supplied (the + `transcript_cell_id` init mode, which carries a vendor cell_id prior + on the transcripts) we skip the rasterize-then-pixel-lookup roundtrip + and trust the prior directly — otherwise tiny convex hulls would + rasterize to zero pixels and silently mark every transcript as + UNASSIGNED, which made segger's anndata builder explode on empty input. + When omitted (the Cellpose path) we derive assignments by looking + each transcript up in the rasterized nucleus mask as before.""" out_dir.mkdir(parents=True, exist_ok=True) - tx_init = _initial_cell_id_per_transcript(tx_pd, initial_labels, image_element) - cell_id_str = np.where(tx_init > 0, tx_init.astype(str), "UNASSIGNED") + if tx_cell_id_override is not None: + cell_id_str = np.asarray(tx_cell_id_override, dtype=object) + cell_id_str = np.where( + cell_id_str == None, # noqa: E711 — np.where needs ==, not `is` + "UNASSIGNED", + cell_id_str.astype(str), + ) + overlaps_nucleus = (cell_id_str != "UNASSIGNED").astype(np.int8) + else: + tx_init = _initial_cell_id_per_transcript( + tx_pd, initial_labels, image_element + ) + cell_id_str = np.where(tx_init > 0, tx_init.astype(str), "UNASSIGNED") + overlaps_nucleus = (tx_init > 0).astype(np.int8) tx_out = pd.DataFrame({ "transcript_id": ( @@ -214,7 +237,7 @@ def _build_xenium_layout( # Treat the entire mask as nucleus (overlaps_nucleus=1); transcripts # off the mask get 0. segger only uses this when prediction_mode # distinguishes cell vs nucleus. - "overlaps_nucleus": (tx_init > 0).astype(np.int8), + "overlaps_nucleus": overlaps_nucleus, }) if "z" in tx_pd.columns: tx_out.insert(3, "z_location", tx_pd["z"].to_numpy().astype(np.float32)) @@ -346,9 +369,20 @@ def _rasterize_via_transcript_voronoi( initial_labels: np.ndarray shapes_gdf: gpd.GeoDataFrame +tx_cell_id_override: np.ndarray | None = None if mode == "transcript_cell_id": shapes_gdf = _polygons_from_cell_ids(tx_pd) initial_labels = _rasterize_polygons(shapes_gdf, image_el, label_col="cell_id") + # Trust the input's per-transcript cell_id prior directly for segger's + # training signal — converting via rasterize-then-pixel-lookup loses + # transcripts whose cell's convex hull rasterizes to zero pixels (tiny + # hulls / off-grid centroids) and silently marks them UNASSIGNED. + valid = _valid_cell_id_mask(tx_pd["cell_id"]) + tx_cell_id_override = np.where( + valid.to_numpy(), + tx_pd["cell_id"].astype(str).to_numpy(), + "UNASSIGNED", + ) elif mode == "cellpose": initial_labels, shapes_gdf = _polygons_from_cellpose(image_el, par["cellpose_diameter"]) else: @@ -362,7 +396,35 @@ def _rasterize_via_transcript_voronoi( # initial mask id as `cell_id` (segger's training signal), and the polygons # are written to both cell_boundaries.parquet and nucleus_boundaries.parquet # since segger expects both Xenium files to exist. -_build_xenium_layout(tx_pd, shapes_gdf, initial_labels, image_el, xenium_dir) +_build_xenium_layout( + tx_pd, + shapes_gdf, + initial_labels, + image_el, + xenium_dir, + tx_cell_id_override=tx_cell_id_override, +) + +# Fail loudly with diagnostics if no transcript carries a valid cell +# assignment — segger's anndata builder will crash with an opaque +# `cannot infer dimensions from zero sized index arrays` otherwise. +_tx_written = pd.read_parquet(xenium_dir / "transcripts.parquet", columns=["cell_id"]) +_n_assigned = (_tx_written["cell_id"].astype(str) != "UNASSIGNED").sum() +_n_total = len(_tx_written) +_n_cells = _tx_written.loc[ + _tx_written["cell_id"].astype(str) != "UNASSIGNED", "cell_id" +].nunique() +print( + f"transcripts.parquet: {_n_assigned}/{_n_total} assigned across " + f"{_n_cells} cells", + flush=True, +) +if _n_assigned == 0 or _n_cells == 0: + raise RuntimeError( + "No transcripts carry a valid cell_id assignment — segger has no " + "training signal. Check that the input transcripts have a usable " + "cell_id prior, or rerun with --init_segmentation cellpose." + ) seg_pq = _run_segger(xenium_dir, segger_out_dir) seg = pl.read_parquet(seg_pq) print(f"segger emitted {seg.height} rows", flush=True) From f76c8ab00b68fdcfefa5f951cc8aec675fe39b4c Mon Sep 17 00:00:00 2001 From: Elihei2 Date: Tue, 19 May 2026 16:37:37 +0200 Subject: [PATCH 26/26] Drop transcripts whose cell has no boundary polygon segger's setup_anndata left-joins obs (cell_ids from transcripts.parquet) against the boundaries dataframe by cell_id and then asserts every obs row matched. _polygons_from_cell_ids drops cells with < 3 transcripts (can't make a convex hull), so transcripts in those tiny cells used to ship to segger with a cell_id that had no matching boundary -> NaN obs index -> `AssertionError`. Compute the set of cell_ids that actually have a polygon in shapes_gdf and mark any transcript whose cell didn't make the cut as "UNASSIGNED". Transcripts with valid (cell_id, boundary) pairs keep their assignment as before. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/methods/segger/script.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/methods/segger/script.py b/src/methods/segger/script.py index 16ce60f..3a7e797 100644 --- a/src/methods/segger/script.py +++ b/src/methods/segger/script.py @@ -375,12 +375,20 @@ def _rasterize_via_transcript_voronoi( initial_labels = _rasterize_polygons(shapes_gdf, image_el, label_col="cell_id") # Trust the input's per-transcript cell_id prior directly for segger's # training signal — converting via rasterize-then-pixel-lookup loses - # transcripts whose cell's convex hull rasterizes to zero pixels (tiny - # hulls / off-grid centroids) and silently marks them UNASSIGNED. + # transcripts whose cell's convex hull rasterizes to zero pixels. + # Critically: a transcript's cell_id is only valid if that cell ALSO + # has a boundary polygon in shapes_gdf. _polygons_from_cell_ids skips + # cells with < 3 transcripts (can't make a hull), and segger's + # setup_anndata left-joins obs against boundaries by cell_id — any + # cell present in transcripts but absent from boundaries produces + # NaN obs index and trips the `assert ~obs.index.isna().any()`. + cell_ids_with_boundary = set(shapes_gdf["cell_id"].astype(str).to_numpy()) valid = _valid_cell_id_mask(tx_pd["cell_id"]) + tx_cid_str = tx_pd["cell_id"].astype(str).to_numpy() + has_boundary = np.array([c in cell_ids_with_boundary for c in tx_cid_str]) tx_cell_id_override = np.where( - valid.to_numpy(), - tx_pd["cell_id"].astype(str).to_numpy(), + valid.to_numpy() & has_boundary, + tx_cid_str, "UNASSIGNED", ) elif mode == "cellpose":