diff --git a/Cargo.toml b/Cargo.toml index 28fe422..b4a43e4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ either = "1" eyre = "0.6.12" globset = "0.4.18" gnuplot = "0.0.46" +graphrs = "0.11" hex = { version = "0.4.3", features = ["serde"] } indexmap = "2" itertools = "0.14.0" @@ -35,7 +36,8 @@ mermaid-rs-renderer = { version = "0.1.2", default-features = false } itertools-num = "0.1.3" kernel-density-estimation = "0.2.0" ordered-float = "5.0.0" -petgraph = "0.7" +petgraph = "0.8" +rand = "0.9" regex = "1" pathdiff = "0.2.3" serde = { version = "1.0.217", features = ["derive"] } diff --git a/README.md b/README.md index 60efbc1..ec4c0a0 100644 --- a/README.md +++ b/README.md @@ -10,19 +10,21 @@ Gizmos for working with CSVs * [csvplot](#csvplot) -- line and scatter plots from CSV files * [csvstats](#csvstats) -- histograms and summary statistics for CSV files * [csvcat](#csvcat) -- concatenate CSV files -* [can2k](#can2k) -- parse NMEA 2000 GPS data from CAN logs -* [qgsdir](#qgsdir) -- generate QGIS projects from directories of CSV files * [csvdelta](#csvdelta) -- calculate inter-row deltas for CSV files -* [minpath](#minpath) -- shorten file paths to minimal unique suffixes -* [depconv](#depconv) -- convert dependency graphs between formats -* [depfilter](#depfilter) -- filter or select subsets of dependency graphs -* [deptransform](#deptransform) -- transform dependency graphs +* [can2k](#can2k) -- parse NMEA 2000 GPS data from CAN logs * [can2csv](#can2csv) -- parse CAN logs into CSV files * [canspam](#canspam) -- generate random CAN traffic * [canstruct](#canstruct) -- reconstruct NMEA 2000 Fast Packet / ISO 11783-3 Transport Protocol sessions +* [qgsdir](#qgsdir) -- generate QGIS projects from directories of CSV files +* [depconv](#depconv) -- convert dependency graphs between formats +* [depfilter](#depfilter) -- filter or select subsets of dependency graphs +* [deptransform](#deptransform) -- transform dependency graphs * [depquery](#depquery) -- 
query properties of dependency graphs +* [depcluster](#depcluster) -- cluster dependency graphs using community detection +* [graphdiff](#graphdiff) -- compare two dependency graphs * [bbclasses](#bbclasses) -- generate BitBake recipe inheritance diagrams +* [minpath](#minpath) -- shorten file paths to minimal unique suffixes # Philosophy @@ -123,6 +125,27 @@ foo,bar,baz 7,8,9 ``` +## csvdelta + +Calculate the inter-row deltas for a CSV column. Useful for understanding the time between events. +Also supports mean-centering a column, or centering it around a specific value. + +```sh +$ csvdelta --column foo < > If you want to use `can2k` together with `qgsdir`, you need to use `can2k --wkt`. -## qgsdir +## can2csv -Generate a QGIS project from a directory of CSV layer files. Each CSV file is assumed to have a -column of WKT geometries named `geometry` (QGIS's geometry heuristics don't appear to be exposed via -their Python API). +Parse basic data from a CAN frame into a CSV record. Faster than `sed`, and also parses the canid. +Useful in conjunction with `csvdelta` to understand message timing. + +`can2csv` is not a real CAN parser, and does not understand any of the data transmitted via CAN. ```sh -$ can2k --wkt ./data/n2k-sample.log ./data/n2k.csv -$ qgsdir --open ./data/n2k.csv +$ head -n 3 data/candump-random-data.log | can2csv +timestamp,interface,canid,dlc,priority,src,dst,pgn,data +1739229594.465994,can0,0xE9790B5,8,3,0xB5,0x90,0x29700,CA3F871A5A6EE75F +1739229594.467052,can0,0xD15F192,8,3,0x92,0xF1,0x11500,500B3766CB2DED7C ``` -You may pass directories or files. If you pass a directory, the script is able to group layers by -subdirectory, leading to an easier-to-use layer tree. +If you pass `--reconstruct`, then `can2csv` will reconstruct any transport layer sessions it can +understand. Right now that's just NMEA 2000 Fast Packet, but ISO-11783 Transport Protocol is +planned. -## csvdelta +## canspam -Calculate the inter-row deltas for a CSV column. 
Useful for understanding the time between events. -Also supports mean-centering a column, or centering it around a specific value. +The [canspam](./scripts/canspam) script can generate random CAN traffic on a Linux CAN device. It's +useful for inflating busload, or for generating random traffic to test `can2csv` against ;) + +## canstruct + +The `canstruct` tool is a NMEA 2000 Fast Packet / ISO 11783-3 Transport Protocol transport session +reconstruction tool. That is, you give it the individual 8-byte frames, and it gives you the +reconstructed messages. ```sh -$ csvdelta --column foo < [!NOTE] -> -> DOT parsing requires building with `--features dot`, which pulls in the `dot-parser` crate -> (GPL-2.0). The default build does not include this feature and is MIT-licensed. When built with -> `--features dot`, the resulting binary is GPL-2.0. DOT _emitting_ is always available (custom -> string formatting, no GPL dependency). - ## depfilter Filter or select subsets of dependency graphs. Works on the same formats as `depconv`, and is @@ -282,11 +304,6 @@ digraph { } ``` -> [!NOTE] -> -> The `depfilter` tool shares the same GPL-2.0 license caveat as `depconv` with respect to DOT -> parsing. - ## deptransform Structural transformations on dependency graphs. Works on the same formats as `depconv`, and is @@ -309,11 +326,6 @@ $ cat data/depconv/bitbake.curl.task-depends.dot | deptransform sub --key=node:label 's/.*//' | ``` -> [!NOTE] -> -> The `deptransform` tool shares the same GPL-2.0 license caveat as `depconv` with respect to DOT -> parsing. - ## depquery Query properties of dependency graphs. Lists nodes, edges, and computes graph metrics. Supports the @@ -333,55 +345,73 @@ tracing-subscriber 10 The `depquery` tool supports outputting `nodes`, `edges`, and `metrics`. The output is intended to be machine-readable, and is tab-separated. -> [!NOTE] -> -> The `depquery` tool shares the same GPL-2.0 license caveat as `depconv` with respect to DOT -> parsing. 
- -## can2csv - -Parse basic data from a CAN frame into a CSV record. Faster than `sed`, and also parses the canid. -Useful in conjunction with `csvdelta` to understand message timing. +## depcluster -`can2csv` is not a real CAN parser, and does not understand any of the data transmitted via CAN. +Run community detection on a dependency graph to identify clusters of related nodes. Each cluster +becomes a subgraph in the output, with cross-cluster edges at the top level. Supports Louvain +(default), Leiden, and Label Propagation algorithms. ```sh -$ head -n 3 data/candump-random-data.log | can2csv -timestamp,interface,canid,dlc,priority,src,dst,pgn,data -1739229594.465994,can0,0xE9790B5,8,3,0xB5,0x90,0x29700,CA3F871A5A6EE75F -1739229594.467052,can0,0xD15F192,8,3,0x92,0xF1,0x11500,500B3766CB2DED7C +$ echo -e "a b\na c\nb c\nd e\nd f\ne f\n#\na b\na c\nb c\nd e\nd f\ne f" | + depcluster -I tgf -O mermaid ``` -If you pass `--reconstruct`, then `can2csv` will reconstruct any transport layer sessions it can -understand. Right now that's just NMEA 2000 Fast Packet, but ISO-11783 Transport Protocol is -planned. +```mermaid +flowchart LR + subgraph cluster_0 + a + b + c + a --> b + a --> c + b --> c + end + subgraph cluster_1 + d + e + f + d --> e + d --> f + e --> f + end +``` -## canspam +## graphdiff -The [canspam](./scripts/canspam) script can generate random CAN traffic on a Linux CAN device. It's -useful for inflating busload, or for generating random traffic to test `can2csv` against ;) +Compare two dependency graphs and report what changed. Nodes are matched by ID, and edges by their +endpoints. -## canstruct +`graphdiff` supports several subcommands: -The `canstruct` tool is a NMEA 2000 Fast Packet / ISO 11783-3 Transport Protocol transport session -reconstruction tool. That is, you give it the individual 8-byte frames, and it gives you the -reconstructed messages. 
+* `graphdiff annotate` -- output the combined graph with changes highlighted (added, removed, + changed nodes/edges get distinct attributes) +* `graphdiff list` -- tab-delimited list of changes (`+` added, `-` removed, `~` changed, `>` moved) +* `graphdiff summary` -- tab-delimited counts of each change type +* `graphdiff subtract` -- set difference: nodes and edges only in the first graph ```sh -$ cat data/abort-then-full.log -(1750963033.251412) can0 18EC2A1C#101600040400EF00 // TP.CM_RTS -(1750963033.270725) can0 18EC1C2A#FF01FFFFFF00EF00 // TP.Conn_Abort -(1750963079.757877) can0 18EC2A1C#101600040400EF00 // TP.CM_RTS -(1750963079.775206) can0 18EC1C2A#110401FFFF00EF00 // TP.CM_CTS -(1750963079.778342) can0 14EB2A1C#0111111111111111 // TP.DT -(1750963079.779468) can0 14EB2A1C#0222222222222222 // TP.DT -(1750963079.780613) can0 14EB2A1C#0333333333333333 // TP.DT -(1750963079.781778) can0 14EB2A1C#0444FFFFFFFFFFFF // TP.DT -(1750963079.795905) can0 18EC1C2A#13160004FF00EF00 // TP.CM_EndofMsgACK +$ cat before.tgf +a Alpha +b Bravo +# +a b -$ canstruct data/abort-then-full.log -2025-06-28T15:36:19.051620Z WARN csvizmo::can::tp: TP.CM_ABRT 0x1C <- 0x2A reason ExistingTransportSession pgn 0xEF00 -(1750963079.795905) can0 18EF2A1C#11111111111111222222222222223333333333333344 +$ cat after.tgf +b Bravo +c Charlie +# +b c + +$ graphdiff annotate before.tgf after.tgf -O mermaid +``` + +```mermaid +flowchart LR + b["Bravo"] + c["+ Charlie"] + a["- Alpha"] + b --> c + a --> b ``` ## bbclasses @@ -453,3 +483,22 @@ flowchart LR poky/meta/conf/distro/include/ptest-packagelists.inc -->|"require"| poky/meta/classes-recipe/ptest.bbclass poky/meta/recipes-support/curl/curl_8.7.1.bb -->|"appends"| meta-work/recipes-support/curl/curl__.bbappend ``` + +## minpath + +Shorten file paths to the minimal unique suffix. Useful for displaying lists of files in a compact +way while keeping them distinguishable. 
+ +```sh +$ minpath < eyre::Result> { + let specs = if directed { + GraphSpecs::directed_create_missing() + } else { + GraphSpecs::undirected_create_missing() + }; + + let all_nodes = graph.all_nodes(); + let all_edges = graph.all_edges(); + + let nodes: Vec<_> = all_nodes + .keys() + .map(|id| Node::from_name(id.clone())) + .collect(); + + let edges: Vec<_> = all_edges + .iter() + .map(|e| Edge::new(e.from.clone(), e.to.clone())) + .collect(); + + let g = Graph::new_from_nodes_and_edges(nodes, edges, specs) + .map_err(|e| eyre::eyre!("graphrs error: {e}"))?; + + Ok(g) +} + +/// Convert graphrs community result (`Vec>`) to our partition format. +fn communities_to_partition(graph: &DepGraph, communities: Vec>) -> Vec> { + let all_nodes = graph.all_nodes(); + communities + .iter() + .map(|community| { + let mut ids: Vec<&str> = all_nodes + .keys() + .filter(|id| community.contains(id.as_str())) + .map(|id| id.as_str()) + .collect(); + ids.sort(); + ids + }) + .collect() +} + +/// Run the Louvain community detection algorithm on the dependency graph. +pub fn louvain_clustering( + graph: &DepGraph, + directed: bool, + resolution: f64, + seed: Option, +) -> eyre::Result { + let g = depgraph_to_graphrs(graph, directed)?; + + let communities = louvain::louvain_communities(&g, false, Some(resolution), None, seed) + .map_err(|e| eyre::eyre!("louvain error: {e}"))?; + + let partition = communities_to_partition(graph, communities); + Ok(clusters_to_depgraph(graph, &partition)) +} + +/// Run the Leiden community detection algorithm on the dependency graph. 
+pub fn leiden_clustering( + graph: &DepGraph, + directed: bool, + resolution: f64, +) -> eyre::Result { + let g = depgraph_to_graphrs(graph, directed)?; + + let communities = leiden::leiden( + &g, + false, + leiden::QualityFunction::CPM, + Some(resolution), + None, + None, + ) + .map_err(|e| eyre::eyre!("leiden error: {e}"))?; + + let partition = communities_to_partition(graph, communities); + Ok(clusters_to_depgraph(graph, &partition)) +} diff --git a/crates/csvizmo-depgraph/src/algorithm/cluster/lpa.rs b/crates/csvizmo-depgraph/src/algorithm/cluster/lpa.rs new file mode 100644 index 0000000..c949660 --- /dev/null +++ b/crates/csvizmo-depgraph/src/algorithm/cluster/lpa.rs @@ -0,0 +1,161 @@ +use std::collections::HashMap; + +use rand::SeedableRng; +use rand::prelude::SliceRandom; +use rand::rngs::StdRng; + +use super::{Adjacency, clusters_to_depgraph}; +use crate::{DepGraph, FlatGraphView}; + +/// Run Label Propagation Algorithm on the dependency graph. +/// +/// Each node starts in its own cluster. Each iteration, nodes adopt the most common +/// cluster label among their neighbors (ties broken by smallest label). Stops when +/// no labels change or `max_iter` is reached. +/// +/// If `seed` is provided, the node processing order is shuffled each iteration. +/// Otherwise, nodes are processed in graph order (deterministic). +pub fn lpa(graph: &DepGraph, directed: bool, max_iter: usize, seed: Option) -> DepGraph { + let view = FlatGraphView::new(graph); + let n = view.idx_to_id.len(); + + if n == 0 { + return DepGraph::default(); + } + + let adj = Adjacency::new(&view, directed); + + // Each node starts with its own label (index). 
+ let mut labels: Vec = (0..n).collect(); + + let mut rng = seed.map(StdRng::seed_from_u64); + let mut order: Vec = (0..n).collect(); + + for _ in 0..max_iter { + if let Some(rng) = rng.as_mut() { + order.shuffle(rng); + } + + let mut changed = false; + for &i in &order { + let neighbors = &adj.neighbors[i]; + if neighbors.is_empty() { + continue; + } + + // Count neighbor labels. + let mut counts: HashMap = HashMap::new(); + for &neighbor in neighbors { + *counts.entry(labels[neighbor]).or_default() += 1; + } + + // Find most common label; ties broken by smallest label. + let mut best_label = labels[i]; + let mut best_count = 0; + for (&label, &count) in &counts { + if count > best_count || (count == best_count && label < best_label) { + best_label = label; + best_count = count; + } + } + + if best_label != labels[i] { + labels[i] = best_label; + changed = true; + } + } + + if !changed { + break; + } + } + + // Convert label assignments to partition. + let mut cluster_map: HashMap> = HashMap::new(); + for (i, &label) in labels.iter().enumerate() { + cluster_map + .entry(label) + .or_default() + .push(view.idx_to_id[i]); + } + + // Sort clusters by their smallest label for deterministic output. 
+ let mut clusters: Vec<(usize, Vec<&str>)> = cluster_map.into_iter().collect(); + clusters.sort_by_key(|(label, _)| *label); + let partition: Vec> = clusters.into_iter().map(|(_, ids)| ids).collect(); + + clusters_to_depgraph(graph, &partition) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{Edge, NodeInfo}; + + fn make_graph(nodes: &[(&str, &str)], edges: &[(&str, &str)]) -> DepGraph { + DepGraph { + nodes: nodes + .iter() + .map(|(id, label)| (id.to_string(), NodeInfo::new(*label))) + .collect(), + edges: edges + .iter() + .map(|(from, to)| Edge { + from: from.to_string(), + to: to.to_string(), + ..Default::default() + }) + .collect(), + ..Default::default() + } + } + + #[test] + fn two_disconnected_components() { + let g = make_graph( + &[("a", "a"), ("b", "b"), ("c", "c"), ("d", "d")], + &[("a", "b"), ("c", "d")], + ); + let result = lpa(&g, false, 100, None); + assert_eq!(result.subgraphs.len(), 2); + // No cross-cluster edges + assert!(result.edges.is_empty()); + } + + #[test] + fn single_clique() { + // Fully connected: a-b, b-c, a-c -- should all be in one cluster + let g = make_graph( + &[("a", "a"), ("b", "b"), ("c", "c")], + &[ + ("a", "b"), + ("b", "c"), + ("a", "c"), + ("b", "a"), + ("c", "b"), + ("c", "a"), + ], + ); + let result = lpa(&g, false, 100, None); + assert_eq!(result.subgraphs.len(), 1); + assert_eq!(result.subgraphs[0].nodes.len(), 3); + } + + #[test] + fn empty_graph() { + let g = DepGraph::default(); + let result = lpa(&g, false, 100, None); + assert!(result.subgraphs.is_empty()); + assert!(result.edges.is_empty()); + } + + #[test] + fn with_seed() { + let g = make_graph( + &[("a", "a"), ("b", "b"), ("c", "c"), ("d", "d")], + &[("a", "b"), ("c", "d")], + ); + let result = lpa(&g, false, 100, Some(42)); + assert_eq!(result.subgraphs.len(), 2); + } +} diff --git a/crates/csvizmo-depgraph/src/algorithm/cluster/mod.rs b/crates/csvizmo-depgraph/src/algorithm/cluster/mod.rs new file mode 100644 index 0000000..93d2dd2 --- 
/dev/null +++ b/crates/csvizmo-depgraph/src/algorithm/cluster/mod.rs @@ -0,0 +1,189 @@ +pub mod graphrs_bridge; +pub mod lpa; + +use std::collections::HashMap; + +use indexmap::IndexMap; +use petgraph::Direction; + +use crate::{DepGraph, Edge, FlatGraphView, NodeInfo}; + +/// Precomputed neighbor lists from a flattened dependency graph. +/// +/// In undirected mode, neighbors include both incoming and outgoing edges (deduplicated). +/// In directed mode, only outgoing neighbors are included. +pub struct Adjacency { + /// For each node index, the set of neighbor node indices. + pub neighbors: Vec>, +} + +impl Adjacency { + pub fn new(view: &FlatGraphView, directed: bool) -> Self { + let n = view.idx_to_id.len(); + let mut neighbors = vec![Vec::new(); n]; + + for idx in view.pg.node_indices() { + let i = idx.index(); + let mut seen = Vec::new(); + + for neighbor in view.pg.neighbors_directed(idx, Direction::Outgoing) { + seen.push(neighbor.index()); + } + + if !directed { + for neighbor in view.pg.neighbors_directed(idx, Direction::Incoming) { + if !seen.contains(&neighbor.index()) { + seen.push(neighbor.index()); + } + } + } + + neighbors[i] = seen; + } + + Adjacency { neighbors } + } +} + +/// Convert a partition (list of clusters, each a list of node IDs) into a DepGraph with +/// one subgraph per cluster. Intra-cluster edges go in the subgraph; cross-cluster edges +/// go at the top level. +pub fn clusters_to_depgraph(graph: &DepGraph, partition: &[Vec<&str>]) -> DepGraph { + let all_nodes = graph.all_nodes(); + let all_edges = graph.all_edges(); + + // Map each node ID to its cluster index. 
+ let mut node_to_cluster: HashMap<&str, usize> = HashMap::new(); + for (i, cluster) in partition.iter().enumerate() { + for &id in cluster { + node_to_cluster.insert(id, i); + } + } + + let mut subgraphs = Vec::new(); + for (i, cluster) in partition.iter().enumerate() { + let cluster_ids: std::collections::HashSet<&str> = cluster.iter().copied().collect(); + + let nodes: IndexMap = all_nodes + .iter() + .filter(|(id, _)| cluster_ids.contains(id.as_str())) + .map(|(id, info)| (id.clone(), info.clone())) + .collect(); + + let edges: Vec = all_edges + .iter() + .filter(|e| { + cluster_ids.contains(e.from.as_str()) && cluster_ids.contains(e.to.as_str()) + }) + .cloned() + .collect(); + + subgraphs.push(DepGraph { + id: Some(format!("cluster_{i}")), + nodes, + edges, + ..Default::default() + }); + } + + // Cross-cluster edges: both endpoints assigned to clusters but in different ones. + let cross_edges: Vec = all_edges + .iter() + .filter(|e| { + match ( + node_to_cluster.get(e.from.as_str()), + node_to_cluster.get(e.to.as_str()), + ) { + (Some(cf), Some(ct)) => cf != ct, + _ => false, + } + }) + .cloned() + .collect(); + + DepGraph { + edges: cross_edges, + subgraphs, + ..Default::default() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_graph(nodes: &[(&str, &str)], edges: &[(&str, &str)]) -> DepGraph { + DepGraph { + nodes: nodes + .iter() + .map(|(id, label)| (id.to_string(), NodeInfo::new(*label))) + .collect(), + edges: edges + .iter() + .map(|(from, to)| Edge { + from: from.to_string(), + to: to.to_string(), + ..Default::default() + }) + .collect(), + ..Default::default() + } + } + + #[test] + fn adjacency_undirected() { + let g = make_graph(&[("a", "a"), ("b", "b"), ("c", "c")], &[("a", "b")]); + let view = FlatGraphView::new(&g); + let adj = Adjacency::new(&view, false); + // a's neighbors: b (outgoing) -> [b_idx] + let a_idx = view.id_to_idx["a"].index(); + let b_idx = view.id_to_idx["b"].index(); + 
assert!(adj.neighbors[a_idx].contains(&b_idx)); + // b's neighbors: a (incoming, undirected) -> [a_idx] + assert!(adj.neighbors[b_idx].contains(&a_idx)); + } + + #[test] + fn adjacency_directed() { + let g = make_graph(&[("a", "a"), ("b", "b")], &[("a", "b")]); + let view = FlatGraphView::new(&g); + let adj = Adjacency::new(&view, true); + let a_idx = view.id_to_idx["a"].index(); + let b_idx = view.id_to_idx["b"].index(); + // a -> b: a has neighbor b + assert!(adj.neighbors[a_idx].contains(&b_idx)); + // b has no outgoing edges in directed mode + assert!(adj.neighbors[b_idx].is_empty()); + } + + #[test] + fn clusters_to_depgraph_basic() { + let g = make_graph( + &[("a", "a"), ("b", "b"), ("c", "c"), ("d", "d")], + &[("a", "b"), ("c", "d"), ("b", "c")], + ); + let partition = vec![vec!["a", "b"], vec!["c", "d"]]; + let result = clusters_to_depgraph(&g, &partition); + + assert_eq!(result.subgraphs.len(), 2); + assert_eq!(result.subgraphs[0].id.as_deref(), Some("cluster_0")); + assert_eq!(result.subgraphs[0].nodes.len(), 2); + assert_eq!(result.subgraphs[0].edges.len(), 1); // a->b + assert_eq!(result.subgraphs[1].id.as_deref(), Some("cluster_1")); + assert_eq!(result.subgraphs[1].nodes.len(), 2); + assert_eq!(result.subgraphs[1].edges.len(), 1); // c->d + // Cross-cluster edge: b->c + assert_eq!(result.edges.len(), 1); + assert_eq!(result.edges[0].from, "b"); + assert_eq!(result.edges[0].to, "c"); + } + + #[test] + fn clusters_to_depgraph_empty() { + let g = DepGraph::default(); + let partition: Vec> = vec![]; + let result = clusters_to_depgraph(&g, &partition); + assert!(result.subgraphs.is_empty()); + assert!(result.edges.is_empty()); + } +} diff --git a/crates/csvizmo-depgraph/src/algorithm/diff.rs b/crates/csvizmo-depgraph/src/algorithm/diff.rs new file mode 100644 index 0000000..4181eca --- /dev/null +++ b/crates/csvizmo-depgraph/src/algorithm/diff.rs @@ -0,0 +1,846 @@ +use std::collections::HashSet; +use std::io::Write; + +use indexmap::IndexMap; + +use 
crate::{DepGraph, Edge, NodeInfo};

/// Classification given to a node or edge when two graphs are compared.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DiffStatus {
    Added,
    Removed,
    Changed,
    Moved,
    Unchanged,
}

/// Node payload paired with its classification.
#[derive(Debug)]
pub struct DiffNode {
    pub status: DiffStatus,
    pub info: NodeInfo,
}

/// Edge payload paired with its classification.
#[derive(Debug)]
pub struct DiffEdge {
    pub status: DiffStatus,
    pub edge: Edge,
}

/// Outcome of comparing two dependency graphs: every node keyed by ID, and every edge.
#[derive(Debug)]
pub struct GraphDiff {
    pub nodes: IndexMap<String, DiffNode>,
    pub edges: Vec<DiffEdge>,
}

impl GraphDiff {
    /// Reports whether at least one node or edge is classified as anything but `Unchanged`.
    pub fn has_changes(&self) -> bool {
        let node_statuses = self.nodes.values().map(|node| node.status);
        let edge_statuses = self.edges.iter().map(|edge| edge.status);
        node_statuses
            .chain(edge_statuses)
            .any(|status| status != DiffStatus::Unchanged)
    }
}

/// Content equality for nodes: label, node type and attributes must all agree.
fn node_eq(a: &NodeInfo, b: &NodeInfo) -> bool {
    (&a.label, &a.node_type, &a.attrs) == (&b.label, &b.node_type, &b.attrs)
}

/// Content equality for edges: label and attributes must agree (endpoints are not compared).
fn edge_eq(a: &Edge, b: &Edge) -> bool {
    (&a.label, &a.attrs) == (&b.label, &b.attrs)
}

/// Index the edge list by target: each node ID maps to the sources of its incoming edges,
/// one entry per edge, in edge order.
fn build_incoming(edges: &[Edge]) -> IndexMap<String, Vec<String>> {
    edges.iter().fold(IndexMap::new(), |mut parents, edge| {
        parents
            .entry(edge.to.clone())
            .or_insert_with(Vec::new)
            .push(edge.from.clone());
        parents
    })
}

/// Compute the difference between two dependency graphs.
///
/// Nodes are matched by ID. Edges are matched by (from, to) tuple.
/// Content equality for nodes compares label, node_type, and attrs.
/// Content equality for edges compares label and attrs.
/// Nodes that are unchanged in content but have a single parent that
/// changed are marked as Moved.
+pub fn diff(before: &DepGraph, after: &DepGraph) -> GraphDiff { + let before_nodes = before.all_nodes(); + let after_nodes = after.all_nodes(); + let before_edges = before.all_edges(); + let after_edges = after.all_edges(); + + let mut nodes = IndexMap::new(); + + // After-graph nodes: Added, Changed, or Unchanged + for (id, after_info) in after_nodes { + let status = match before_nodes.get(id) { + Some(before_info) => { + if node_eq(before_info, after_info) { + DiffStatus::Unchanged + } else { + DiffStatus::Changed + } + } + None => DiffStatus::Added, + }; + nodes.insert( + id.clone(), + DiffNode { + status, + info: after_info.clone(), + }, + ); + } + + // Before-only nodes: Removed + for (id, before_info) in before_nodes { + if !after_nodes.contains_key(id) { + nodes.insert( + id.clone(), + DiffNode { + status: DiffStatus::Removed, + info: before_info.clone(), + }, + ); + } + } + + // Build before-edge lookup grouped by (from, to), consuming matched entries as we go + let mut before_edge_map: IndexMap<(String, String), Vec> = IndexMap::new(); + for edge in before_edges { + let key = (edge.from.clone(), edge.to.clone()); + before_edge_map.entry(key).or_default().push(edge.clone()); + } + + let mut edges = Vec::new(); + + for edge in after_edges { + let key = (edge.from.clone(), edge.to.clone()); + let status = match before_edge_map.get_mut(&key) { + Some(before_edges) => { + if let Some(pos) = before_edges.iter().position(|be| edge_eq(be, edge)) { + before_edges.swap_remove(pos); + DiffStatus::Unchanged + } else if !before_edges.is_empty() { + before_edges.swap_remove(0); + DiffStatus::Changed + } else { + DiffStatus::Added + } + } + None => DiffStatus::Added, + }; + edges.push(DiffEdge { + status, + edge: edge.clone(), + }); + } + + // Remaining before edges are Removed + for (_, remaining) in before_edge_map { + for edge in remaining { + edges.push(DiffEdge { + status: DiffStatus::Removed, + edge, + }); + } + } + + // Move detection: upgrade Unchanged nodes 
whose single parent changed + let before_incoming = build_incoming(before_edges); + let after_incoming = build_incoming(after_edges); + + for (id, diff_node) in &mut nodes { + if diff_node.status != DiffStatus::Unchanged { + continue; + } + let before_parents = before_incoming.get(id.as_str()); + let after_parents = after_incoming.get(id.as_str()); + match (before_parents, after_parents) { + (Some(bp), Some(ap)) if bp.len() == 1 && ap.len() == 1 && bp[0] != ap[0] => { + diff_node.status = DiffStatus::Moved; + } + _ => {} + } + } + + GraphDiff { nodes, edges } +} + +/// Build an annotated graph combining both inputs with visual diff styling. +/// +/// Added nodes/edges are green, removed are red, changed are orange, +/// moved are blue. Each element gets a `diff` attribute for programmatic +/// filtering. The after-graph's subgraph structure is preserved: nodes +/// appear in their original subgraph positions. Removed nodes (only in +/// the before-graph) are placed at root level, or into a `cluster_removed` +/// subgraph when `cluster` is true. 
+pub fn annotate_graph(diff: &GraphDiff, after: &DepGraph, cluster: bool) -> DepGraph { + fn annotate_node(diff_node: &DiffNode) -> NodeInfo { + let mut info = diff_node.info.clone(); + match diff_node.status { + DiffStatus::Added => { + info.label = format!("+ {}", info.label); + info.attrs.insert("color".into(), "green".into()); + info.attrs.insert("fontcolor".into(), "green".into()); + info.attrs.insert("diff".into(), "added".into()); + } + DiffStatus::Removed => { + info.label = format!("- {}", info.label); + info.attrs.insert("color".into(), "red".into()); + info.attrs.insert("fontcolor".into(), "red".into()); + info.attrs.insert("diff".into(), "removed".into()); + } + DiffStatus::Changed => { + info.label = format!("~ {}", info.label); + info.attrs.insert("color".into(), "orange".into()); + info.attrs.insert("fontcolor".into(), "orange".into()); + info.attrs.insert("diff".into(), "changed".into()); + } + DiffStatus::Moved => { + info.label = format!("> {}", info.label); + info.attrs.insert("color".into(), "blue".into()); + info.attrs.insert("fontcolor".into(), "blue".into()); + info.attrs.insert("diff".into(), "moved".into()); + } + DiffStatus::Unchanged => { + info.attrs.insert("diff".into(), "unchanged".into()); + } + } + info + } + + fn annotate_subgraph(diff: &GraphDiff, subgraph: &DepGraph) -> DepGraph { + let nodes: IndexMap = subgraph + .nodes + .keys() + .filter_map(|id| { + let diff_node = diff.nodes.get(id)?; + Some((id.clone(), annotate_node(diff_node))) + }) + .collect(); + + let subgraphs: Vec = subgraph + .subgraphs + .iter() + .map(|sg| annotate_subgraph(diff, sg)) + .filter(|sg| !sg.nodes.is_empty() || !sg.subgraphs.is_empty()) + .collect(); + + DepGraph { + id: subgraph.id.clone(), + attrs: subgraph.attrs.clone(), + nodes, + subgraphs, + ..Default::default() + } + } + + // Walk the after-graph tree to place nodes in their original positions. 
+ let mut root = annotate_subgraph(diff, after); + + // Removed nodes are not in the after-graph; collect them separately. + let removed_nodes: IndexMap = diff + .nodes + .iter() + .filter(|(_, n)| n.status == DiffStatus::Removed) + .map(|(id, n)| (id.clone(), annotate_node(n))) + .collect(); + + if !removed_nodes.is_empty() { + if cluster { + root.subgraphs.push(DepGraph { + id: Some("cluster_removed".into()), + nodes: removed_nodes, + ..Default::default() + }); + } else { + root.nodes.extend(removed_nodes); + } + } + + // Edges stay at root level. + for diff_edge in &diff.edges { + let mut edge = diff_edge.edge.clone(); + match diff_edge.status { + DiffStatus::Added => { + edge.attrs.insert("color".into(), "green".into()); + edge.attrs.insert("diff".into(), "added".into()); + } + DiffStatus::Removed => { + edge.attrs.insert("color".into(), "red".into()); + edge.attrs.insert("diff".into(), "removed".into()); + } + DiffStatus::Changed => { + edge.attrs.insert("color".into(), "orange".into()); + edge.attrs.insert("diff".into(), "changed".into()); + } + DiffStatus::Moved => { + edge.attrs.insert("color".into(), "blue".into()); + edge.attrs.insert("diff".into(), "moved".into()); + } + DiffStatus::Unchanged => { + edge.attrs.insert("diff".into(), "unchanged".into()); + } + } + root.edges.push(edge); + } + + root +} + +/// Build a graph containing only nodes exclusive to the "before" graph. +/// +/// The before-graph's subgraph structure is preserved. Edges are included +/// only when both endpoints are removed nodes. Empty subgraphs are dropped. 
+pub fn subtract_graph(diff: &GraphDiff, before: &DepGraph) -> DepGraph { + let removed_ids: HashSet<&str> = diff + .nodes + .iter() + .filter(|(_, n)| n.status == DiffStatus::Removed) + .map(|(id, _)| id.as_str()) + .collect(); + + fn filter_subgraph(graph: &DepGraph, keep: &HashSet<&str>) -> DepGraph { + DepGraph { + id: graph.id.clone(), + attrs: graph.attrs.clone(), + nodes: graph + .nodes + .iter() + .filter(|(id, _)| keep.contains(id.as_str())) + .map(|(id, info)| (id.clone(), info.clone())) + .collect(), + edges: graph + .edges + .iter() + .filter(|e| keep.contains(e.from.as_str()) && keep.contains(e.to.as_str())) + .cloned() + .collect(), + subgraphs: graph + .subgraphs + .iter() + .map(|sg| filter_subgraph(sg, keep)) + .filter(|sg| !sg.nodes.is_empty() || !sg.subgraphs.is_empty()) + .collect(), + ..Default::default() + } + } + + filter_subgraph(before, &removed_ids) +} + +/// Write a tab-delimited listing of changed nodes and edges. +/// +/// Unchanged items are omitted. Node format: `\t\t