RoboSegNet/index.html at main · BerkeleyAutomation/RoboSegNet · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">

  <!-- Search-engine metadata -->
  <meta name="description" content="RoboSegNet learns multi-task robot trajectory segmentation from fused visual and kinematic streams, with a RoboSegData benchmark from Agibot.">
  <!-- Keywords for the paper to be indexed by -->
  <meta name="keywords" content="robot learning, trajectory segmentation, multimodal learning, manipulation, proprioception, computer vision, RoboSegNet, RoboSegData">

  <!-- Open Graph tags for social media link previews (the page's "business card") -->
  <meta property="og:type" content="website">
  <meta property="og:title" content="RoboSegNet: Multi-Task Robot Trajectory Segmentation">
  <meta property="og:description" content="Joint visual–kinematic segmentation with DCT tokenization, ViT features, cross-modal attention, and Hungarian matching. Introduces RoboSegData.">
  <meta property="og:url" content="https://berkeleyautomation.github.io/RoboSegNet/">
  <!-- Banner image. Must be an absolute URL: link-preview scrapers do not resolve
       relative paths. Optimal dimensions are 1200x630. -->
  <meta property="og:image" content="https://berkeleyautomation.github.io/RoboSegNet/static/images/robosegnet_overview.png">
  <meta property="og:image:width" content="1200">
  <meta property="og:image:height" content="630">

  <!-- Twitter/X card. Same banner image as above, also as an absolute URL. -->
  <meta name="twitter:card" content="summary_large_image">
  <meta name="twitter:title" content="RoboSegNet: Multi-Task Robot Trajectory Segmentation">
  <meta name="twitter:description" content="Joint visual–kinematic trajectory segmentation with RoboSegData benchmark.">
  <meta name="twitter:image" content="https://berkeleyautomation.github.io/RoboSegNet/static/images/robosegnet_overview.png">

  <title>RoboSegNet | Multi-Task Robot Trajectory Segmentation</title>
  <!-- The icon file is a PNG, so it is declared as image/png rather than image/x-icon -->
  <link rel="icon" type="image/png" href="static/images/Wizard_Icon.png">
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
  rel="stylesheet">

  <!-- Stylesheets: Bulma and its plugins first, site overrides (index.css) last -->
  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
  href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <!-- Scripts: load order is kept exactly as before, with index.js last.
       NOTE(review): no carousel, slider or PDF-viewer markup is visible in this page;
       confirm static/js/index.js does not need these libraries before removing any. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script src="static/js/index.js"></script>
</head>
<body>


  <!-- Title block: paper title, author list with affiliation markers, and affiliations.
       Every link opens in a new tab, so each carries rel="noopener" to keep the
       opened page from reaching back through window.opener in older browsers. -->
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-2 publication-title">Learning Multi-Task Robot Trajectory Segmentation <br>
              from Visual and Kinematic Streams</h1>
            <!-- <p class="subtitle is-4 has-text-weight-semibold" style="margin-top: 0.5rem;">RoboSegNet</p> -->

            <!-- Authors: "*" marks equal contribution, "1" the affiliation listed below -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">
                <a href="https://keplerc.github.io/" target="_blank" rel="noopener">Kaiyuan Chen</a><sup>*,1</sup>,
              </span>
              <span class="author-block">
                <a href="https://ehehee.github.io/" target="_blank" rel="noopener">Shuangyu Xie</a><sup>*,1</sup>,
              </span>
              <span class="author-block">
                <a href="https://www.linkedin.com/in/andrewpgoldberg/" target="_blank" rel="noopener">Andrew Goldberg</a><sup>1</sup>,
              </span>
              <span class="author-block">
                <a href="https://goldberg.berkeley.edu/" target="_blank" rel="noopener">Ken Goldberg</a><sup>1</sup>
              </span>
            </div>

            <!-- Affiliations and footnotes for the markers used above -->
            <div class="is-size-5 publication-authors">
              <span class="author-block"><sup>1</sup>University of California, Berkeley</span>
              <span class="eql-cntrb"><small><br><sup>*</sup>Equal contribution</small></span>
            </div>

            <!-- Code link, currently disabled: -->
            <!-- <div class="column has-text-centered">
              <div class="publication-links">
                <span class="link-block">
                  <a href="https://github.com/BerkeleyAutomation/robosegnet" target="_blank" rel="noopener"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Code</span>
                  </a>
                </span>
              </div>
            </div> -->
          </div>
        </div>
      </div>
    </div>
  </section>


<!-- Teaser / Figure 1 -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <!-- The image and its caption form one figure. The caption keeps the Bulma
           "subtitle" classes for its look, but is a figcaption rather than a heading
           so that it does not appear in the page's heading outline. -->
      <figure>
        <img src="static/images/roboseg-title.png" alt="RoboSegNet overview: visual and kinematic streams fused to predict trajectory segment boundaries" style="max-width: 100%; height: auto;">
        <figcaption class="subtitle has-text-centered">
          <strong>Figure 1.</strong> RoboSegNet processes an image stream (DINOv3) and a kinematic stream (FAST), fuses them with cross-attention (frequency synchronization), and uses a transformer encoder–decoder to predict transition probabilities over time—for example segmenting an insertion into phases such as Grasp, Insert, and Nudge.
        </figcaption>
      </figure>
    </div>
  </div>
</section>
<!-- End teaser -->

<!-- Paper abstract -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <!-- Abstract body, one sentence per source line. Line breaks fall only where the
             text already had a space, so the rendered paragraph is unchanged. -->
        <div class="content has-text-justified">
          <p>
            Segmenting robot demonstration trajectories into semantically coherent parts is key to efficient policy learning, skill reuse, and recovery from failure.
            We present <strong>RoboSegNet</strong>, a multi-task framework that jointly learns from <strong>visual</strong> and <strong>kinematic</strong> proprioceptive signals.
            Kinematic trajectories are encoded with a <strong>Discrete Cosine Transform (DCT)-based tokenizer</strong>; images are encoded with a <strong>visual transformer</strong>.
            The two modalities are fused with <strong>bidirectional cross-modal attention</strong>, and transition boundaries are predicted via <strong>Hungarian matching</strong>.
            We introduce <strong>RoboSegData</strong>, a benchmark built from the Agibot dataset with dense frame-level transition annotations.
            RoboSegNet achieves strong performance on RoboSegData and generalizes in <strong>zero shot</strong> to unseen scenes, tasks, and skills.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End paper abstract -->


<!--BibTex citation -->
  <!-- <section class="section" id="BibTeX">
    <div class="container is-max-desktop content">
      <h2 class="title">BibTeX</h2>
      <pre><code>@misc{chen2026robosegnet,
  title={Learning Multi-Task Robot Trajectory Segmentation from Visual and Kinematic Streams},
  author={Chen, Kaiyuan and Xie, Shuangyu and Goldberg, Andrew and Goldberg, Ken},
  year={2026},
  url={https://berkeleyautomation.github.io/RoboSegNet/},
  note={Project page; add arXiv identifier when available}
}</code></pre>
    </div>
</section> -->
<!--End BibTex citation -->


  <!-- Page footer: template attribution and site license.
       External links use https and carry rel="noopener" because they open in a new tab. -->
  <footer class="footer">
    <div class="container">
      <div class="columns is-centered">
        <div class="column is-8">
          <div class="content">

            <p>
              This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank" rel="noopener">Academic Project Page Template</a> which was adopted from the <a href="https://nerfies.github.io" target="_blank" rel="noopener">Nerfies</a> project page.
              You are free to borrow the source code of this website, we just ask that you link back to this page in the footer. <br> This website is licensed under a <a rel="license noopener" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
              Commons Attribution-ShareAlike 4.0 International License</a>.
            </p>

          </div>
        </div>
      </div>
    </div>
  </footer>

<!-- Statcounter tracking code -->

<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->

    <!-- End of Statcounter Code -->

  </body>
  </html>