Skip to content

Commit f5d3e82

Browse files
andrewjcgfacebook-github-bot
authored andcommitted
Invert rsync architecture (#467)
Summary: Pull Request resolved: #467 This inverts the rsync setup, so that the rsync daemon runs on the "client", and the rsync clients run on the actors. This helps in a couple ways: - We don't leave a rsync daemon running on the actors. - We avoid spawning lots of sub-processes for each rsync client on the "client" - The rsync client supports reporting file changes, which subsequent changes can use to facilitate things like targeted module reloading (based on the actual things that were changed). Differential Revision: D77952087
1 parent 1e06a7c commit f5d3e82

File tree

1 file changed

+114
-35
lines changed

1 file changed

+114
-35
lines changed

hyperactor_mesh/src/code_sync/rsync.rs

Lines changed: 114 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,16 @@ use anyhow::bail;
1919
use anyhow::ensure;
2020
use async_trait::async_trait;
2121
use futures::StreamExt;
22+
use futures::TryFutureExt;
2223
use futures::TryStreamExt;
2324
use futures::stream;
2425
use futures::try_join;
2526
use hyperactor::Actor;
27+
use hyperactor::Bind;
2628
use hyperactor::Handler;
2729
use hyperactor::Named;
30+
use hyperactor::OncePortRef;
31+
use hyperactor::Unbind;
2832
use hyperactor::clock::Clock;
2933
use hyperactor::clock::RealClock;
3034
use nix::sys::signal;
@@ -56,8 +60,8 @@ pub async fn do_rsync(addr: &SocketAddr, workspace: &Path) -> Result<()> {
5660
.arg("--delay-updates")
5761
.arg("--exclude=.rsync-tmp.*")
5862
.arg(format!("--partial-dir=.rsync-tmp.{}", addr.port()))
59-
.arg(format!("{}/", workspace.display()))
6063
.arg(format!("rsync://{}/workspace", addr))
64+
.arg(format!("{}/", workspace.display()))
6165
.stderr(Stdio::piped())
6266
.output()
6367
.await?;
@@ -89,8 +93,8 @@ impl RsyncDaemon {
8993
path = {workspace}
9094
use chroot = no
9195
list = no
92-
read only = false
93-
write only = true
96+
read only = true
97+
write only = false
9498
uid = {uid}
9599
hosts allow = localhost
96100
"#,
@@ -158,43 +162,58 @@ impl RsyncDaemon {
158162
}
159163
}
160164

165+
#[derive(Debug, Named, Serialize, Deserialize, Bind, Unbind)]
166+
pub struct RsyncMessage {
167+
/// The connect message to create a duplex bytestream with the client.
168+
pub connect: Connect,
169+
/// A port to send back any errors from the rsync.
170+
pub result: OncePortRef<Result<(), String>>,
171+
}
172+
161173
#[derive(Debug, Named, Serialize, Deserialize)]
162174
pub struct RsyncParams {
163175
pub workspace: WorkspaceLocation,
164176
}
165177

166178
#[derive(Debug)]
167-
#[hyperactor::export(
168-
spawn = true,
169-
handlers = [Connect { cast = true }],
170-
)]
179+
#[hyperactor::export(spawn = true, handlers = [RsyncMessage { cast = true }])]
171180
pub struct RsyncActor {
172-
daemon: RsyncDaemon,
181+
workspace: WorkspaceLocation,
182+
//daemon: RsyncDaemon,
173183
}
174184

175185
#[async_trait]
176186
impl Actor for RsyncActor {
177187
type Params = RsyncParams;
178188

179189
async fn new(RsyncParams { workspace }: Self::Params) -> Result<Self> {
180-
let workspace = workspace.resolve()?;
181-
let daemon = RsyncDaemon::spawn(TcpListener::bind(("::1", 0)).await?, &workspace).await?;
182-
Ok(Self { daemon })
190+
Ok(Self { workspace })
183191
}
184192
}
185193

186194
#[async_trait]
187-
impl Handler<Connect> for RsyncActor {
195+
impl Handler<RsyncMessage> for RsyncActor {
188196
async fn handle(
189197
&mut self,
190198
cx: &hyperactor::Context<Self>,
191-
message: Connect,
199+
RsyncMessage { connect, result }: RsyncMessage,
192200
) -> Result<(), anyhow::Error> {
193-
let (mut local, mut stream) = try_join!(
194-
async { Ok(TcpStream::connect(self.daemon.addr()).await?) },
195-
accept(cx, cx.self_id().clone(), message),
196-
)?;
197-
tokio::io::copy_bidirectional(&mut local, &mut stream).await?;
201+
let res = async {
202+
let workspace = self.workspace.resolve()?;
203+
let (listener, mut stream) = try_join!(
204+
TcpListener::bind(("::1", 0)).err_into(),
205+
accept(cx, cx.self_id().clone(), connect),
206+
)?;
207+
let addr = listener.local_addr()?;
208+
try_join!(do_rsync(&addr, &workspace), async move {
209+
let (mut local, _) = listener.accept().await?;
210+
tokio::io::copy_bidirectional(&mut stream, &mut local).await?;
211+
anyhow::Ok(())
212+
},)?;
213+
anyhow::Ok(())
214+
}
215+
.await;
216+
result.send(cx, res.map_err(|e| format!("{:#?}", e)))?;
198217
Ok(())
199218
}
200219
}
@@ -203,45 +222,63 @@ pub async fn rsync_mesh<M>(actor_mesh: &M, workspace: PathBuf) -> Result<()>
203222
where
204223
M: ActorMesh<Actor = RsyncActor>,
205224
{
225+
// Spawn a rsync daemon to acceopt incoming connections from actors.
226+
let daemon = RsyncDaemon::spawn(TcpListener::bind(("::1", 0)).await?, &workspace).await?;
227+
let daemon_addr = daemon.addr();
228+
206229
// We avoid casting here as we need point-to-point connections to each individual actor.
207230
stream::iter(actor_mesh.iter_actor_refs())
231+
.map(anyhow::Ok)
208232
// Connect to all actors in the mesh.
209-
.map(|actor| async move {
233+
.try_for_each_concurrent(None, |actor| async move {
210234
let mailbox = actor_mesh.proc_mesh().client();
211235
let (connect, completer) = Connect::allocate(mailbox.actor_id().clone(), mailbox);
212-
actor.send(mailbox, connect)?;
213-
completer.complete().await
214-
})
215-
// Max the connections run in parallel.
216-
.buffer_unordered(usize::MAX)
217-
// Initiate the rsync, in parallel.
218-
.try_for_each_concurrent(None, |mut local| async {
219-
let workspace = workspace.clone();
220-
let listener = TcpListener::bind(("::1", 0)).await?;
221-
let addr = listener.local_addr()?;
222-
try_join!(
223-
async move { do_rsync(&addr, &workspace).await },
224-
async move {
225-
let (mut stream, _) = listener.accept().await?;
226-
tokio::io::copy_bidirectional(&mut stream, &mut local).await?;
227-
anyhow::Ok(())
236+
let (tx, rx) = mailbox.open_once_port::<Result<(), String>>();
237+
actor.send(
238+
mailbox,
239+
RsyncMessage {
240+
connect,
241+
result: tx.bind(),
228242
},
229243
)?;
244+
let (mut local, mut stream) = try_join!(
245+
TcpStream::connect(daemon_addr.clone()).err_into(),
246+
completer.complete(),
247+
)?;
248+
// Pipe the remote rsync client to the local rsync server, but don't propagate failures yet.
249+
let copy_res = tokio::io::copy_bidirectional(&mut local, &mut stream).await;
250+
// Now wait for the final result to be sent back. We wrap in a timeout, as we should get this
251+
// back pretty quickly after the copy above is done.
252+
let () = RealClock
253+
.timeout(Duration::from_secs(1), rx.recv())
254+
.await??
255+
.map_err(anyhow::Error::msg)
256+
.with_context(|| format!("actor: {}", actor.actor_id()))?;
257+
// Finally, propagate any copy errors, in case there were some but not result error.
258+
let _ = copy_res?;
230259
anyhow::Ok(())
231260
})
232261
.await?;
262+
263+
daemon.shutdown().await?;
264+
233265
Ok(())
234266
}
235267

236268
#[cfg(test)]
237269
mod tests {
238270
use anyhow::Result;
239271
use anyhow::anyhow;
272+
use ndslice::shape;
240273
use tempfile::TempDir;
241274
use tokio::fs;
242275
use tokio::net::TcpListener;
243276

244277
use super::*;
278+
use crate::alloc::AllocSpec;
279+
use crate::alloc::Allocator;
280+
use crate::alloc::local::LocalAllocator;
281+
use crate::proc_mesh::ProcMesh;
245282

246283
#[tokio::test]
247284
async fn test_simple() -> Result<()> {
@@ -259,4 +296,46 @@ mod tests {
259296

260297
Ok(())
261298
}
299+
300+
#[tokio::test]
301+
async fn test_rsync_actor_and_mesh() -> Result<()> {
302+
// Create source workspace with test files
303+
let source_workspace = TempDir::new()?;
304+
fs::write(source_workspace.path().join("test1.txt"), "content1").await?;
305+
fs::write(source_workspace.path().join("test2.txt"), "content2").await?;
306+
fs::create_dir(source_workspace.path().join("subdir")).await?;
307+
fs::write(source_workspace.path().join("subdir/test3.txt"), "content3").await?;
308+
309+
// Create target workspace for the actors
310+
let target_workspace = TempDir::new()?;
311+
312+
// Set up actor mesh with 2 RsyncActors
313+
let alloc = LocalAllocator
314+
.allocate(AllocSpec {
315+
shape: shape! { replica = 1 },
316+
constraints: Default::default(),
317+
})
318+
.await?;
319+
320+
let proc_mesh = ProcMesh::allocate(alloc).await?;
321+
322+
// Create RsyncParams - all actors will use the same target workspace for this test
323+
let params = RsyncParams {
324+
workspace: WorkspaceLocation::Constant(target_workspace.path().to_path_buf()),
325+
};
326+
327+
// Spawn actor mesh with RsyncActors
328+
let actor_mesh = proc_mesh.spawn::<RsyncActor>("rsync_test", &params).await?;
329+
330+
// Test rsync_mesh function - this coordinates rsync operations across the mesh
331+
rsync_mesh(&actor_mesh, source_workspace.path().to_path_buf()).await?;
332+
333+
// Verify we copied correctly.
334+
assert!(
335+
!dir_diff::is_different(&source_workspace, &target_workspace)
336+
.map_err(|e| anyhow!("{:?}", e))?
337+
);
338+
339+
Ok(())
340+
}
262341
}

0 commit comments

Comments
 (0)