Skip to content

Commit e097f59

Browse files
andrewjcg and facebook-github-bot
authored and committed
Invert rsync architecture (#467)
Summary: Pull Request resolved: #467 This inverts the rsync setup, so that the rsync daemon runs on the "client", and the rsync clients run on the actors. This helps in a couple of ways: - We don't leave an rsync daemon running on the actors. - We avoid spawning lots of sub-processes for each rsync client on the "client". - The rsync client supports reporting file changes, which subsequent changes can use to facilitate things like targeted module reloading (based on the actual things that were changed). Differential Revision: D77952087
1 parent 1e06a7c commit e097f59

File tree

1 file changed

+114
-35
lines changed

1 file changed

+114
-35
lines changed

hyperactor_mesh/src/code_sync/rsync.rs

Lines changed: 114 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,21 @@ use std::time::Duration;
1515

1616
use anyhow::Context;
1717
use anyhow::Result;
18+
use anyhow::anyhow;
1819
use anyhow::bail;
1920
use anyhow::ensure;
2021
use async_trait::async_trait;
2122
use futures::StreamExt;
23+
use futures::TryFutureExt;
2224
use futures::TryStreamExt;
2325
use futures::stream;
2426
use futures::try_join;
2527
use hyperactor::Actor;
28+
use hyperactor::Bind;
2629
use hyperactor::Handler;
2730
use hyperactor::Named;
31+
use hyperactor::OncePortRef;
32+
use hyperactor::Unbind;
2833
use hyperactor::clock::Clock;
2934
use hyperactor::clock::RealClock;
3035
use nix::sys::signal;
@@ -56,8 +61,8 @@ pub async fn do_rsync(addr: &SocketAddr, workspace: &Path) -> Result<()> {
5661
.arg("--delay-updates")
5762
.arg("--exclude=.rsync-tmp.*")
5863
.arg(format!("--partial-dir=.rsync-tmp.{}", addr.port()))
59-
.arg(format!("{}/", workspace.display()))
6064
.arg(format!("rsync://{}/workspace", addr))
65+
.arg(format!("{}/", workspace.display()))
6166
.stderr(Stdio::piped())
6267
.output()
6368
.await?;
@@ -89,8 +94,8 @@ impl RsyncDaemon {
8994
path = {workspace}
9095
use chroot = no
9196
list = no
92-
read only = false
93-
write only = true
97+
read only = true
98+
write only = false
9499
uid = {uid}
95100
hosts allow = localhost
96101
"#,
@@ -158,43 +163,58 @@ impl RsyncDaemon {
158163
}
159164
}
160165

166+
#[derive(Debug, Named, Serialize, Deserialize, Bind, Unbind)]
167+
pub struct RsyncMessage {
168+
/// The connect message to create a duplex bytestream with the client.
169+
pub connect: Connect,
170+
/// A port to send back the result of the rsync: `Ok(())` on success, or an error message on failure.
171+
pub result: OncePortRef<Result<(), String>>,
172+
}
173+
161174
#[derive(Debug, Named, Serialize, Deserialize)]
162175
pub struct RsyncParams {
163176
pub workspace: WorkspaceLocation,
164177
}
165178

166179
#[derive(Debug)]
167-
#[hyperactor::export(
168-
spawn = true,
169-
handlers = [Connect { cast = true }],
170-
)]
180+
#[hyperactor::export(spawn = true, handlers = [RsyncMessage { cast = true }])]
171181
pub struct RsyncActor {
172-
daemon: RsyncDaemon,
182+
workspace: WorkspaceLocation,
183+
//daemon: RsyncDaemon,
173184
}
174185

175186
#[async_trait]
176187
impl Actor for RsyncActor {
177188
type Params = RsyncParams;
178189

179190
async fn new(RsyncParams { workspace }: Self::Params) -> Result<Self> {
180-
let workspace = workspace.resolve()?;
181-
let daemon = RsyncDaemon::spawn(TcpListener::bind(("::1", 0)).await?, &workspace).await?;
182-
Ok(Self { daemon })
191+
Ok(Self { workspace })
183192
}
184193
}
185194

186195
#[async_trait]
187-
impl Handler<Connect> for RsyncActor {
196+
impl Handler<RsyncMessage> for RsyncActor {
188197
async fn handle(
189198
&mut self,
190199
cx: &hyperactor::Context<Self>,
191-
message: Connect,
200+
RsyncMessage { connect, result }: RsyncMessage,
192201
) -> Result<(), anyhow::Error> {
193-
let (mut local, mut stream) = try_join!(
194-
async { Ok(TcpStream::connect(self.daemon.addr()).await?) },
195-
accept(cx, cx.self_id().clone(), message),
196-
)?;
197-
tokio::io::copy_bidirectional(&mut local, &mut stream).await?;
202+
let res = async {
203+
let workspace = self.workspace.resolve()?;
204+
let (listener, mut stream) = try_join!(
205+
TcpListener::bind(("::1", 0)).err_into(),
206+
accept(cx, cx.self_id().clone(), connect),
207+
)?;
208+
let addr = listener.local_addr()?;
209+
try_join!(do_rsync(&addr, &workspace), async move {
210+
let (mut local, _) = listener.accept().await?;
211+
tokio::io::copy_bidirectional(&mut stream, &mut local).await?;
212+
anyhow::Ok(())
213+
},)?;
214+
anyhow::Ok(())
215+
}
216+
.await;
217+
result.send(cx, res.map_err(|e| format!("{:#?}", e)))?;
198218
Ok(())
199219
}
200220
}
@@ -203,45 +223,62 @@ pub async fn rsync_mesh<M>(actor_mesh: &M, workspace: PathBuf) -> Result<()>
203223
where
204224
M: ActorMesh<Actor = RsyncActor>,
205225
{
226+
// Spawn an rsync daemon to accept incoming connections from actors.
227+
let daemon = RsyncDaemon::spawn(TcpListener::bind(("::1", 0)).await?, &workspace).await?;
228+
let daemon_addr = daemon.addr();
229+
206230
// We avoid casting here as we need point-to-point connections to each individual actor.
207231
stream::iter(actor_mesh.iter_actor_refs())
232+
.map(anyhow::Ok)
208233
// Connect to all actors in the mesh.
209-
.map(|actor| async move {
234+
.try_for_each_concurrent(None, |actor| async move {
210235
let mailbox = actor_mesh.proc_mesh().client();
211236
let (connect, completer) = Connect::allocate(mailbox.actor_id().clone(), mailbox);
212-
actor.send(mailbox, connect)?;
213-
completer.complete().await
214-
})
215-
// Max the connections run in parallel.
216-
.buffer_unordered(usize::MAX)
217-
// Initiate the rsync, in parallel.
218-
.try_for_each_concurrent(None, |mut local| async {
219-
let workspace = workspace.clone();
220-
let listener = TcpListener::bind(("::1", 0)).await?;
221-
let addr = listener.local_addr()?;
222-
try_join!(
223-
async move { do_rsync(&addr, &workspace).await },
224-
async move {
225-
let (mut stream, _) = listener.accept().await?;
226-
tokio::io::copy_bidirectional(&mut stream, &mut local).await?;
227-
anyhow::Ok(())
237+
let (tx, rx) = mailbox.open_once_port::<Result<(), String>>();
238+
actor.send(
239+
mailbox,
240+
RsyncMessage {
241+
connect,
242+
result: tx.bind(),
228243
},
229244
)?;
245+
let (mut local, mut stream) = try_join!(
246+
TcpStream::connect(daemon_addr.clone()).err_into(),
247+
completer.complete(),
248+
)?;
249+
// Pipe the remote rsync client to the local rsync server, but don't propagate failures yet.
250+
let copy_res = tokio::io::copy_bidirectional(&mut local, &mut stream).await;
251+
// Now wait for the final result to be sent back. We wrap in a timeout, as we should get this
252+
// back pretty quickly after the copy above is done.
253+
let () = RealClock
254+
.timeout(Duration::from_secs(1), rx.recv())
255+
.await??
256+
.map_err(|err| anyhow!("failure from {}: {}", actor.actor_id(), err))?;
257+
// Finally, propagate any copy error, in case the copy failed but the actor reported no error.
258+
let _ = copy_res?;
230259
anyhow::Ok(())
231260
})
232261
.await?;
262+
263+
daemon.shutdown().await?;
264+
233265
Ok(())
234266
}
235267

236268
#[cfg(test)]
237269
mod tests {
238270
use anyhow::Result;
239271
use anyhow::anyhow;
272+
use ndslice::shape;
240273
use tempfile::TempDir;
241274
use tokio::fs;
242275
use tokio::net::TcpListener;
243276

244277
use super::*;
278+
use crate::alloc::AllocSpec;
279+
use crate::alloc::Allocator;
280+
use crate::alloc::local::LocalAllocator;
281+
use crate::proc_mesh::ProcMesh;
245282

246283
#[tokio::test]
247284
async fn test_simple() -> Result<()> {
@@ -259,4 +296,46 @@ mod tests {
259296

260297
Ok(())
261298
}
299+
300+
#[tokio::test]
301+
async fn test_rsync_actor_and_mesh() -> Result<()> {
302+
// Create source workspace with test files
303+
let source_workspace = TempDir::new()?;
304+
fs::write(source_workspace.path().join("test1.txt"), "content1").await?;
305+
fs::write(source_workspace.path().join("test2.txt"), "content2").await?;
306+
fs::create_dir(source_workspace.path().join("subdir")).await?;
307+
fs::write(source_workspace.path().join("subdir/test3.txt"), "content3").await?;
308+
309+
// Create target workspace for the actors
310+
let target_workspace = TempDir::new()?;
311+
312+
// Set up an actor mesh with a single RsyncActor (the shape below is `replica = 1`)
313+
let alloc = LocalAllocator
314+
.allocate(AllocSpec {
315+
shape: shape! { replica = 1 },
316+
constraints: Default::default(),
317+
})
318+
.await?;
319+
320+
let proc_mesh = ProcMesh::allocate(alloc).await?;
321+
322+
// Create RsyncParams - all actors will use the same target workspace for this test
323+
let params = RsyncParams {
324+
workspace: WorkspaceLocation::Constant(target_workspace.path().to_path_buf()),
325+
};
326+
327+
// Spawn actor mesh with RsyncActors
328+
let actor_mesh = proc_mesh.spawn::<RsyncActor>("rsync_test", &params).await?;
329+
330+
// Test rsync_mesh function - this coordinates rsync operations across the mesh
331+
rsync_mesh(&actor_mesh, source_workspace.path().to_path_buf()).await?;
332+
333+
// Verify we copied correctly.
334+
assert!(
335+
!dir_diff::is_different(&source_workspace, &target_workspace)
336+
.map_err(|e| anyhow!("{:?}", e))?
337+
);
338+
339+
Ok(())
340+
}
262341
}

0 commit comments

Comments
 (0)