diff --git a/Cargo.toml b/Cargo.toml
index 9c623529..93aaa71c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,6 +14,3 @@ members = [
     "nccl-sys",
     "torch-sys",
 ]
-
-[profile.release]
-incremental = true
diff --git a/monarch_tensor_worker/src/stream.rs b/monarch_tensor_worker/src/stream.rs
index 1ee4e7a7..c8d409c3 100644
--- a/monarch_tensor_worker/src/stream.rs
+++ b/monarch_tensor_worker/src/stream.rs
@@ -984,7 +984,7 @@ impl StreamActor {
                 }
                 let err = err.unwrap_dependent_error().unwrap_or(err);
                 WorkerError {
-                    backtrace: format!("{:?}", err),
+                    backtrace: err.to_string(),
                     worker_actor_id: worker_actor_id.clone(),
                 }
             })
diff --git a/python/monarch/_src/actor/proc_mesh.py b/python/monarch/_src/actor/proc_mesh.py
index d0c3fc79..511c4da6 100644
--- a/python/monarch/_src/actor/proc_mesh.py
+++ b/python/monarch/_src/actor/proc_mesh.py
@@ -336,7 +336,10 @@ async def proc_mesh_nonblocking(
 ) -> ProcMesh:
     if gpus is None:
         gpus = _local_device_count()
-    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+    # hosts must come before gpus here, otherwise test_remote_function_all_gather
+    # fails: it expects that hosts comes before gpus in the order of the
+    # dimensions.
+    spec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
     env = env or {}
     cmd, args, base_env = _get_bootstrap_args()
     env.update(base_env)
@@ -350,7 +353,10 @@ def proc_mesh_blocking(
 ) -> ProcMesh:
     if gpus is None:
         gpus = _local_device_count()
-    spec = AllocSpec(AllocConstraints(), gpus=gpus, hosts=hosts)
+    # hosts must come before gpus here, otherwise test_remote_function_all_gather
+    # fails: it expects that hosts comes before gpus in the order of the
+    # dimensions.
+    spec = AllocSpec(AllocConstraints(), hosts=hosts, gpus=gpus)
     env = env or {}
     cmd, args, base_env = _get_bootstrap_args()
     env.update(base_env)
diff --git a/python/monarch/mesh_controller.py b/python/monarch/mesh_controller.py
index 04225fe1..b36e37a9 100644
--- a/python/monarch/mesh_controller.py
+++ b/python/monarch/mesh_controller.py
@@ -124,6 +124,9 @@ def _initialize_env(worker_point: Point, proc_id: str) -> None:
         }
         os.environ.update(process_env)
         pdb.set_trace = _set_trace
+        # workaround for set_manual_seed somehow not working if cuda is not initialized
+        if torch.cuda.is_available():
+            torch.cuda.init()
     except Exception:
         traceback.print_exc()
         raise
@@ -248,7 +251,7 @@ def __str__(self):
             return (
                 f"A remote function has failed asynchronously on rank {self.rank}.\n"
                 f"Traceback of where the remote function was issued on controller (most recent call last):\n{controller_tb}"
-                f"Error as reported from worker!!!!!!!:\n{self.worker_error_string}"
+                f"Error as reported from worker:\n{self.worker_error_string}"
             )
         except Exception:
             traceback.print_exc()
diff --git a/python/tests/test_python_actors.py b/python/tests/test_python_actors.py
index db597bb9..7092844e 100644
--- a/python/tests/test_python_actors.py
+++ b/python/tests/test_python_actors.py
@@ -491,7 +491,7 @@ def _patch_output(msg):
         rank, coords, _, _, function, lineno = breakpoints[i]
         initial_linenos[rank] = lineno
         assert rank == i
-        assert coords == {"hosts": rank % 2, "gpus": rank // 2}
+        assert coords == {"hosts": rank // 2, "gpus": rank % 2}
         assert function == "test_python_actors._debugee_actor_internal"
         assert lineno == breakpoints[0][5] + 4 * rank
 
diff --git a/python/tests/test_remote_functions.py b/python/tests/test_remote_functions.py
index 7b1b3398..d085c45a 100644
--- a/python/tests/test_remote_functions.py
+++ b/python/tests/test_remote_functions.py
@@ -185,7 +185,9 @@ def local_device_mesh(
 # out is not counted as a failure, so we set a more restrictive timeout to
 # ensure we see a hard failure in CI.
 @pytest.mark.timeout(120)
-@pytest.mark.parametrize("backend_type", [BackendType.PY, BackendType.RS])
+@pytest.mark.parametrize(
+    "backend_type", [BackendType.PY, BackendType.RS, BackendType.MESH]
+)
 class TestRemoteFunctions(RemoteFunctionsTestBase):
     @classmethod
     def do_test_reduce_scatter_tensor(cls, backend_type, reduce_op, expected_tensor):
@@ -952,10 +954,13 @@ def test_remote_function_failure_message_contains_traceback(self, backend_type):
             x = outer_remote_function_that_calls_inner()
             try:
                 inspect(x)
-            except RemoteException as e:
+            except OldRemoteException as e:
                 backtrace = "\n".join([frame.name for frame in e.worker_frames])
                 assert "outer_remote_function" in backtrace
                 assert "inner_remote_function" in backtrace
+            except NewRemoteException as e:
+                assert "outer_remote_function" in e.worker_error_string
+                assert "inner_remote_function" in e.worker_error_string
 
     def test_remote_function_broadcast(self, backend_type):
         with self.local_device_mesh(2, 2, backend_type) as device_mesh:
diff --git a/torch-sys/Cargo.toml b/torch-sys/Cargo.toml
index aee7d187..f8529719 100644
--- a/torch-sys/Cargo.toml
+++ b/torch-sys/Cargo.toml
@@ -12,6 +12,7 @@ links = "torch"
 anyhow = "1.0.98"
 async-trait = "0.1.86"
 atomic_refcell = "0.1.13"
+bincode = "1.3.3"
 cxx = "1.0.119"
 derive_more = { version = "1.0.0", features = ["full"] }
 monarch_types = { version = "0.0.0", path = "../monarch_types" }
@@ -24,9 +25,6 @@ thiserror = "2.0.12"
 tokio = { version = "1.45.0", features = ["full", "test-util", "tracing"] }
 tracing = { version = "0.1.41", features = ["attributes", "valuable"] }
 
-[dev-dependencies]
-bincode = "1.3.3"
-
 [build-dependencies]
 bindgen = "0.70.1"
 cxx-build = "1.0.119"
diff --git a/torch-sys/src/ivalue.rs b/torch-sys/src/ivalue.rs
index 592d58cd..34e2217f 100644
--- a/torch-sys/src/ivalue.rs
+++ b/torch-sys/src/ivalue.rs
@@ -150,7 +150,8 @@ impl Clone for OpaqueIValue {
     /// This creates a deep copy of the underlying data and can be expensive.
     /// It might also panic if the `IValue` is not cloneable.
     fn clone(&self) -> Self {
-        Self(ffi::ivalue_deepcopy(&self.0).unwrap())
+        let serialized = bincode::serialize(&self.0).unwrap();
+        bincode::deserialize(&serialized).unwrap()
     }
 }