Commit 01412f7

pbroadcast
1 parent e7eb207 commit 01412f7

3 files changed: +98 −1 lines changed

jax/_src/lax/parallel.py

Lines changed: 67 additions & 0 deletions
@@ -247,6 +247,36 @@ def _canonicalize_axis_index_groups(axis_index_groups):
     return
   return tuple(map(tuple, axis_index_groups))
 
+
+def pbroadcast(x, axis_name, source):
+  """Perform a collective broadcast and replicate from ``source``.
+
+  This is equivalent to
+  ```
+  def pbroadcast(x, axis_name, source):
+    masked = jnp.where(axis_index(axis_name) == source, x, zeros_like(x))
+    return psum(masked, axis_name)
+  ```
+  but implemented in a hardware-optimized way.
+
+  If ``x`` is a pytree then the result is equivalent to mapping this function
+  to each leaf in the tree.
+
+  This function is an analog of the CollectiveBroadcast HLO.
+
+  Args:
+    x: array(s) with a mapped axis named ``axis_name``.
+    axis_name: hashable Python object used to name a pmapped axis (see the
+      :func:`jax.pmap` documentation for more details).
+    source: int, the index along ``axis_name`` whose value should be copied.
+
+  Returns:
+    Array(s) with the value of ``x`` from the ``source`` index of ``axis_name``.
+  """
+  return tree_util.tree_map(
+      partial(pbroadcast_p.bind, axis_name=axis_name, source=source), x)
+
+
 def ppermute(x, axis_name, perm):
   """Perform a collective permutation according to the permutation ``perm``.
 
@@ -927,6 +957,43 @@ def _collective_batcher(prim, args, dims, **params):
 batching.axis_primitive_batchers[ppermute_p] = _ppermute_batcher
 core.axis_substitution_rules[ppermute_p] = partial(_subst_all_names_in_param, 'axis_name')
 
+def _pbroadcast_transpose_rule(t, x, source, axis_name):
+  is_source = axis_index(axis_name) == source
+  tsum = psum(t, axis_name)
+  return [lax_numpy.where(is_source, tsum, lax_numpy.zeros_like(t))]
+
+def _pbroadcast_batcher(axis_size, frame_name, _, vals_in, dims_in, axis_name, source):
+  (v,), (d,) = vals_in, dims_in
+  if not isinstance(axis_name, (tuple, list)):
+    axis_name = (axis_name,)
+  remaining_axes = tuple(axis for axis in axis_name if axis != frame_name)
+  if remaining_axes:
+    raise NotImplementedError("pbroadcast batcher only supports a single axis")
+  assert axis_name[0] == frame_name, "pbroadcast batcher called with a wrong axis!"
+  assert source >= 0 and source < axis_size, "collective broadcast doesn't fit in the axis size!"
+  if axis_size == 1 and remaining_axes:
+    return pbroadcast_p.bind(v, source=source, axis_name=remaining_axes), d
+  if d is batching.not_mapped:
+    return v, d
+  return lax_numpy.take(v, [source] * axis_size, d), d
+
+def _pbroadcast_lowering(ctx, x, *, axis_name, source):
+  replica_groups = _replica_groups(ctx.module_context.axis_env, axis_name, None)
+  def source_to_front(group):
+    return [group[source]] + list(group[:source]) + list(group[source + 1:])
+  replica_groups = [source_to_front(group) for group in replica_groups]
+  channel = ctx.module_context.new_channel()
+  return hlo.CollectiveBroadcastOp(
+      x, replica_groups=_replica_groups_hlo(replica_groups)).results
+
+pbroadcast_p = core.AxisPrimitive('pbroadcast')
+pbroadcast_p.def_abstract_eval(lambda x, **params: raise_to_shaped(x))
+ad.deflinear2(pbroadcast_p, _pbroadcast_transpose_rule)
+mlir.register_lowering(pbroadcast_p, _pbroadcast_lowering)
+batching.primitive_batchers[pbroadcast_p] = partial(_collective_batcher, pbroadcast_p)
+batching.axis_primitive_batchers[pbroadcast_p] = _pbroadcast_batcher
+core.axis_substitution_rules[pbroadcast_p] = partial(_subst_all_names_in_param, 'axis_name')
+
 
 def _moveaxis(src, dst, x):
   perm = [i for i in range(x.ndim) if i != src]
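
For orientation, the equivalence claimed in the docstring can be exercised directly. The following is a minimal sketch, not part of the commit: it compares `lax.pbroadcast` against the masked-`psum` reference formulation under `jax.pmap`. It assumes a jax build that includes this commit and a multi-device backend where the CollectiveBroadcast HLO is supported (the tests in this commit skip CPU and TPU).

```python
# Illustrative sketch, not part of the commit. Assumes a backend that supports
# CollectiveBroadcast (the commit's tests skip CPU and TPU).
import jax
import jax.numpy as jnp
from jax import lax

def reference_pbroadcast(x, axis_name, source):
  # The docstring's reference semantics: zero out every shard except `source`,
  # then psum so every shard ends up holding the source shard's value.
  masked = jnp.where(lax.axis_index(axis_name) == source, x, jnp.zeros_like(x))
  return lax.psum(masked, axis_name)

n = jax.device_count()
x = jnp.arange(4 * n, dtype=jnp.float32).reshape((n, 4))
got = jax.pmap(lambda v: lax.pbroadcast(v, axis_name='i', source=0), 'i')(x)
want = jax.pmap(lambda v: reference_pbroadcast(v, 'i', 0), 'i')(x)
assert (got == want).all() and (got == x[0]).all()  # every shard holds shard 0
```

Note the `source_to_front` step in the lowering: CollectiveBroadcast broadcasts from the first replica of each replica group, so the rule rotates `source` to the front of every group instead of emitting a mask-and-reduce.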

jax/lax/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -342,6 +342,7 @@
   all_to_all_p as all_to_all_p,
   axis_index as axis_index,
   axis_index_p as axis_index_p,
+  pbroadcast as pbroadcast,
   pmax as pmax,
   pmax_p as pmax_p,
   pmean as pmean,

tests/pmap_test.py

Lines changed: 30 additions & 1 deletion
@@ -1109,12 +1109,41 @@ def testAxisGroups(self):
     self.assertEqual((tuple(sorted(groups[0])),),
                      ((0, 1, 2, 3, 4, 5, 6, 7,),))  # order doesn't matter
 
+  @jtu.skip_on_devices("cpu", "tpu")
+  def testCollectiveBroadcast(self):
+    device_count = jax.device_count()
+    f = lambda x: lax.pbroadcast(x, source=0, axis_name='i')
+    f = self.pmap(f, 'i')
+    x = jnp.arange(4 * device_count).reshape((device_count, 4))
+    ans = f(x)
+    expected = np.take(x, [0] * device_count, axis=0)
+    self.assertAllClose(ans, expected, check_dtypes=False)
+
+  @jtu.skip_on_devices("cpu", "tpu")
+  def testCollectiveBroadcastVmap(self):
+    device_count = jax.device_count()
+    f = lambda x: lax.pbroadcast(x, source=0, axis_name='i')
+    x = np.arange(device_count * 16, dtype=np.float32)
+    x = x.reshape((device_count, 4, 4))
+    ans = self.pmap(vmap(f), 'i')(x)
+    expected = jnp.broadcast_to(x[0:1], x.shape)
+    self.assertAllClose(ans, expected, check_dtypes=False)
+
+  @jtu.skip_on_devices("cpu", "tpu")
+  def testCollectiveBroadcastGrad(self):
+    device_count = jax.device_count()
+    f = lambda x: lax.pbroadcast(x, source=0, axis_name='i')
+    x = np.arange(device_count, dtype=np.float32)
+    ans = self.pmap(grad(f), 'i')(x)
+    expected = np.zeros_like(x)
+    expected[0] = device_count
+    self.assertAllClose(ans, expected, check_dtypes=False)
+
   def testCollectivePermute(self):
     device_count = jax.device_count()
     rotation = [(i, (i + 1) % device_count) for i in range(device_count)]
     f = lambda x: lax.ppermute(x, perm=rotation, axis_name='i')
     f = self.pmap(f, 'i')
-
     x = jnp.arange(4 * device_count).reshape((device_count, 4))
     ans = f(x)
     expected = np.roll(x, shift=1, axis=0)
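
The expected value in `testCollectiveBroadcastGrad` follows from linearity: the transpose rule added above (`_pbroadcast_transpose_rule`) psums the incoming cotangents and deposits the total on the source shard, so `device_count` cotangents of 1 land on index 0 and every other index gets 0. Below is a small sketch of that arithmetic, not part of the commit; it uses the docstring's masked-psum formulation rather than the new primitive, so it runs on any backend.

```python
# Illustrative sketch, not part of the commit: reproduces the gradient that
# testCollectiveBroadcastGrad expects, via the reference formulation.
import jax
import jax.numpy as jnp
import numpy as np
from jax import lax

def reference_pbroadcast(x, axis_name, source):
  masked = jnp.where(lax.axis_index(axis_name) == source, x, jnp.zeros_like(x))
  return lax.psum(masked, axis_name)

n = jax.device_count()
x = np.arange(n, dtype=np.float32)
# Each shard's scalar output is x[0]; transposing the psum sends a cotangent
# of 1 from every shard back to shard 0, giving a gradient of n there, 0 elsewhere.
g = jax.pmap(jax.grad(lambda v: reference_pbroadcast(v, 'i', 0)), 'i')(x)
expected = np.zeros_like(x)
expected[0] = n
np.testing.assert_allclose(np.asarray(g), expected)
```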
