Commit 7ba811e

pschuh and jax authors authored and committed

Support auto in shard_map.

- Pull mesh from NamedSharding when rewriting manual axes.
- Properly set manual axes in SPMDAxisContext in shard_map.
- Properly set dims as unspecified inside shard_map.

PiperOrigin-RevId: 627156892

1 parent a8ee946 · commit 7ba811e
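
Taken together, these changes let a `shard_map` body leave some mesh axes to the automatic (GSPMD) partitioner via the `auto` argument. Below is a minimal usage sketch modeled on the `test_partial_auto` case added in this commit; constructing the mesh directly from `jax.devices()` (instead of the test helper) is an assumption, and it needs at least four devices:

import numpy as np
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P
from jax.experimental.shard_map import shard_map

# Illustrative 2x2 mesh (assumes >= 4 devices). Inside the shard_map
# body, axis 'i' is manual while axis 'j' is marked auto, i.e. left to
# the partitioner.
mesh = Mesh(np.array(jax.devices()[:4]).reshape(2, 2), ('i', 'j'))

def g(x):
  # The auto axis 'j' may still appear in sharding constraints here;
  # the partitioner decides how to honor it.
  x = jax.lax.with_sharding_constraint(x, NamedSharding(mesh, P(None, 'j')))
  return x * x

@jax.jit
def f(x):
  return shard_map(g, mesh,
                   in_specs=P('i', None),   # specs name only manual axes
                   out_specs=P('i', None),
                   check_rep=False,
                   auto=frozenset({'j'}))(x)

v = jax.device_put(jnp.arange(32.).reshape(4, 8),
                   NamedSharding(mesh, P('i', 'j')))
print(f(v))  # elementwise v * v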

File tree

4 files changed (+146, -15 lines)

jax/_src/pjit.py

Lines changed: 2 additions & 0 deletions

@@ -2339,6 +2339,8 @@ def _sharding_constraint_hlo_lowering(ctx, x_node, *, sharding,
   # NamedSharding. So update the NamedSharding to have the manual axes.
   if isinstance(axis_ctx, sharding_impls.SPMDAxisContext):
     mesh = resource_env.physical_mesh
+    if mesh.empty and isinstance(sharding, NamedSharding):
+      mesh = sharding.mesh
     parsed_pspec = parse_flatten_op_sharding(
         sharding._to_xla_hlo_sharding(aval.ndim), mesh)[0]
     sharding = NamedSharding._from_parsed_pspec(

jax/_src/sharding_impls.py

Lines changed: 17 additions & 5 deletions

@@ -136,10 +136,16 @@ def is_equivalent_to(self: XLACompatibleSharding,  # type: ignore


 @functools.lru_cache
-def _check_mesh_resource_axis(mesh, parsed_pspec):
+def _check_mesh_resource_axis(mesh, parsed_pspec, _manual_axes):
   try:
-    [mesh.shape[r] for p in parsed_pspec if p is not None
-     for r in p]
+    for p in parsed_pspec:
+      if p is not None:
+        for r in p:
+          mesh.shape[r]
+          if r in _manual_axes:
+            raise ValueError(
+                f"Axis: {r} of {parsed_pspec.get_partition_spec()} "
+                f"is also found in manual_axes: {_manual_axes}.") from None
   except KeyError as e:
     raise ValueError(f"Resource axis: {e.args[0]} of {parsed_pspec.user_spec} is "
                      "undefined.") from None

@@ -184,6 +190,10 @@ def named_sharding_to_xla_hlo_sharding(
     axis_names = self.mesh.axis_names
     for manual_axis in self._manual_axes:
       special_axes[axis_names.index(manual_axis)] = xc.OpSharding.Type.MANUAL
+      if xla_extension_version < 259:
+        if manual_axis in array_mapping:  # type: ignore
+          raise ValueError(f"manual axis {repr(manual_axis)} in {repr(self)} "
+                           "cannot be used as a sharded axis")

   replicated_mesh_axes = []
   for i, (axis_name, axis_val) in enumerate(mesh_shape.items()):

@@ -1105,7 +1115,7 @@ def __repr__(self):
           f"sync={self.sync})")


-def preprocess(mesh, spec, parsed_pspec):
+def preprocess(mesh, spec, parsed_pspec, _manual_axes=frozenset()):
   # This split exists because you can pass `_parsed_pspec` that has been
   # modified from the original. For example: Adding extra dimension to
   # axis_resources for vmap handlers. In such cases you need to preserve the

@@ -1118,9 +1128,11 @@ def preprocess(mesh, spec, parsed_pspec):
         PartitionSpec() if spec is None else spec,
         "NamedSharding spec", allow_unconstrained_dims=True)

-  _check_mesh_resource_axis(mesh, parsed_pspec)
+  _check_mesh_resource_axis(mesh, parsed_pspec, _manual_axes)
   return parsed_pspec

+# fallback for c++.
+preprocess_with_manual = preprocess

 def prepare_axis_resources(axis_resources,
                            arg_name,
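
The `_manual_axes` argument threaded through `preprocess` and `_check_mesh_resource_axis` above turns a silent misuse into a clear error: inside a partial-auto `shard_map` body, a sharding constraint may not name a manual axis. A sketch of the failure mode, reusing `mesh`, `v`, `NamedSharding`, and `P` from the example near the top (this mirrors the `test_partial_auto_error_wsc_manual` case added below):

# 'i' is manual inside the body (only 'j' is auto), so constraining the
# intermediate on 'i' is rejected by the new _manual_axes check.
def g_bad(x):
  x = jax.lax.with_sharding_constraint(x, NamedSharding(mesh, P('i', 'j')))
  return x * x

f_bad = jax.jit(shard_map(g_bad, mesh,
                          in_specs=P('i', None), out_specs=P('i', None),
                          check_rep=False, auto=frozenset({'j'})))
try:
  f_bad(v)
except ValueError as e:
  print(e)  # mentions that 'i' "is also found in manual_axes"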

jax/experimental/shard_map.py

Lines changed: 36 additions & 10 deletions

@@ -93,9 +93,12 @@ def _shard_map(f: Callable, mesh: Mesh, in_specs: Specs,
   if not isinstance(mesh, Mesh):
     raise TypeError("shard_map requires a `jax.sharding.Mesh` instance for its "
                     f"second argument, but got {mesh} of type {type(mesh)}.")
-  _check_specs(SpecErrorType.input, in_specs)
+  if not auto.issubset(mesh.axis_names):
+    raise ValueError(f"shard_map requires auto={auto} to be a subset of "
+                     f"mesh.axis_names={mesh.axis_names}")
+  _check_specs(SpecErrorType.input, in_specs, auto)
   if not callable(out_specs):
-    _check_specs(SpecErrorType.out, out_specs)
+    _check_specs(SpecErrorType.out, out_specs, auto)

   @util.wraps(f)
   @traceback_util.api_boundary

@@ -114,7 +117,7 @@ def wrapped(*args):
     def out_names_thunk():
       if callable(out_specs):
         out_specs_ = out_specs()
-        _check_specs(SpecErrorType.out, out_specs_)
+        _check_specs(SpecErrorType.out, out_specs_, auto)
       else:
         out_specs_ = out_specs
       dummy = tree_unflatten(out_tree(), [object()] * out_tree().num_leaves)

@@ -162,17 +165,40 @@ def _canonicalize_spec(spec: PartitionSpec) -> AxisNames:

 SpecErrorType = enum.Enum('SpecErrorType', ['input', 'out'])

-def _check_specs(error_type: SpecErrorType, specs: Any) -> None:
+def _check_specs(error_type: SpecErrorType, specs: Any, auto) -> None:
   if error_type == SpecErrorType.input and specs is None:
     raise TypeError(
         "shard_map in_specs argument must be a pytree of "
         "`jax.sharding.PartitionSpec` instances, but it was None.\n"
         "Instead of `in_specs=None`, did you mean `in_specs=P()`, "
         "where `P = jax.sharding.PartitionSpec`?")
-  if all(isinstance(p, PartitionSpec) for p in tree_leaves(specs)): return
+  def check_spec(p):
+    if not isinstance(p, PartitionSpec):
+      return False
+    for names in p:
+      if not isinstance(names, tuple):
+        names = (names,)
+      for name in names:
+        if name in auto:
+          return False
+    return True
+  if all(check_spec(p) for p in tree_leaves(specs)): return
   prefix = 'in' if error_type == SpecErrorType.input else 'out'
   msgs = [f"  {prefix}_specs{keystr(key)} is {x} of type {type(x).__name__}, "
           for key, x in generate_key_paths(specs) if not isinstance(x, P)]
+  if not msgs:
+    for key, p in generate_key_paths(specs):
+      for names in p:
+        if not isinstance(names, tuple):
+          names = (names,)
+        for name in names:
+          if name in auto:
+            msgs.append(f"  {prefix}_specs{keystr(key)} refers to {repr(name)}")
+    raise ValueError(
+        f"shard_map {prefix}_specs argument cannot refer to an axis "
+        f"marked auto ({auto}), but:\n\n"
+        + '\n\n'.join(msgs) + '\n\n'
+        f"Check the {prefix}_specs values passed to shard_map.")
   raise TypeError(
       f"shard_map {prefix}_specs argument must be a pytree of "
       f"`jax.sharding.PartitionSpec` instances, but:\n\n"

@@ -549,7 +575,7 @@ def _shard_map_lowering(ctx, *in_nodes, jaxpr, mesh, in_names, out_names,
   in_nodes_ = map(partial(_xla_shard, ctx, mesh, auto), in_names, ctx.avals_in,
                   in_avals_, in_nodes)
   new_axis_context = sharding_impls.SPMDAxisContext(
-      mesh, frozenset(mesh.axis_names)
+      mesh, frozenset(mesh.axis_names) - auto
   )
   sub_ctx = ctx.module_context.replace(axis_context=new_axis_context)
   with core.extend_axis_env_nd(tuple(mesh.shape.items())):

@@ -575,20 +601,20 @@ def _xla_shard(ctx: mlir.LoweringRuleContext, mesh, auto, names,
   unspecified = set(range(aval_in.ndim)) if auto else set()
   sx = mlir.wrap_with_sharding_op(ctx, x, aval_in, shard_proto,  # type: ignore
                                   unspecified_dims=unspecified)
-  return [mlir.wrap_with_full_to_shard_op(ctx, sx, aval_out, manual_proto, set())]
+  return [mlir.wrap_with_full_to_shard_op(ctx, sx, aval_out, manual_proto, unspecified)]

 def _xla_unshard(ctx: mlir.LoweringRuleContext, mesh, auto, names,
                  aval_in, aval_out, xs):
   x, = xs
-  manual_proto = pxla.manual_proto(aval_in, frozenset(mesh.axis_names) - auto, mesh)
-  sx = mlir.wrap_with_sharding_op(ctx, x, aval_in, manual_proto, unspecified_dims=set())
   axes = {name: i for i, ns in names.items() for name in ns}
   ns = NamedSharding(mesh, sharding_impls.array_mapping_to_axis_resources(axes))  # type: ignore
   if dtypes.issubdtype(aval_out.dtype, dtypes.extended):
     ns = aval_out.dtype._rules.physical_sharding(aval_out, ns)
     aval_out = core.physical_aval(aval_out)
-  shard_proto = ns._to_xla_hlo_sharding(aval_out.ndim).to_proto()
   unspecified = set(range(aval_out.ndim)) if auto else set()
+  manual_proto = pxla.manual_proto(aval_in, frozenset(mesh.axis_names) - auto, mesh)
+  sx = mlir.wrap_with_sharding_op(ctx, x, aval_in, manual_proto, unspecified_dims=unspecified)
+  shard_proto = ns._to_xla_hlo_sharding(aval_out.ndim).to_proto()
   return mlir.wrap_with_shard_to_full_op(ctx, sx, aval_out, shard_proto,
                                          unspecified)  # type: ignore

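With the `_check_specs` change above, `in_specs` and `out_specs` may only name manual axes; naming an axis listed in `auto` is rejected eagerly, before any tracing. A sketch of that failure mode, again with the illustrative `mesh` and `P` from earlier:

# Naming the auto axis 'j' in in_specs is an error: those dimensions
# belong to the partitioner, not to shard_map. The check fires as soon
# as shard_map is applied.
try:
  shard_map(lambda x: x * x, mesh,
            in_specs=P('i', 'j'),     # 'j' is marked auto: rejected
            out_specs=P('i', None),
            check_rep=False, auto=frozenset({'j'}))
except ValueError as e:
  print(e)  # "shard_map in_specs argument cannot refer to an axis marked auto ..."
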
tests/shard_map_test.py

Lines changed: 91 additions & 0 deletions

@@ -1625,6 +1625,97 @@ def f(inputs):
     jtu.check_grads(f, (list(jnp.arange(float(num_args))[:,None]),), order=1,
                     modes=['rev'], atol=1e-3, rtol=1e-3)

+  def test_partial_auto(self):
+    mesh = jtu.create_global_mesh((2, 2), ('i', 'j'))
+
+    def g(x):
+      x = jax.lax.with_sharding_constraint(
+          x, jax.sharding.NamedSharding(mesh, P(None, 'j')))
+      return x * x
+
+    @jax.jit
+    def f(x):
+      x = shard_map(g, mesh,
+                    in_specs=P('i', None),
+                    out_specs=P('i', None),
+                    check_rep=False,
+                    auto=frozenset({'j'}))(x)
+      return jax.lax.with_sharding_constraint(
+          x, jax.sharding.NamedSharding(mesh, P('i', 'j')))
+
+    v = jnp.arange(32.).reshape(4, 8)
+    v = jax.device_put(v, jax.sharding.NamedSharding(mesh, P('i', 'j')))
+    self.assertAllClose(v*v, f(v), check_dtypes=False)
+
+  def test_partial_auto_error_wsc_manual(self):
+    mesh = jtu.create_global_mesh((2, 2), ('i', 'j'))
+
+    def g(x):
+      x = jax.lax.with_sharding_constraint(
+          x, jax.sharding.NamedSharding(mesh, P('i', 'j')))
+      return x * x
+
+    @jax.jit
+    def f(x):
+      x = shard_map(g, mesh,
+                    in_specs=P('i', None),
+                    out_specs=P('i', None),
+                    check_rep=False,
+                    auto=frozenset({'j'}))(x)
+      return jax.lax.with_sharding_constraint(
+          x, jax.sharding.NamedSharding(mesh, P('i', 'j')))
+
+    v = jnp.arange(32.).reshape(4, 8)
+    v = jax.device_put(v, jax.sharding.NamedSharding(mesh, P('i', 'j')))
+    with self.assertRaisesRegex(ValueError, "manual"):
+      f(v)
+
+  def test_partial_auto_error_invalid_auto(self):
+    mesh = jtu.create_global_mesh((2, 2), ('i', 'j'))
+
+    def g(x):
+      x = jax.lax.with_sharding_constraint(
+          x, jax.sharding.NamedSharding(mesh, P('i', 'j')))
+      return x * x
+
+    @jax.jit
+    def f(x):
+      x = shard_map(g, mesh,
+                    in_specs=P('i', None),
+                    out_specs=P('i', None),
+                    check_rep=False,
+                    auto=frozenset({'k'}))(x)
+      return jax.lax.with_sharding_constraint(
+          x, jax.sharding.NamedSharding(mesh, P('i', 'j')))
+
+    v = jnp.arange(32.).reshape(4, 8)
+    v = jax.device_put(v, jax.sharding.NamedSharding(mesh, P('i', 'j')))
+    with self.assertRaisesRegex(ValueError, "to be a subset of mesh.axis_names"):
+      f(v)
+
+  def test_partial_auto_error_wrong_in_specs(self):
+    mesh = jtu.create_global_mesh((2, 2), ('i', 'j'))
+
+    def g(x):
+      x = jax.lax.with_sharding_constraint(
+          x, jax.sharding.NamedSharding(mesh, P('i', 'j')))
+      return x * x
+
+    @jax.jit
+    def f(x):
+      x = shard_map(g, mesh,
+                    in_specs=P('i', 'j'),
+                    out_specs=P('i', None),
+                    check_rep=False,
+                    auto=frozenset({'j'}))(x)
+      return jax.lax.with_sharding_constraint(
+          x, jax.sharding.NamedSharding(mesh, P('i', 'j')))
+
+    v = jnp.arange(32.).reshape(4, 8)
+    v = jax.device_put(v, jax.sharding.NamedSharding(mesh, P('i', 'j')))
+    with self.assertRaisesRegex(ValueError, "in_specs refers to 'j'"):
+      f(v)
+

 class FunSpec(NamedTuple):
   name: str