Fix H, W ordering for xy indexing in ROPE #2541

Merged: 2 commits, Jul 9, 2025
timm/layers/pos_embed_sincos.py (26 changes: 18 additions & 8 deletions)
@@ -78,6 +78,12 @@ def build_sincos2d_pos_embed(
     return pos_emb.to(dtype=dtype)
 
 
+def swap_shape_xy(seq: List[int]) -> List[int]:
+    if len(seq) < 2:
+        return seq
+    return [seq[1], seq[0]] + list(seq[2:])
+
+
 def build_fourier_pos_embed(
     feat_shape: List[int],
     bands: Optional[torch.Tensor] = None,
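
Note: the new swap_shape_xy helper just exchanges the first two entries of a shape list and passes any trailing dims through. A quick sketch of its behavior (REPL values are illustrative, not from the PR):

    swap_shape_xy([16, 24])       # -> [24, 16]
    swap_shape_xy([16, 24, 8])    # -> [24, 16, 8], trailing dims untouched
    swap_shape_xy([16])           # -> [16], fewer than two dims returned as-is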
@@ -134,6 +140,11 @@ def build_fourier_pos_embed(
     if dtype is None:
         dtype = bands.dtype
 
+    if grid_indexing == 'xy':
+        feat_shape = swap_shape_xy(feat_shape)
+        if ref_feat_shape is not None:
+            ref_feat_shape = swap_shape_xy(ref_feat_shape)
+
     if in_pixels:
         t = [
             torch.linspace(-1., 1., steps=s, device=device, dtype=torch.float32)
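
Why the swap is needed here: torch.meshgrid with indexing='xy' returns its first two outputs transposed relative to 'ij', so the per-dim sizes must be supplied in (W, H) order for the stacked grid to come back as (H, W). A stripped-down sketch of the idea, not the exact function body (H=2, W=3 assumed):

    import torch

    # per-dim position vectors, sizes pre-swapped to (W, H) = (3, 2)
    t = [torch.linspace(-1., 1., steps=s) for s in (3, 2)]
    grid = torch.stack(torch.meshgrid(*t, indexing='xy'), dim=-1)
    print(grid.shape)  # torch.Size([2, 3, 2]) -> (H, W, num_dims), matching the feature map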
@@ -516,15 +527,16 @@ def init_random_2d_freqs(
 @torch.fx.wrap
 @register_notrace_function
 def get_mixed_grid(
-    height: int,
-    width: int,
+    shape: List[int],
     grid_indexing: str = 'ij',
     device: Optional[torch.device] = None,
     dtype: torch.dtype = torch.float32,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
+    if grid_indexing == 'xy':
+        shape = swap_shape_xy(shape)
     x_pos, y_pos = torch.meshgrid(
-        torch.arange(height, dtype=dtype, device=device),
-        torch.arange(width, dtype=dtype, device=device),
+        torch.arange(shape[0], dtype=dtype, device=device),
+        torch.arange(shape[1], dtype=dtype, device=device),
         indexing=grid_indexing,
     )
     t_x = x_pos.flatten()
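
With the list-based signature, a rough check of the fixed behavior might look like this (shape values are illustrative, not from the PR):

    t_x, t_y = get_mixed_grid([8, 12], grid_indexing='xy')  # (H, W) = (8, 12)
    print(t_x.numel())                     # 96 -> still H * W positions once flattened
    print(int(t_x.max()), int(t_y.max()))  # 11, 7 -> x spans the W axis, y spans H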
Expand Down Expand Up @@ -599,8 +611,7 @@ def __init__(
if feat_shape is not None:
# cache pre-computed grid
t_x, t_y = get_mixed_grid(
feat_shape[0],
feat_shape[1],
feat_shape,
grid_indexing=grid_indexing,
device=self.freqs.device
)
@@ -620,8 +631,7 @@ def get_embed(self, shape: Optional[List[int]] = None) -> torch.Tensor:
         """
         if shape is not None:
             t_x, t_y = get_mixed_grid(
-                shape[0],
-                shape[1],
+                shape,
                 grid_indexing=self.grid_indexing,
                 device=self.freqs.device
             )
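
For context on the bug being fixed: previously get_mixed_grid passed height and width straight into torch.meshgrid, so indexing='xy' produced transposed grids on non-square feature maps. A before/after sketch (H=8, W=12 assumed):

    import torch

    H, W = 8, 12
    # pre-fix order: (H, W) fed directly with indexing='xy'
    x_old, _ = torch.meshgrid(torch.arange(H), torch.arange(W), indexing='xy')
    print(x_old.shape)  # torch.Size([12, 8]) -> transposed vs the (H, W) feature map

    # post-fix order: dims swapped first, so the grid lands back on (H, W)
    x_new, _ = torch.meshgrid(torch.arange(W), torch.arange(H), indexing='xy')
    print(x_new.shape)  # torch.Size([8, 12]) -> aligned with the feature map again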