[Feature] Loading R3M and VIP from ResNet (#863)

vmoens · web-flow · commit f741debedb42 · 2023-01-24T22:04:45.000Z
diff --git a/test/test_transforms.py b/test/test_transforms.py
@@ -1677,6 +1677,10 @@ def test_pin_mem(self, device):
         td = TensorDict(
             {key: torch.randn(3) for key in ["a", "b", "c"]}, [], device=device
         )
+        if device.type == "cuda":
+            with pytest.raises(RuntimeError, match="cannot pin"):
+                pin_mem(td)
+            return
         pin_mem(td)
         for item in td.values():
             assert item.is_pinned
diff --git a/torchrl/envs/transforms/r3m.py b/torchrl/envs/transforms/r3m.py
@@ -34,6 +34,22 @@
 except ImportError:
     _has_tv = False
 
+try:
+    from torchvision.models import ResNet18_Weights, ResNet34_Weights, ResNet50_Weights
+    from torchvision.models._api import WeightsEnum
+except ImportError:
+
+    class WeightsEnum:  # noqa: D101
+        # stand-in for annotations when torchvision's weights API is unavailable
+        pass
+
+
+R3M_MODEL_MAP = {
+    "resnet18": "r3m_18",
+    "resnet34": "r3m_34",
+    "resnet50": "r3m_50",
+}
+
 
 class _R3MNet(Transform):
 
@@ -45,18 +61,19 @@ def __init__(self, in_keys, out_keys, model_name, del_keys: bool = True):
                 "Tried to instantiate R3M without torchvision. Make sure you have "
                 "torchvision installed in your environment."
             )
+        self.model_name = model_name
         if model_name == "resnet18":
-            self.model_name = "r3m_18"
+            # checkpoint name "r3m_18" is now looked up in R3M_MODEL_MAP
             self.outdim = 512
-            convnet = models.resnet18(pretrained=False)
+            convnet = models.resnet18(None)
         elif model_name == "resnet34":
-            self.model_name = "r3m_34"
+            # checkpoint name "r3m_34" is now looked up in R3M_MODEL_MAP
             self.outdim = 512
-            convnet = models.resnet34(pretrained=False)
+            convnet = models.resnet34(None)
         elif model_name == "resnet50":
-            self.model_name = "r3m_50"
+            # checkpoint name "r3m_50" is now looked up in R3M_MODEL_MAP
             self.outdim = 2048
-            convnet = models.resnet50(pretrained=False)
+            convnet = models.resnet50(None)
         else:
             raise NotImplementedError(
                 f"model {model_name} is currently not supported by R3M"
@@ -123,8 +140,34 @@ def _load_weights(model_name, r3m_instance, dir_prefix):
         state_dict = td_flatten.to_dict()
         r3m_instance.convnet.load_state_dict(state_dict)
 
-    def load_weights(self, dir_prefix=None):
-        self._load_weights(self.model_name, self, dir_prefix)
+    def load_weights(self, dir_prefix=None, tv_weights=None):
+        if dir_prefix is not None and tv_weights is not None:
+            raise RuntimeError(
+                "torchvision weights API does not allow for custom download path."
+            )
+        elif tv_weights is not None:
+            model_name = self.model_name
+            if model_name == "resnet18":
+                if isinstance(tv_weights, str):
+                    tv_weights = getattr(ResNet18_Weights, tv_weights)
+                convnet = models.resnet18(weights=tv_weights)
+            elif model_name == "resnet34":
+                if isinstance(tv_weights, str):
+                    tv_weights = getattr(ResNet34_Weights, tv_weights)
+                convnet = models.resnet34(weights=tv_weights)
+            elif model_name == "resnet50":
+                if isinstance(tv_weights, str):
+                    tv_weights = getattr(ResNet50_Weights, tv_weights)
+                convnet = models.resnet50(weights=tv_weights)
+            else:
+                raise NotImplementedError(
+                    f"model {model_name} is currently not supported by R3M"
+                )
+            convnet.fc = Identity()
+            self.convnet.load_state_dict(convnet.state_dict())
+        else:
+            model_name = R3M_MODEL_MAP[self.model_name]
+            self._load_weights(model_name, self, dir_prefix)
 
 
 def _init_first(fun):
@@ -154,7 +197,7 @@ class R3MTransform(Compose):
     can ensure that the following code snippet works as expected:
 
     Examples:
-        >>> transform = R3MTransform("resenet50", in_keys=["pixels"])
+        >>> transform = R3MTransform("resnet50", in_keys=["pixels"])
         >>> env.append_transform(transform)
         >>> # the forward method will first call _init which will look at env.observation_spec
         >>> env.reset()
@@ -170,8 +213,13 @@ class R3MTransform(Compose):
         stack_images (bool, optional): if False, the images given in the :obj:`in_keys`
              argument will be treaded separetely and each will be given a single,
              separated entry in the output tensordict. Defaults to :obj:`True`.
-        download (bool, optional): if True, the weights will be downloaded using
-            the torch.hub download API (i.e. weights will be cached for future use).
+        download (bool, torchvision Weights config or corresponding string):
+            if True, the weights will be downloaded using the torch.hub download
+            API (i.e. weights will be cached for future use).
+            These weights are the original weights from the R3M publication.
+            If the torchvision weights are needed, there are two ways they can be
+            obtained: :obj:`download=ResNet50_Weights.IMAGENET1K_V1` or :obj:`download="IMAGENET1K_V1"`
+            where :obj:`ResNet50_Weights` can be imported via :obj:`from torchvision.models import resnet50, ResNet50_Weights`.
             Defaults to False.
         download_path (str, optional): path where to download the models.
             Default is None (cache path determined by torch.hub utils).
@@ -194,7 +242,7 @@ def __init__(
         out_keys: List[str] = None,
         size: int = 244,
         stack_images: bool = True,
-        download: bool = False,
+        download: Union[bool, WeightsEnum, str] = False,
         download_path: Optional[str] = None,
         tensor_pixels_keys: List[str] = None,
     ):
@@ -302,8 +350,12 @@ def _init(self):
 
         for transform in transforms:
             self.append(transform)
-        if self.download:
-            self[-1].load_weights(dir_prefix=self.download_path)
+        if self.download is True:
+            self[-1].load_weights(dir_prefix=self.download_path, tv_weights=None)
+        elif self.download:
+            self[-1].load_weights(
+                dir_prefix=self.download_path, tv_weights=self.download
+            )
 
         if self._device is not None:
             self.to(self._device)
diff --git a/torchrl/envs/transforms/vip.py b/torchrl/envs/transforms/vip.py
@@ -34,6 +34,20 @@
 except ImportError:
     _has_tv = False
 
+try:
+    from torchvision.models import ResNet50_Weights
+    from torchvision.models._api import WeightsEnum
+except ImportError:
+
+    class WeightsEnum:  # noqa: D101
+        # stand-in for annotations when torchvision's weights API is unavailable
+        pass
+
+
+VIP_MODEL_MAP = {
+    "resnet50": "vip_50",
+}
+
 
 class _VIPNet(Transform):
 
@@ -45,8 +59,8 @@ def __init__(self, in_keys, out_keys, model_name="resnet50", del_keys: bool = Tr
                 "Tried to instantiate VIP without torchvision. Make sure you have "
                 "torchvision installed in your environment."
             )
+        self.model_name = model_name
         if model_name == "resnet50":
-            self.model_name = "vip_50"
             self.outdim = 2048
             convnet = models.resnet50(pretrained=False)
             convnet.fc = torch.nn.Linear(self.outdim, 1024)
@@ -98,8 +112,8 @@ def transform_observation_spec(self, observation_spec: TensorSpec) -> TensorSpec
 
     @staticmethod
     def _load_weights(model_name, vip_instance, dir_prefix):
-        if model_name not in ("vip_50"):
-            raise ValueError("model_name should be 'vip_50'")
+        if model_name not in ("vip_50",):
+            raise ValueError(f"model_name should be 'vip_50', got {model_name}")
         url = "https://pytorch.s3.amazonaws.com/models/rl/vip/model.pt"
         d = load_state_dict_from_url(
             url,
@@ -112,8 +126,27 @@ def _load_weights(model_name, vip_instance, dir_prefix):
         state_dict = td_flatten.to_dict()
         vip_instance.convnet.load_state_dict(state_dict)
 
-    def load_weights(self, dir_prefix=None):
-        self._load_weights(self.model_name, self, dir_prefix)
+    def load_weights(self, dir_prefix=None, tv_weights=None):
+        if dir_prefix is not None and tv_weights is not None:
+            raise RuntimeError(
+                "torchvision weights API does not allow for custom download path."
+            )
+        elif tv_weights is not None:
+            model_name = self.model_name
+            if model_name == "resnet50":
+                if isinstance(tv_weights, str):
+                    tv_weights = getattr(ResNet50_Weights, tv_weights)
+                convnet = models.resnet50(weights=tv_weights)
+            else:
+                raise NotImplementedError(
+                    f"model {model_name} is currently not supported by VIP"
+                )
+            convnet.fc = torch.nn.Linear(self.outdim, 1024)
+            self.convnet.load_state_dict(convnet.state_dict())
+
+        else:
+            model_name = VIP_MODEL_MAP[self.model_name]
+            self._load_weights(model_name, self, dir_prefix)
 
 
 def _init_first(fun):
@@ -145,8 +178,13 @@ class VIPTransform(Compose):
         stack_images (bool, optional): if False, the images given in the :obj:`in_keys`
              argument will be treaded separetely and each will be given a single,
              separated entry in the output tensordict. Defaults to :obj:`True`.
-        download (bool, optional): if True, the weights will be downloaded using
-            the torch.hub download API (i.e. weights will be cached for future use).
+        download (bool, torchvision Weights config or corresponding string):
+            if True, the weights will be downloaded using the torch.hub download
+            API (i.e. weights will be cached for future use).
+            These weights are the original weights from the VIP publication.
+            If the torchvision weights are needed, there are two ways they can be
+            obtained: :obj:`download=ResNet50_Weights.IMAGENET1K_V1` or :obj:`download="IMAGENET1K_V1"`
+            where :obj:`ResNet50_Weights` can be imported via :obj:`from torchvision.models import resnet50, ResNet50_Weights`.
             Defaults to False.
         download_path (str, optional): path where to download the models.
             Default is None (cache path determined by torch.hub utils).
@@ -169,7 +207,7 @@ def __init__(
         out_keys: List[str] = None,
         size: int = 244,
         stack_images: bool = True,
-        download: bool = False,
+        download: Union[bool, WeightsEnum, str] = False,
         download_path: Optional[str] = None,
         tensor_pixels_keys: List[str] = None,
     ):
@@ -275,34 +313,18 @@ def _init(self):
 
         for transform in transforms:
             self.append(transform)
-        if self.download:
-            self[-1].load_weights(dir_prefix=self.download_path)
+        if self.download is True:
+            self[-1].load_weights(dir_prefix=self.download_path, tv_weights=None)
+        elif self.download:
+            self[-1].load_weights(
+                dir_prefix=self.download_path, tv_weights=self.download
+            )
 
         if self._device is not None:
             self.to(self._device)
         if self._dtype is not None:
             self.to(self._dtype)
 
-    @property
-    def is_3d(self):
-        """Whether the input image has 3 dims (no-batched) or more.
-
-        If no parent environment exists, it defaults to True.
-
-        The main usage is this: if there are more than one image and they need to be
-        stacked, we must know if the input image has dim 3 or 4. If 3, we need to unsqueeze
-        before stacking. If 4, we can cat along the first dimension.
-
-        """
-        if self._is_3d is None:
-            parent = self.parent
-            if parent is None:
-                return True
-            for key in parent.observation_spec.keys():
-                self._is_3d = len(parent.observation_spec[key].shape) == 3
-                break
-        return self._is_3d
-
     def to(self, dest: Union[DEVICE_TYPING, torch.dtype]):
         if isinstance(dest, torch.dtype):
             self._dtype = dest