Commit 49af5da

Authored by adityagoel4512 (Adi Goel), with co-authors vmoens and sosmond
[Feature] Benchmark storage types (#633)
* Distributed replay buffer prototype
* Fixes comment issue
* Makes ReplayBufferNode subclass TensorDictReplayBuffer
* aha
* amend
* bf
* Fixes print statements and removes redundant Collector arg
* Fixes print statements and removes redundant Collector arg
* amend
* amend
* Timing larger tensordict transfers over torch rpc using multiple storage types
* init
* amend
* amend
* update_ and make sure content is read
* amend
* amend
* Fixes list storage arg
* Moves benchmark to new top-level directory and adds note in documentation about speed up using MemmapTensor
* Removes analysis.ipynb
* Removes accidental edit to tensordict.py
* Updates data.rst text
* Removes redundant variable
* Removes hack to get list read to work
* replace assert_allclose with assert_close (#644)
* Adds small note illustrating example usage

Co-authored-by: Adi Goel <adityagoel@fb.com>
Co-authored-by: vmoens <vincentmoens@gmail.com>
Co-authored-by: sosmond <35877775+sosmond@users.noreply.github.com>
1 parent: ada0fcd · commit: 49af5da

File tree (3 files changed: +202 -2 lines)

- benchmarks/storage/benchmark_sample_latency_over_rpc.py
- docs/source/reference/data.rst
- torchrl/data/replay_buffers/rb_prototype.py
benchmarks/storage/benchmark_sample_latency_over_rpc.py (new file)

Lines changed: 187 additions & 0 deletions

@@ -0,0 +1,187 @@
"""
Sample latency benchmarking (using RPC)
=======================================

A rough benchmark of sample latency for different storage types over the network, using `torch.rpc`.
Run this script with the --rank=0 and --rank=1 flags set in two separate processes; these ranks correspond to the trainer worker and the buffer worker respectively, and both need to be initialised.
E.g., to benchmark LazyMemmapStorage, run the following commands in either two separate shells or via multiprocessing:

- python3 benchmark_sample_latency_over_rpc.py --rank=0 --storage=LazyMemmapStorage
- python3 benchmark_sample_latency_over_rpc.py --rank=1 --storage=LazyMemmapStorage

This code is based on examples/distributed/distributed_replay_buffer.py.
"""
import argparse
import os
import pickle
import sys
import time
import timeit
from datetime import datetime

import torch
import torch.distributed.rpc as rpc
from torchrl.data.replay_buffers.rb_prototype import RemoteTensorDictReplayBuffer
from torchrl.data.replay_buffers.samplers import RandomSampler
from torchrl.data.replay_buffers.storages import (
    LazyMemmapStorage,
    LazyTensorStorage,
    ListStorage,
)
from torchrl.data.replay_buffers.writers import RoundRobinWriter
from torchrl.data.tensordict import TensorDict

RETRY_LIMIT = 2
RETRY_DELAY_SECS = 3
REPLAY_BUFFER_NODE = "ReplayBuffer"
TRAINER_NODE = "Trainer"
TENSOR_SIZE = 3 * 86 * 86
BUFFER_SIZE = 1001
BATCH_SIZE = 256
REPEATS = 1000

storage_options = {
    "LazyMemmapStorage": LazyMemmapStorage,
    "LazyTensorStorage": LazyTensorStorage,
    "ListStorage": ListStorage,
}

storage_arg_options = {
    "LazyMemmapStorage": dict(scratch_dir="/tmp/", device=torch.device("cpu")),
    "LazyTensorStorage": dict(),
    "ListStorage": dict(),
}

parser = argparse.ArgumentParser(
    description="RPC Replay Buffer Example",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)

parser.add_argument(
    "--rank",
    type=int,
    default=-1,
    help="Node Rank [0 = Dummy Trainer, 1 = Replay Buffer]",
)

parser.add_argument(
    "--storage",
    type=str,
    default="LazyMemmapStorage",
    help="Storage type [LazyMemmapStorage, LazyTensorStorage, ListStorage]",
)


class DummyTrainerNode:
    def __init__(self) -> None:
        self.id = rpc.get_worker_info().id
        self.replay_buffer = self._create_replay_buffer()
        self._ret = None

    def train(self, batch_size: int) -> float:
        # Time a single remote sample() round trip.
        start_time = timeit.default_timer()
        ret = rpc.rpc_sync(
            self.replay_buffer.owner(),
            ReplayBufferNode.sample,
            args=(self.replay_buffer, batch_size),
        )
        if storage_type == "ListStorage":
            self._ret = ret[0]
        else:
            if self._ret is None:
                self._ret = ret
            else:
                self._ret[0].update_(ret[0])
        # make sure the content is actually read, not just lazily referenced
        self._ret[0]["observation"] + 1
        self._ret[0]["next_observation"] + 1
        return timeit.default_timer() - start_time

    def _create_replay_buffer(self) -> rpc.RRef:
        # Retry until the buffer worker is up and reachable.
        while True:
            try:
                replay_buffer_info = rpc.get_worker_info(REPLAY_BUFFER_NODE)
                buffer_rref = rpc.remote(
                    replay_buffer_info, ReplayBufferNode, args=(1000000,)
                )
                print(f"Connected to replay buffer {replay_buffer_info}")
                return buffer_rref
            except Exception:
                print("Failed to connect to replay buffer")
                time.sleep(RETRY_DELAY_SECS)


class ReplayBufferNode(RemoteTensorDictReplayBuffer):
    def __init__(self, capacity: int):
        # `storage_type` is a module-level global set in __main__.
        super().__init__(
            storage=storage_options[storage_type](
                max_size=capacity, **storage_arg_options[storage_type]
            ),
            sampler=RandomSampler(),
            writer=RoundRobinWriter(),
            collate_fn=lambda x: x,
        )
        # Pre-fill the buffer with random data so sampling can start immediately.
        tds = TensorDict(
            {
                "observation": torch.randn(
                    BUFFER_SIZE,
                    TENSOR_SIZE,
                ),
                "next_observation": torch.randn(
                    BUFFER_SIZE,
                    TENSOR_SIZE,
                ),
            },
            batch_size=[BUFFER_SIZE],
        )
        self.extend(tds)


if __name__ == "__main__":
    args = parser.parse_args()
    rank = args.rank
    storage_type = args.storage

    print(f"Rank: {rank}; Storage: {storage_type}")

    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29500"
    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
    options = rpc.TensorPipeRpcBackendOptions(
        num_worker_threads=16, init_method="tcp://localhost:10002", rpc_timeout=120
    )
    if rank == 0:
        # rank 0 is the trainer
        rpc.init_rpc(
            TRAINER_NODE,
            rank=rank,
            backend=rpc.BackendType.TENSORPIPE,
            rpc_backend_options=options,
        )
        trainer = DummyTrainerNode()
        results = []
        for i in range(REPEATS):
            result = trainer.train(batch_size=BATCH_SIZE)
            if i == 0:
                # discard the first iteration as warm-up
                continue
            results.append(result)
            print(i, results[-1])

        with open(
            f'./benchmark_{datetime.now().strftime("%d-%m-%Y%H:%M:%S")};batch_size={BATCH_SIZE};tensor_size={TENSOR_SIZE};repeat={REPEATS};storage={storage_type}.pkl',
            "wb+",
        ) as f:
            pickle.dump(results, f)

        tensor_results = torch.tensor(results)
        print(f"Mean: {torch.mean(tensor_results)}")
        breakpoint()  # keeps the process alive for interactive inspection
    elif rank == 1:
        # rank 1 is the replay buffer
        # replay buffer waits passively for construction instructions from trainer node
        rpc.init_rpc(
            REPLAY_BUFFER_NODE,
            rank=rank,
            backend=rpc.BackendType.TENSORPIPE,
            rpc_backend_options=options,
        )
        breakpoint()  # keeps the process alive for interactive inspection
    else:
        sys.exit(1)
    rpc.shutdown()
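For convenience, here is a small launcher sketch (not part of the commit) that starts both ranks from a single entry point, as the docstring's multiprocessing option suggests. The script path is an assumption based on the benchmarks/storage directory referenced in the docs below; note that the breakpoint() calls in the benchmark script will pause each process, so remove them for unattended runs.

    # Hypothetical launcher for the benchmark above: spawns the trainer (rank 0)
    # and the buffer worker (rank 1) as two subprocesses.
    import subprocess
    import sys

    SCRIPT = "benchmarks/storage/benchmark_sample_latency_over_rpc.py"  # assumed path

    def launch(storage: str = "LazyMemmapStorage") -> None:
        procs = [
            subprocess.Popen(
                [sys.executable, SCRIPT, f"--rank={rank}", f"--storage={storage}"]
            )
            for rank in (0, 1)
        ]
        for proc in procs:
            proc.wait()

    if __name__ == "__main__":
        launch()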

docs/source/reference/data.rst

Lines changed: 14 additions & 0 deletions
@@ -43,6 +43,20 @@ We also provide a prototyped composable replay buffer.
     torchrl.data.replay_buffers.writers.Writer
     torchrl.data.replay_buffers.writers.RoundRobinWriter
 
+Storage choice strongly affects replay buffer sampling latency, especially in distributed reinforcement learning settings with larger data volumes.
+:class:`LazyMemmapStorage` is highly advised in distributed settings with shared storage, due to the lower serialisation cost of MemmapTensors and the ability to specify file storage locations, which improves recovery from node failures.
+The following mean sampling-latency speed-ups over :class:`ListStorage` were measured in the rough benchmark at https://github.com/pytorch/rl/tree/main/benchmarks/storage.
+
++-------------------------------+-----------+
+| Storage Type                  | Speed up  |
+|                               |           |
++===============================+===========+
+| :class:`ListStorage`          | 1x        |
++-------------------------------+-----------+
+| :class:`LazyTensorStorage`    | 1.83x     |
++-------------------------------+-----------+
+| :class:`LazyMemmapStorage`    | 3.44x     |
++-------------------------------+-----------+
 
 
 TensorDict
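To make the documentation note concrete, here is a minimal usage sketch (not part of the diff). It reuses the prototype classes imported by the benchmark script above; the constructor signature and the tuple returned by sample() are assumptions inferred from that script:

    # Sketch: a replay buffer backed by LazyMemmapStorage, as the docs advise
    # for distributed settings. Treat the exact API as an assumption about the
    # prototype module.
    import torch
    from torchrl.data.replay_buffers.rb_prototype import TensorDictReplayBuffer
    from torchrl.data.replay_buffers.samplers import RandomSampler
    from torchrl.data.replay_buffers.storages import LazyMemmapStorage
    from torchrl.data.replay_buffers.writers import RoundRobinWriter
    from torchrl.data.tensordict import TensorDict

    buffer = TensorDictReplayBuffer(
        storage=LazyMemmapStorage(
            max_size=10_000,
            scratch_dir="/tmp/",  # where the memmap files are written
            device=torch.device("cpu"),
        ),
        sampler=RandomSampler(),
        writer=RoundRobinWriter(),
        collate_fn=lambda x: x,
    )
    data = TensorDict(
        {"observation": torch.randn(100, 3 * 86 * 86)},
        batch_size=[100],
    )
    buffer.extend(data)
    batch = buffer.sample(32)[0]  # sample() returns a tuple; first item is the data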

torchrl/data/replay_buffers/rb_prototype.py

Lines changed: 1 addition & 2 deletions
@@ -175,7 +175,6 @@ def collate_fn(x):
                 return stack_td(x, 0, contiguous=True)
 
             kw["collate_fn"] = collate_fn
-
         super().__init__(**kw)
         self.priority_key = priority_key
 
@@ -232,7 +231,7 @@ def extend(self, tensordicts: Union[List, TensorDictBase]) -> torch.Tensor:
             torch.tensor(index, dtype=torch.int, device=stacked_td.device),
             inplace=True,
         )
-        self.update_tensordict_priority(tensordicts)
+        self.update_tensordict_priority(stacked_td)
         return index
 
     def update_tensordict_priority(self, data: TensorDictBase) -> None:
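The one-line change matters because update_tensordict_priority needs the "index" key that extend has just written into stacked_td; the raw tensordicts argument never received it. A toy illustration of the pattern (plain Python, not torchrl code):

    # Toy illustration (not the torchrl implementation) of the extend() fix:
    # priorities must be updated from the object that carries the freshly
    # written "index", i.e. the stacked batch, not the raw input elements.
    import torch

    class ToyBuffer:
        def __init__(self) -> None:
            self.priorities = {}

        def _write(self, batch: dict) -> list:
            # Pretend the writer returns the storage slots it used.
            return list(range(batch["observation"].shape[0]))

        def update_priority(self, batch: dict) -> None:
            for idx in batch["index"].tolist():
                self.priorities[idx] = 1.0  # default priority

        def extend(self, elements: list) -> list:
            # Stack incoming elements into one batch, as rb_prototype does.
            batch = {"observation": torch.stack([e["observation"] for e in elements])}
            index = self._write(batch)
            batch["index"] = torch.tensor(index, dtype=torch.int)
            # The fix: read the index from the annotated batch (stacked_td),
            # not from `elements`, which has no "index" key.
            self.update_priority(batch)
            return index

    buffer = ToyBuffer()
    buffer.extend([{"observation": torch.randn(4)} for _ in range(3)])
    print(buffer.priorities)  # {0: 1.0, 1: 1.0, 2: 1.0}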
