Commit 4cb3dc3

huge refactor

1 parent 3d5dd1a commit 4cb3dc3

23 files changed: +5794 additions, -5120 deletions

test/services/test_python_executor_service.py
Lines changed: 9 additions & 7 deletions

@@ -73,7 +73,7 @@ def test_service_execution(self, ray_init):
     result = x + y
     print(f"Result: {result}")
 """
-result = ray.get(executor.execute.remote(code), timeout=2)
+result = ray.get(executor.execute.remote(code), timeout=10)

 assert result["success"] is True
 assert "Result: 30" in result["stdout"]

@@ -101,7 +101,7 @@ def test_service_execution_error(self, ray_init):

 # Execute code with an error
 code = "raise ValueError('Test error')"
-result = ray.get(executor.execute.remote(code), timeout=2)
+result = ray.get(executor.execute.remote(code), timeout=10)

 assert result["success"] is False
 assert "ValueError: Test error" in result["stderr"]

@@ -119,7 +119,7 @@ def test_multiple_executions(self, ray_init):
     "python_executor",
     PythonExecutorService,
     pool_size=4,
-    timeout=5.0,
+    timeout=10.0,
     num_cpus=4,
     max_concurrency=4,
 )

@@ -132,14 +132,16 @@ def test_multiple_executions(self, ray_init):
     code = f"print('Execution {i}')"
     futures.append(executor.execute.remote(code))

-# Wait for all to complete
-results = ray.get(futures, timeout=5)
+# Wait for all to complete with longer timeout
+results = ray.get(futures, timeout=30)

 # All should succeed
 assert len(results) == 8
 for i, result in enumerate(results):
-    assert result["success"] is True
-    assert f"Execution {i}" in result["stdout"]
+    assert result["success"] is True, f"Execution {i} failed: {result}"
+    assert (
+        f"Execution {i}" in result["stdout"]
+    ), f"Expected 'Execution {i}' in stdout, got: {result['stdout']!r}"

 finally:
     services.reset()

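The changes above mostly raise Ray timeouts and attach diagnostic messages to the assertions. As a reminder of the underlying pattern, here is a minimal, self-contained sketch that gathers several Ray futures with an explicit timeout and asserts with informative messages; the run_snippet task is a hypothetical stand-in for PythonExecutorService.execute, not code from this commit.

import ray

ray.init(ignore_reinit_error=True)


@ray.remote
def run_snippet(i: int) -> dict:
    # Hypothetical stand-in for PythonExecutorService.execute: pretend to run
    # a code snippet and report its output plus a success flag.
    return {"success": True, "stdout": f"Execution {i}"}


# Submit several executions, then gather them with a generous timeout so that
# slow scheduling does not show up as a spurious failure.
futures = [run_snippet.remote(i) for i in range(8)]
results = ray.get(futures, timeout=30)

assert len(results) == 8
for i, result in enumerate(results):
    # Attach the offending payload to the assertion message to ease debugging.
    assert result["success"] is True, f"Execution {i} failed: {result}"
    assert f"Execution {i}" in result["stdout"], result["stdout"]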
test/test_collector.py
Lines changed: 112 additions & 13 deletions

@@ -13,11 +13,14 @@
 import subprocess
 import sys
 import time
+from contextlib import nullcontext
 from unittest.mock import patch

 import numpy as np
 import pytest
 import torch
+
+import torchrl.collectors._runner
 from packaging import version
 from tensordict import (
     assert_allclose_td,

@@ -33,7 +36,6 @@
 TensorDictSequential,
 )
 from torch import nn
-
 from torchrl._utils import (
     _make_ordinal_device,
     _replace_last,

@@ -48,7 +50,7 @@
 SyncDataCollector,
 WeightUpdaterBase,
 )
-from torchrl.collectors.collectors import _Interruptor
+from torchrl.collectors._constants import _Interruptor

 from torchrl.collectors.utils import split_trajectories
 from torchrl.data import (

@@ -1487,12 +1489,14 @@ def env_fn(seed):
 assert_allclose_td(data10, data20)

 @pytest.mark.parametrize("use_async", [False, True])
-@pytest.mark.parametrize("cudagraph", [False, True])
+@pytest.mark.parametrize(
+    "cudagraph", [False, True] if torch.cuda.is_available() else [False]
+)
 @pytest.mark.parametrize(
     "weight_sync_scheme",
     [None, MultiProcessWeightSyncScheme, SharedMemWeightSyncScheme],
 )
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device found")
+# @pytest.mark.skipif(not torch.cuda.is_available() and not torch.mps.is_available(), reason="no cuda/mps device found")
 def test_update_weights(self, use_async, cudagraph, weight_sync_scheme):
     def create_env():
         return ContinuousActionVecMockEnv()

@@ -1509,11 +1513,12 @@ def create_env():
 kwargs = {}
 if weight_sync_scheme is not None:
     kwargs["weight_sync_schemes"] = {"policy": weight_sync_scheme()}
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
 collector = collector_class(
     [create_env] * 3,
     policy=policy,
-    device=[torch.device("cuda:0")] * 3,
-    storing_device=[torch.device("cuda:0")] * 3,
+    device=[torch.device(device)] * 3,
+    storing_device=[torch.device(device)] * 3,
     frames_per_batch=20,
     cat_results="stack",
     cudagraph_policy=cudagraph,

@@ -1544,7 +1549,9 @@ def create_env():
 # check they don't match
 for worker in range(3):
     for k in state_dict[f"worker{worker}"]["policy_state_dict"]:
-        with pytest.raises(AssertionError):
+        with pytest.raises(
+            AssertionError
+        ) if torch.cuda.is_available() else nullcontext():
             torch.testing.assert_close(
                 state_dict[f"worker{worker}"]["policy_state_dict"][k],
                 policy_state_dict[k].cpu(),

@@ -2401,7 +2408,9 @@ def test_auto_wrap_error(self, collector_class, env_maker, num_envs):
 policy = UnwrappablePolicy(out_features=env_maker().action_spec.shape[-1])
 with pytest.raises(
     TypeError,
-    match=("Arguments to policy.forward are incompatible with entries in"),
+    match=(
+        "Arguments to policy.forward are incompatible with entries in|Failed to wrap the policy. If the policy needs to be trusted, set trust_policy=True."
+    ),
 ):
     collector_class(
         **self._create_collector_kwargs(

@@ -2980,6 +2989,94 @@ def test_param_sync_mixed_device(
 col.shutdown()
 del col

+@pytest.mark.skipif(
+    not torch.cuda.is_available() or torch.cuda.device_count() < 3,
+    reason="requires at least 3 CUDA devices",
+)
+@pytest.mark.parametrize(
+    "weight_sync_scheme",
+    [SharedMemWeightSyncScheme, MultiProcessWeightSyncScheme],
+)
+def test_shared_device_weight_update(self, weight_sync_scheme):
+    """Test that weight updates work correctly when multiple workers share the same device.
+
+    This test specifically validates the per-worker queue implementation in SharedMemWeightSyncScheme.
+    When workers 0 and 2 share cuda:2, each should receive its own copy of the weights through
+    dedicated queues, preventing race conditions that could occur with a single shared queue.
+    """
+    # Create policy on cuda:0
+    policy = TensorDictModule(
+        nn.Linear(7, 7, device="cuda:0"),
+        in_keys=["observation"],
+        out_keys=["action"],
+    )
+
+    def make_env():
+        return ContinuousActionVecMockEnv()
+
+    # Create collector with workers on cuda:2, cuda:1, cuda:2
+    # Workers 0 and 2 share cuda:2 - this is the key test case
+    collector = MultiaSyncDataCollector(
+        [make_env, make_env, make_env],
+        policy=policy,
+        frames_per_batch=30,
+        total_frames=300,
+        device=["cuda:2", "cuda:1", "cuda:2"],
+        storing_device=["cuda:2", "cuda:1", "cuda:2"],
+        weight_sync_schemes={"policy": weight_sync_scheme()},
+    )
+
+    try:
+        # Collect first batch to initialize workers
+        for _ in collector:
+            break
+
+        # Get initial weights
+        old_weight = policy.module.weight.data.clone()
+
+        # Modify policy weights on cuda:0
+        for p in policy.parameters():
+            p.data += torch.randn_like(p)
+
+        new_weight = policy.module.weight.data.clone()
+        assert not torch.allclose(
+            old_weight, new_weight
+        ), "Weights should have changed"
+
+        # Update weights - this should propagate to all workers via their dedicated queues
+        collector.update_policy_weights_()
+
+        # Collect more batches to ensure weights are propagated
+        for i, _ in enumerate(collector):
+            if i >= 2:
+                break
+
+        # Get state dict from all workers
+        state_dict = collector.state_dict()
+
+        # Verify all workers have the new weights, including both workers on cuda:2
+        for worker_idx in range(3):
+            worker_key = f"worker{worker_idx}"
+            assert (
+                "policy_state_dict" in state_dict[worker_key]
+            ), f"Worker {worker_idx} should have policy_state_dict"
+            worker_weight = state_dict[worker_key]["policy_state_dict"][
+                "module.weight"
+            ]
+            torch.testing.assert_close(
+                worker_weight.cpu(),
+                new_weight.cpu(),
+                msg=(
+                    f"Worker {worker_idx} weights don't match expected weights. "
+                    f"Workers 0 and 2 share device cuda:2, worker 1 is on cuda:1. "
+                    f"This test validates that the per-worker queue system correctly "
+                    f"distributes weights even when multiple workers share a device."
+                ),
+            )
+    finally:
+        collector.shutdown()
+        del collector
+

 class TestAggregateReset:
     def test_aggregate_reset_to_root(self):

@@ -3176,11 +3273,11 @@ class TestLibThreading:
 reason="setting different threads across workers can randomly fail on OSX.",
 )
 def test_num_threads(self):
-    from torchrl.collectors import collectors
+    pass

-    _main_async_collector_saved = collectors._main_async_collector
-    collectors._main_async_collector = decorate_thread_sub_func(
-        collectors._main_async_collector, num_threads=3
+    _main_async_collector_saved = torchrl.collectors._runner._main_async_collector
+    torchrl.collectors._runner._main_async_collector = decorate_thread_sub_func(
+        torchrl.collectors._runner._main_async_collector, num_threads=3
     )
     num_threads = torch.get_num_threads()
     try:

@@ -3204,7 +3301,9 @@ def test_num_threads(self):
 except Exception:
     torchrl_logger.info("Failed to shut down collector")
 # reset vals
-collectors._main_async_collector = _main_async_collector_saved
+torchrl.collectors._runner._main_async_collector = (
+    _main_async_collector_saved
+)
 torch.set_num_threads(num_threads)

 @pytest.mark.skipif(

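The new nullcontext import supports the pattern used in test_update_weights above: the same block either expects an AssertionError (on CUDA machines, where the collector copies live on another device) or no error at all (on CPU-only machines), with the expectation chosen at runtime. Below is a minimal, self-contained sketch of that conditional-expectation idiom; check_update and its toy tensors are illustrative, not taken from the test suite.

from contextlib import nullcontext

import pytest
import torch


def check_update(cuda_available: bool) -> None:
    # On CUDA machines the two tensors are expected to differ, so the
    # comparison should raise; on CPU-only machines they coincide and the
    # comparison should pass, hence the no-op context manager.
    expectation = pytest.raises(AssertionError) if cuda_available else nullcontext()
    reference = torch.zeros(3)
    candidate = torch.ones(3) if cuda_available else torch.zeros(3)
    with expectation:
        torch.testing.assert_close(candidate, reference)


check_update(cuda_available=torch.cuda.is_available())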
torchrl/collectors/__init__.py
Lines changed: 6 additions & 7 deletions

@@ -5,13 +5,12 @@

 from torchrl.envs.utils import RandomPolicy

-from .collectors import (
-    aSyncDataCollector,
-    DataCollectorBase,
-    MultiaSyncDataCollector,
-    MultiSyncDataCollector,
-    SyncDataCollector,
-)
+from ._multi_async import MultiaSyncDataCollector
+from ._multi_sync import MultiSyncDataCollector
+from ._single import SyncDataCollector
+
+from ._single_async import aSyncDataCollector
+from .base import DataCollectorBase
 from .weight_update import (
     MultiProcessedWeightUpdater,
     RayWeightUpdater,

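The monolithic collectors module is split into per-class modules, but the package __init__ keeps re-exporting the same names, so, judging from this diff alone, user code importing from torchrl.collectors should be unaffected. For example:

# Public imports are unchanged even though the classes now live in
# _single.py, _single_async.py, _multi_sync.py and _multi_async.py.
from torchrl.collectors import (
    aSyncDataCollector,
    DataCollectorBase,
    MultiaSyncDataCollector,
    MultiSyncDataCollector,
    SyncDataCollector,
)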
torchrl/collectors/_constants.py
Lines changed: 84 additions & 0 deletions

@@ -0,0 +1,84 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""Constants and helper classes for collectors."""
from __future__ import annotations

import os
import sys
from multiprocessing.managers import SyncManager

import torch
from torch import multiprocessing as mp

from torchrl.envs.utils import ExplorationType

try:
    from torch.compiler import cudagraph_mark_step_begin
except ImportError:

    def cudagraph_mark_step_begin():
        """Placeholder for missing cudagraph_mark_step_begin method."""
        raise NotImplementedError("cudagraph_mark_step_begin not implemented.")


__all__ = [
    "_TIMEOUT",
    "INSTANTIATE_TIMEOUT",
    "_MIN_TIMEOUT",
    "_MAX_IDLE_COUNT",
    "DEFAULT_EXPLORATION_TYPE",
    "_is_osx",
    "_Interruptor",
    "_InterruptorManager",
    "cudagraph_mark_step_begin",
]

_TIMEOUT = 1.0
INSTANTIATE_TIMEOUT = 20
_MIN_TIMEOUT = 1e-3  # should be several orders of magnitude inferior wrt time spent collecting a trajectory
# MAX_IDLE_COUNT is the maximum number of times a Dataloader worker can timeout with his queue.
_MAX_IDLE_COUNT = int(os.environ.get("MAX_IDLE_COUNT", torch.iinfo(torch.int64).max))

DEFAULT_EXPLORATION_TYPE: ExplorationType = ExplorationType.RANDOM

_is_osx = sys.platform.startswith("darwin")


class _Interruptor:
    """A class for managing the collection state of a process.

    This class provides methods to start and stop collection, and to check
    whether collection has been stopped. The collection state is protected
    by a lock to ensure thread-safety.
    """

    # interrupter vs interruptor: google trends seems to indicate that "or" is more
    # widely used than "er" even if my IDE complains about that...
    def __init__(self):
        self._collect = True
        self._lock = mp.Lock()

    def start_collection(self):
        with self._lock:
            self._collect = True

    def stop_collection(self):
        with self._lock:
            self._collect = False

    def collection_stopped(self):
        with self._lock:
            return self._collect is False


class _InterruptorManager(SyncManager):
    """A custom SyncManager for managing the collection state of a process.

    This class extends the SyncManager class and allows to share an Interruptor object
    between processes.
    """


_InterruptorManager.register("_Interruptor", _Interruptor)

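For context, here is a rough sketch of how the manager-backed interruptor could be shared with a worker process, a usage pattern inferred from the docstrings above rather than taken from this commit; both classes are private, so treat this as illustrative only.

import time

from torch import multiprocessing as mp

from torchrl.collectors._constants import _InterruptorManager


def worker(interruptor):
    # Spin until the parent flips the shared flag through the proxy.
    while not interruptor.collection_stopped():
        time.sleep(0.01)


if __name__ == "__main__":
    manager = _InterruptorManager()
    manager.start()
    # The registered typeid returns a proxy to a shared _Interruptor instance.
    interruptor = manager._Interruptor()
    interruptor.start_collection()

    proc = mp.Process(target=worker, args=(interruptor,))
    proc.start()
    time.sleep(0.1)
    interruptor.stop_collection()  # the worker observes this and exits
    proc.join()
    manager.shutdown()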