@@ -156,11 +156,11 @@ def prepare_config_and_inputs_for_common(self):
         # Place image tokens at the beginning
         input_ids[:, : self.num_image_tokens] = config.image_token_id

-        attention_mask = input_ids.ne(self.pad_token_id).to(torch_device)
+        attention_mask = input_ids.ne(self.pad_token_id)

         # Create image_sizes as tensor - must match batch size
         image_sizes = torch.tensor(
-            [[self.image_size, self.image_size]] * self.batch_size, dtype=torch.long, device=torch_device
+            [[self.image_size, self.image_size]] * self.batch_size, dtype=torch.long
         )

         inputs_dict = {
@@ -198,11 +198,11 @@ def prepare_config_and_inputs_for_generate(self, batch_size=None):
         # Place image tokens at the beginning
         input_ids[:, : self.num_image_tokens] = config.image_token_id

-        attention_mask = input_ids.ne(self.pad_token_id).to(torch_device)
+        attention_mask = input_ids.ne(self.pad_token_id)

         # Create image_sizes as tensor - must match batch size
         image_sizes = torch.tensor(
-            [[self.image_size, self.image_size]] * batch_size, dtype=torch.long, device=torch_device
+            [[self.image_size, self.image_size]] * batch_size, dtype=torch.long
         )

         inputs_dict = {
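The two hunks above drop the explicit `.to(torch_device)` calls when building `attention_mask` and `image_sizes`. A minimal sketch of the assumed pattern follows: inputs are created device-agnostically and moved to the target device in one place rather than per tensor. The helper name `move_inputs_to_device` is hypothetical and only illustrates the idea.

```python
import torch

# Hypothetical helper (not from the test file) illustrating the assumed pattern:
# build the inputs without a device argument, then move every tensor in the
# dict to the target device in a single, central place.
def move_inputs_to_device(inputs_dict, device):
    return {
        key: value.to(device) if isinstance(value, torch.Tensor) else value
        for key, value in inputs_dict.items()
    }


inputs_dict = {
    "input_ids": torch.randint(0, 100, (2, 10)),
    "attention_mask": torch.ones(2, 10, dtype=torch.long),
    "image_sizes": torch.tensor([[336, 336], [336, 336]], dtype=torch.long),
}
inputs_dict = move_inputs_to_device(inputs_dict, "cpu")  # or "cuda" when available
```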
@@ -612,7 +612,7 @@ def test_model_can_generate_without_images(self):
         model.eval()

         # Create text-only input
-        input_ids = torch.randint(0, config.vocab_size - 1, (1, 10), device=torch_device) + 1
+        input_ids = torch.randint(0, config.text_config.vocab_size - 1, (1, 10), device=torch_device) + 1

         with torch.no_grad():
             outputs = model.generate(input_ids=input_ids, max_new_tokens=5)
@@ -660,7 +660,7 @@ def test_model_forward_with_images(self):
         num_image_tokens = num_patches // (config.spatial_merge_size**2)

         seq_len = num_image_tokens + 10
-        input_ids = torch.randint(0, config.vocab_size - 1, (batch_size, seq_len), device=torch_device) + 1
+        input_ids = torch.randint(0, config.text_config.vocab_size - 1, (batch_size, seq_len), device=torch_device) + 1
         # Ensure no tokens accidentally equal image_token_id
         input_ids[input_ids == config.image_token_id] = config.image_token_id + 1
         # Now place image tokens at the beginning
@@ -677,4 +677,4 @@ def test_model_forward_with_images(self):
         self.assertIsNotNone(outputs.logits)
         self.assertEqual(outputs.logits.shape[0], batch_size)
         self.assertEqual(outputs.logits.shape[1], seq_len)
-        self.assertEqual(outputs.logits.shape[2], config.vocab_size)
+        self.assertEqual(outputs.logits.shape[2], config.text_config.vocab_size)
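The remaining hunks read the vocabulary size from the nested text config instead of the top-level config. Below is a minimal sketch, with hypothetical class names, of the composite config layout this assumes: the language-model settings, including `vocab_size`, live on `config.text_config`.

```python
# Hypothetical config classes sketching the nested layout the tests rely on.
class TextConfig:
    def __init__(self, vocab_size=32000):
        self.vocab_size = vocab_size


class VisionLanguageConfig:
    def __init__(self, text_config, image_token_id=32001):
        # The top-level config does not expose vocab_size directly;
        # it is read from the nested text (language model) config.
        self.text_config = text_config
        self.image_token_id = image_token_id


config = VisionLanguageConfig(TextConfig(vocab_size=32000))
assert config.text_config.vocab_size == 32000
```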