@@ -156,11 +156,11 @@ def prepare_config_and_inputs_for_common(self):
         # Place image tokens at the beginning
         input_ids[:, : self.num_image_tokens] = config.image_token_id

-        attention_mask = input_ids.ne(self.pad_token_id).to(torch_device)
+        attention_mask = input_ids.ne(self.pad_token_id)

         # Create image_sizes as tensor - must match batch size
         image_sizes = torch.tensor(
-            [[self.image_size, self.image_size]] * self.batch_size, dtype=torch.long, device=torch_device
+            [[self.image_size, self.image_size]] * self.batch_size, dtype=torch.long
         )

         inputs_dict = {
@@ -198,11 +198,11 @@ def prepare_config_and_inputs_for_generate(self, batch_size=None):
         # Place image tokens at the beginning
         input_ids[:, : self.num_image_tokens] = config.image_token_id

-        attention_mask = input_ids.ne(self.pad_token_id).to(torch_device)
+        attention_mask = input_ids.ne(self.pad_token_id)

         # Create image_sizes as tensor - must match batch size
         image_sizes = torch.tensor(
-            [[self.image_size, self.image_size]] * batch_size, dtype=torch.long, device=torch_device
+            [[self.image_size, self.image_size]] * batch_size, dtype=torch.long
         )

         inputs_dict = {
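The two hunks above drop the explicit `.to(torch_device)` calls when building `attention_mask` and `image_sizes`. A minimal sketch of the assumed pattern follows: inputs are created device-agnostically and moved to the target device in one place rather than per tensor. The helper name `move_inputs_to_device` is hypothetical and only illustrates the idea.

```python
import torch

# Hypothetical helper (not from the test file) illustrating the assumed pattern:
# build the inputs without a device argument, then move every tensor in the
# dict to the target device in a single, central place.
def move_inputs_to_device(inputs_dict, device):
    return {
        key: value.to(device) if isinstance(value, torch.Tensor) else value
        for key, value in inputs_dict.items()
    }


inputs_dict = {
    "input_ids": torch.randint(0, 100, (2, 10)),
    "attention_mask": torch.ones(2, 10, dtype=torch.long),
    "image_sizes": torch.tensor([[336, 336], [336, 336]], dtype=torch.long),
}
inputs_dict = move_inputs_to_device(inputs_dict, "cpu")  # or "cuda" when available
```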
@@ -612,7 +612,7 @@ def test_model_can_generate_without_images(self):
         model.eval()

         # Create text-only input
-        input_ids = torch.randint(0, config.vocab_size - 1, (1, 10), device=torch_device) + 1
+        input_ids = torch.randint(0, config.text_config.vocab_size - 1, (1, 10), device=torch_device) + 1

         with torch.no_grad():
             outputs = model.generate(input_ids=input_ids, max_new_tokens=5)
@@ -660,7 +660,7 @@ def test_model_forward_with_images(self):
         num_image_tokens = num_patches // (config.spatial_merge_size**2)

         seq_len = num_image_tokens + 10
-        input_ids = torch.randint(0, config.vocab_size - 1, (batch_size, seq_len), device=torch_device) + 1
+        input_ids = torch.randint(0, config.text_config.vocab_size - 1, (batch_size, seq_len), device=torch_device) + 1
         # Ensure no tokens accidentally equal image_token_id
         input_ids[input_ids == config.image_token_id] = config.image_token_id + 1
         # Now place image tokens at the beginning
@@ -677,4 +677,4 @@ def test_model_forward_with_images(self):
         self.assertIsNotNone(outputs.logits)
         self.assertEqual(outputs.logits.shape[0], batch_size)
         self.assertEqual(outputs.logits.shape[1], seq_len)
-        self.assertEqual(outputs.logits.shape[2], config.vocab_size)
+        self.assertEqual(outputs.logits.shape[2], config.text_config.vocab_size)
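The remaining hunks read the vocabulary size from the nested text config instead of the top-level config. Below is a minimal sketch, with hypothetical class names, of the composite config layout this assumes: the language-model settings, including `vocab_size`, live on `config.text_config`.

```python
# Hypothetical config classes sketching the nested layout the tests rely on.
class TextConfig:
    def __init__(self, vocab_size=32000):
        self.vocab_size = vocab_size


class VisionLanguageConfig:
    def __init__(self, text_config, image_token_id=32001):
        # The top-level config does not expose vocab_size directly;
        # it is read from the nested text (language model) config.
        self.text_config = text_config
        self.image_token_id = image_token_id


config = VisionLanguageConfig(TextConfig(vocab_size=32000))
assert config.text_config.vocab_size == 32000
```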