Compress #42643 (Merged)
tests/quantization/compressed_tensors_integration/test_compressed_models.py

@@ -80,7 +80,9 @@ def _has_nested_attr(obj, attr_path):
 if comp_decomp_obj is not None and hasattr(submodule, "weight"):
     if "sparse-only" in uncompressed_model:
         self.assertTrue(
-            torch.equal(submodule.weight, comp_decomp_obj.weight),
+            torch.equal(
+                submodule.weight.to(torch_device), comp_decomp_obj.weight.to(torch_device)
+            ),
Comment on lines +83 to +85

Collaborator:

Have you seen that these two can end up on different devices?

@jiqing-feng (Contributor, Author), Dec 5, 2025:

Yes, I ran it on an A100 and got this device error:

FAILED tests/quantization/compressed_tensors_integration/test_compressed_models.py::StackCompressedModelTest::test_compressed_uncompressed_model_shapes - RuntimeError: Expected all tensors to be on the same device, but got other is on cuda:1, different from other tensors on cuda:0 (wh...
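For context, here is a minimal sketch of that failure mode and the fix pattern applied above. The device strings and tensor shapes are illustrative, and `torch_device` stands in for the value imported from `transformers.testing_utils`; reproducing the error requires at least two CUDA devices.

```python
import torch

torch_device = "cuda:0"  # stand-in for transformers.testing_utils.torch_device

# With device_map="auto" on a multi-GPU host, two modules' weights can be
# placed on different devices (illustrative; needs >= 2 CUDA devices).
a = torch.randn(8, 8, device="cuda:0")
b = a.to("cuda:1")

# Comparing across devices fails:
#   torch.equal(a, b)
#   RuntimeError: Expected all tensors to be on the same device, ...

# The fix: move both operands to one device before comparing.
assert torch.equal(a.to(torch_device), b.to(torch_device))
```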

f"Weight mismatch for module '{name}' in sparse-only model.",
)
else:
Second file:

@@ -2,7 +2,13 @@
 import unittest

 from transformers import AutoModelForCausalLM, AutoTokenizer, CompressedTensorsConfig
-from transformers.testing_utils import backend_empty_cache, require_compressed_tensors, require_torch, torch_device
+from transformers.testing_utils import (
+    backend_empty_cache,
+    require_compressed_tensors,
+    require_deterministic_for_xpu,
+    require_torch,
+    torch_device,
+)
 from transformers.utils import is_torch_available
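The newly imported `require_deterministic_for_xpu` comes from `transformers.testing_utils`; its actual implementation is not shown in this diff. As a hedged illustration only, a decorator of this kind might force deterministic kernels while a test runs on XPU, roughly along these lines (hypothetical sketch using stock torch APIs, not the transformers code):

```python
import functools

import torch

torch_device = "xpu"  # stand-in for transformers.testing_utils.torch_device


def require_deterministic_for_xpu_sketch(fn):
    """Hypothetical stand-in: force deterministic kernels while a test runs on XPU."""

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        if torch_device == "xpu":
            previous = torch.are_deterministic_algorithms_enabled()
            torch.use_deterministic_algorithms(True)
            try:
                return fn(*args, **kwargs)
            finally:
                # Restore the caller's setting so other tests are unaffected.
                torch.use_deterministic_algorithms(previous)
        else:
            return fn(*args, **kwargs)

    return wrapper
```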


@@ -47,22 +53,33 @@ def test_config_to_from_dict(self):
         self.assertIsInstance(config_from_dict.sparsity_config, SparsityCompressionConfig)

     def test_tinyllama_w8a8(self):
-        expected_out = "<s> Paris is the capital of which country?\n\n**A) 10** Paris is the capital of which country?\n\n**B) 11** Paris is the capital of which country?\n\n**C) 1"
+        expected_out = [
+            "<s> Paris is the capital of which country?\n\n**A) 10** Paris is the capital of which country?\n\n**B) 11** Paris is the capital of which country?\n\n**C) 1",
+            "<s> Paris is the capital of which country?\n\n** 10.** Which country is the capital of which country?\n\n** 11.** Which country is the capital of which country?\n\n** 12.",  # XPU
+        ]
         self._test_quantized_model(self.tinyllama_w8a8, expected_out)

     def test_tinyllama_w4a16(self):
-        expected_out = "<s> Paris is the capital of which country?\nAnswer: Paris is the capital of France.\nQuestion: Which country is the capital of which city?\nAnswer: The capital of the city of New York is New York.\nQuestion: Which"
+        expected_out = [
+            "<s> Paris is the capital of which country?\nAnswer: Paris is the capital of France.\nQuestion: Which country is the capital of which city?\nAnswer: The capital of the city of New York is New York.\nQuestion: Which"
+        ]
         self._test_quantized_model(self.tinyllama_w4a16, expected_out)

     def test_tinyllama_w8a16(self):
-        expected_out = "<s> Paris is the capital of which country?\nA. France\nB. Germany\nC. Spain\nD. Italy\nE. Switzerland\nQ10. Which of the following is not a country in the European Union?\nA."
+        expected_out = [
+            "<s> Paris is the capital of which country?\nA. France\nB. Germany\nC. Spain\nD. Italy\nE. Switzerland\nQ10. Which of the following is not a country in the European Union?\nA."
+        ]
         self._test_quantized_model(self.tinyllama_w8a16, expected_out)

     def test_llama_8b_fp8(self):
-        expected_out = "<|begin_of_text|>Paris is the capital of which country? France\nWhat is the name of the famous art museum in Paris? The Louvre\nWhat is the name of the famous bridge in Paris? Pont des Arts\nWhat is the name of the famous opera? "
+        expected_out = [
+            "<|begin_of_text|>Paris is the capital of which country? France\nWhat is the name of the famous art museum in Paris? The Louvre\nWhat is the name of the famous bridge in Paris? Pont des Arts\nWhat is the name of the famous opera? ",
+            "<|begin_of_text|>Paris is the capital of which country? France\nWhat is the name of the famous art museum in Paris? The Louvre\nWhat is the name of the famous bridge in Paris? Pont des Arts\nWhat is the name of the famous opera",  # XPU
+        ]
         self._test_quantized_model(self.llama3_8b_fp8, expected_out)

-    def _test_quantized_model(self, model_name: str, expected_output: str):
+    @require_deterministic_for_xpu
+    def _test_quantized_model(self, model_name: str, expected_output: list):
         """Carry out generation"""
         quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
         tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -84,4 +101,4 @@ def _test_quantized_model(self, model_name: str, expected_output: str):
         outputs = tokenizer.batch_decode(generated_ids)

         self.assertIsNotNone(outputs)
-        self.assertEqual(outputs[0], expected_output)
+        self.assertIn(outputs[0], expected_output)
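The switch from `assertEqual` to `assertIn` lets one test accept any of several backend-specific reference generations. A self-contained sketch of the pattern (the strings and test name here are hypothetical, not taken from the PR):

```python
import unittest


class BackendDependentOutputTest(unittest.TestCase):
    def test_generation_matches_a_reference(self):
        # Hypothetical references; in practice each entry is captured by
        # running the model on that backend (CUDA vs. XPU kernels can
        # legitimately decode to slightly different text).
        expected_output = [
            "Paris is the capital of France.",       # CUDA reference
            "Paris is the capital city of France.",  # XPU reference
        ]
        output = "Paris is the capital of France."  # stand-in for tokenizer.batch_decode(...)[0]
        # assertIn passes if the output matches any accepted reference,
        # whereas assertEqual would pin the test to a single backend.
        self.assertIn(output, expected_output)


if __name__ == "__main__":
    unittest.main()
```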