Commit 7ebd7f2

Add back support for old Falcon model type/config (#243)
Two days ago Falcon changed their model type and configuration keys. Last night they reverted the changes. Update Falcon loading to support both old-style and new-style models. Fixes #242.
1 parent f368237 commit 7ebd7f2

File tree: 3 files changed, +35 -11 lines changed


curated_transformers/models/auto_model.py

Lines changed: 4 additions & 0 deletions
@@ -116,6 +116,8 @@ class AutoDecoder(AutoModel[DecoderModule]):
         "gpt_neox": GPTNeoXDecoder,
         "llama": LLaMADecoder,
         "falcon": FalconDecoder,
+        "RefinedWeb": FalconDecoder,
+        "RefinedWebModel": FalconDecoder,
     }
 
     @classmethod
@@ -143,6 +145,8 @@ class AutoCausalLM(AutoModel[CausalLMModule[KeyValueCache]]):
         "gpt_neox": GPTNeoXCausalLM,
         "llama": LLaMACausalLM,
         "falcon": FalconCausalLM,
+        "RefinedWeb": FalconCausalLM,
+        "RefinedWebModel": FalconCausalLM,
     }
 
     @classmethod
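With both the old and the new model type strings registered, a checkpoint whose config reports any of "falcon", "RefinedWeb", or "RefinedWebModel" resolves to the same Falcon classes. A minimal usage sketch, based on the from_hf_hub(name=..., device=...) call used in the tests further down; the import path, the "cpu" device, and the assumption that the two explosion-testing repositories cover the old-style and new-style configs are illustrative:

import torch

from curated_transformers.models.falcon import FalconDecoder  # import path assumed

# The same loading call should work for either config style; the repositories
# below are the test checkpoints added to FALCON_TEST_MODELS in this commit.
for name in (
    "explosion-testing/falcon-test",
    "explosion-testing/refined-web-model-test",
):
    decoder = FalconDecoder.from_hf_hub(name=name, device=torch.device("cpu"))
    decoder.eval()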

curated_transformers/models/falcon/_hf.py

Lines changed: 19 additions & 3 deletions
@@ -12,20 +12,26 @@
 EXTRA_KWARG_KEYS = [ATTENTION_DROPOUT, HIDDEN_DROPOUT]
 
 
+# There are multiple versions of Falcon with different names
+# for the same options.
+HF_CONFIG_KEY_MAPPING_WITH_COMPAT = {
+    frozenset({"num_attention_heads", "n_head"}): "num_attention_heads",
+    frozenset({"num_hidden_layers", "n_layer"}): "num_hidden_layers",
+}
+
 HF_CONFIG_KEY_MAPPING = {
     "hidden_size": "hidden_width",
     "layer_norm_epsilon": "layer_norm_eps",
     "multi_query": "multi_query",
-    "num_attention_heads": "num_attention_heads",
-    "num_hidden_layers": "num_hidden_layers",
     "bias": "use_bias",
     "vocab_size": "vocab_size",
 }
 
 
 def convert_hf_config(hf_config: Any) -> FalconConfig:
+    hf_config_keys = set(hf_config.keys())
     missing_keys = tuple(
-        sorted(set(HF_CONFIG_KEY_MAPPING.keys()).difference(set(hf_config.keys())))
+        sorted(set(HF_CONFIG_KEY_MAPPING.keys()).difference(hf_config_keys))
     )
     if len(missing_keys) != 0:
         raise ValueError(f"Missing keys in Hugging Face Falcon config: {missing_keys}")
@@ -34,6 +40,16 @@ def convert_hf_config(hf_config: Any) -> FalconConfig:
     # Handle config options that are not set in all models.
     kwargs.update({k: hf_config[k] for k in EXTRA_KWARG_KEYS if k in hf_config})
 
+    for hf_keys, curated in HF_CONFIG_KEY_MAPPING_WITH_COMPAT.items():
+        key_overlap = list(hf_keys.intersection(hf_config_keys))
+        if not key_overlap:
+            raise ValueError(
+                f"Hugging Face Falcon config must contain one of: {', '.join(sorted(hf_keys))}"
+            )
+        # Ideally, we'd check that we only have one overlapping key, but
+        # I bet that someone will then add both keys 'just to be sure'.
+        kwargs[curated] = hf_config[key_overlap[0]]
+
     parallel_attention = hf_config.get("parallel_attn", True)
 
     # When new_decoder_architecture is set, the multi_query and parallel_attn
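To make the compat handling concrete, here is a small standalone sketch that applies the same resolution loop to two hand-written configs, one using the old key names (n_head/n_layer) and one using the new ones (num_attention_heads/num_hidden_layers). The config dicts and their values are made up for illustration; the mapping and loop mirror the hunk above:

# Mirrors HF_CONFIG_KEY_MAPPING_WITH_COMPAT from the diff above.
COMPAT_MAPPING = {
    frozenset({"num_attention_heads", "n_head"}): "num_attention_heads",
    frozenset({"num_hidden_layers", "n_layer"}): "num_hidden_layers",
}

old_style = {"n_head": 8, "n_layer": 4}  # old-style key names (values made up)
new_style = {"num_attention_heads": 8, "num_hidden_layers": 4}  # new-style key names

for hf_config in (old_style, new_style):
    kwargs = {}
    hf_config_keys = set(hf_config.keys())
    for hf_keys, curated in COMPAT_MAPPING.items():
        key_overlap = list(hf_keys.intersection(hf_config_keys))
        if not key_overlap:
            raise ValueError(
                f"Hugging Face Falcon config must contain one of: {', '.join(sorted(hf_keys))}"
            )
        # Take whichever spelling is present; both map to the curated key.
        kwargs[curated] = hf_config[key_overlap[0]]
    # Both configs resolve to the same curated kwargs:
    # {'num_attention_heads': 8, 'num_hidden_layers': 4}
    print(kwargs)

Either spelling fills the same curated_transformers keyword argument, so convert_hf_config no longer depends on which config version a checkpoint ships with.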

curated_transformers/tests/models/falcon/test_decoder.py

Lines changed: 12 additions & 8 deletions
@@ -25,9 +25,16 @@
 # against output without caching.
 
 
+FALCON_TEST_MODELS = [
+    "explosion-testing/falcon-test",
+    "explosion-testing/refined-web-model-test",
+]
+
+
 @pytest.mark.skipif(not has_hf_transformers, reason="requires huggingface transformers")
 @pytest.mark.parametrize("torch_device", TORCH_DEVICES)
-def test_decoder(torch_device):
+@pytest.mark.parametrize("model", FALCON_TEST_MODELS)
+def test_decoder(torch_device, model):
     hf_model = transformers.AutoModel.from_pretrained(
         "explosion-testing/falcon-test",
         # Safe because it is under our control.
@@ -38,9 +45,7 @@ def test_decoder(torch_device):
     hf_model.to(torch_device)
     hf_model.eval()
 
-    model = FalconDecoder.from_hf_hub(
-        name="explosion-testing/falcon-test", device=torch_device
-    )
+    model = FalconDecoder.from_hf_hub(name=model, device=torch_device)
     model.eval()
 
     torch.manual_seed(0)
@@ -55,10 +60,9 @@
 
 @pytest.mark.skipif(not has_hf_transformers, reason="requires huggingface transformers")
 @pytest.mark.parametrize("torch_device", TORCH_DEVICES)
-def test_decoder_with_cache(torch_device):
-    model = FalconDecoder.from_hf_hub(
-        name="explosion-testing/falcon-test", device=torch_device
-    )
+@pytest.mark.parametrize("model", FALCON_TEST_MODELS)
+def test_decoder_with_cache(torch_device, model):
+    model = FalconDecoder.from_hf_hub(name=model, device=torch_device)
     model.eval()
 
     torch.manual_seed(0)
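Because the decoder tests are now parametrized over FALCON_TEST_MODELS, each test runs once per checkpoint, covering the old-style and new-style configs in one test module. A single variant can still be selected by keyword when running pytest from the repository root (command illustrative):

pytest curated_transformers/tests/models/falcon/test_decoder.py -k refined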
