Merge pull request #63 from ashwinprasadme/dev

OKUA1 · web-flow · commit 1c93c891eab2 · 2023-09-13T19:14:05.000+02:00
Added MultiLabelFewShotGPTClassifier and MultiLabel GPT Fine-tuning Support
diff --git a/README.md b/README.md
@@ -200,6 +200,21 @@ While the api remains the same as for the zero shot classifier, there are a few
 
 Note: as the model is not being re-trained, but uses the training data during inference, one could say that this is still a (different) zero-shot approach.
 
+### Multi-Label Few-Shot Text Classification
+
+Example:
+
+```python
+from skllm.models.gpt.gpt_few_shot_clf import MultiLabelFewShotGPTClassifier
+from skllm.datasets import get_multilabel_classification_dataset
+
+X, y = get_multilabel_classification_dataset()
+
+clf = MultiLabelFewShotGPTClassifier(max_labels=2, openai_model="gpt-3.5-turbo")
+clf.fit(X, y)
+labels = clf.predict(X)
+```
+
 ### Dynamic Few-Shot Text Classification
 
 _To use this feature, you need to install `annoy` library:_
@@ -340,7 +355,23 @@ clf.fit(X_train, y_train) # y_train is a list of labels
 labels = clf.predict(X_test)
 ```
 
-Example 4: Fine-tuning a GPT model for text to text tasks
+Example 4: Fine-tuning a GPT model for multi-label text classification
+
+```python
+from skllm.models.gpt import MultiLabelGPTClassifier
+
+clf = MultiLabelGPTClassifier(
+    base_model = "gpt-3.5-turbo-0613",
+    n_epochs = None,  # int or None. When None, will be determined automatically by OpenAI
+    default_label = "Random",  # optional
+    max_labels = 2,
+)
+
+clf.fit(X_train, y_train)
+labels = clf.predict(X_test)
+```
+
+Example 5: Fine-tuning a GPT model for text to text tasks
 
 ```python
 from skllm.models.gpt import GPT
diff --git a/skllm/models/gpt/__init__.py b/skllm/models/gpt/__init__.py
@@ -1,8 +1,11 @@
 from skllm.models.gpt.gpt_dyn_few_shot_clf import DynamicFewShotGPTClassifier
-from skllm.models.gpt.gpt_few_shot_clf import FewShotGPTClassifier
+from skllm.models.gpt.gpt_few_shot_clf import (
+    FewShotGPTClassifier,
+    MultiLabelFewShotGPTClassifier,
+)
 from skllm.models.gpt.gpt_zero_shot_clf import (
     ZeroShotGPTClassifier,
     MultiLabelZeroShotGPTClassifier,
 )
 
-from skllm.models.gpt.gpt import GPTClassifier, GPT
+from skllm.models.gpt.gpt import GPTClassifier, GPT, MultiLabelGPTClassifier
diff --git a/skllm/models/gpt/gpt.py b/skllm/models/gpt/gpt.py
@@ -1,12 +1,26 @@
-from typing import Optional, Union, List
+import json
+import uuid
+from typing import List, Optional, Union
+
+import numpy as np
 import pandas as pd
+
 from skllm.models._base import _BaseZeroShotGPTClassifier
-from skllm.prompts.builders import build_zero_shot_prompt_slc
 from skllm.openai.credentials import set_credentials
-from skllm.openai.tuning import create_tuning_job, await_results, delete_file
-import numpy as np
-import json
-import uuid
+from skllm.openai.tuning import await_results, create_tuning_job, delete_file
+from skllm.prompts.builders import (
+    build_zero_shot_prompt_mlc,
+    build_zero_shot_prompt_slc,
+)
+
+from skllm.utils import extract_json_key
+
+_TRAINING_SAMPLE_PROMPT_TEMPLATE = """
+Sample input:
+```{x}```
+
+Sample target: {label}
+"""
 
 
 def _build_clf_example(
@@ -111,6 +125,126 @@ def fit(
         return self
 
 
+class MultiLabelGPTClassifier(_BaseZeroShotGPTClassifier, _Tunable):
+    """Fine-tunable GPT classifier for multi-label classification."""
+
+    supported_models = ["gpt-3.5-turbo-0613"]
+
+    def __init__(
+        self,
+        base_model: str = "gpt-3.5-turbo-0613",
+        default_label: Optional[Union[List[str], str]] = "Random",
+        openai_key: Optional[str] = None,
+        openai_org: Optional[str] = None,
+        n_epochs: Optional[int] = None,
+        custom_suffix: Optional[str] = "skllm",
+        max_labels: int = 3,
+    ):
+        self.base_model = base_model
+        self.n_epochs = n_epochs
+        self.custom_suffix = custom_suffix
+        if max_labels < 2:
+            raise ValueError("max_labels should be at least 2")
+        if isinstance(default_label, str) and default_label != "Random":
+            raise ValueError("default_label should be a list of strings or 'Random'")
+        self.max_labels = max_labels
+
+        if base_model not in self.supported_models:
+            raise ValueError(
+                f"Model {base_model} is not supported. Supported models are"
+                f" {self.supported_models}"
+            )
+        super().__init__(
+            openai_model="undefined",
+            default_label=default_label,
+            openai_key=openai_key,
+            openai_org=openai_org,
+        )
+
+    def _get_prompt(self, x: str) -> str:
+        """Generates the multi-label zero-shot prompt for the given input.
+
+        Parameters
+        ----------
+        x : str
+            sample
+
+        Returns
+        -------
+        str
+            final prompt
+        """
+        return build_zero_shot_prompt_mlc(
+            x=x,
+            labels=repr(self.classes_),
+            max_cats=self.max_labels,
+        )
+
+    def _extract_labels(self, y) -> List[str]:
+        """Flattens the per-sample label collections into a single list.
+
+        Parameters
+        ----------
+        y : Any
+
+        Returns
+        -------
+        List[str]
+        """
+        labels = []
+        for l in y:
+            for j in l:
+                labels.append(j)
+        return labels
+
+    def _predict_single(self, x):
+        """Predicts at most ``max_labels`` labels for a single sample."""
+        completion = self._get_chat_completion(x)
+        try:
+            labels = extract_json_key(
+                completion["choices"][0]["message"]["content"], "label"
+            )
+            if not isinstance(labels, list):
+                labels = labels.split(",")  # comma-separated string fallback
+                labels = [l.strip() for l in labels]
+        except Exception as e:
+            print(completion)
+            print(f"Could not extract the label from the completion: {str(e)}")
+            labels = []
+
+        labels = list(filter(lambda l: l in self.classes_, labels))
+        if len(labels) == 0:
+            labels = self._get_default_label()
+        if labels is not None and len(labels) > self.max_labels:
+            labels = labels[: self.max_labels]  # keep exactly max_labels
+        return labels
+
+    def fit(
+        self,
+        X: Union[np.ndarray, pd.Series, List[str]],
+        y: List[List[str]],
+    ):
+        """Fits the model to the given data and runs the tuning step.
+
+        Parameters
+        ----------
+        X : Union[np.ndarray, pd.Series, List[str]]
+            training data
+        y : List[List[str]]
+            training labels
+
+        Returns
+        -------
+        MultiLabelGPTClassifier
+            self
+        """
+        X = self._to_np(X)
+        y = self._to_np(y)
+        super().fit(X, y)
+        self._tune(X, y)
+        return self
+
+
 # similarly to PaLM, this is not a classifier, but a quick way to re-use the code
 # the hierarchy of classes will be reworked in the next releases
 class GPT(_BaseZeroShotGPTClassifier, _Tunable):
diff --git a/skllm/models/gpt/gpt_few_shot_clf.py b/skllm/models/gpt/gpt_few_shot_clf.py
@@ -1,10 +1,11 @@
-from typing import List, Union
+from typing import List, Literal, Optional, Union
 
 import numpy as np
 import pandas as pd
 
 from skllm.models._base import _BaseZeroShotGPTClassifier
-from skllm.prompts.builders import build_few_shot_prompt_slc
+from skllm.prompts.builders import build_few_shot_prompt_mlc, build_few_shot_prompt_slc
+from skllm.utils import extract_json_key
 from skllm.utils import to_numpy as _to_numpy
 
 _TRAINING_SAMPLE_PROMPT_TEMPLATE = """
@@ -69,3 +70,104 @@ def _get_prompt(self, x: str) -> str:
         return build_few_shot_prompt_slc(
             x=x, training_data=training_data_str, labels=repr(self.classes_)
         )
+
+
+class MultiLabelFewShotGPTClassifier(_BaseZeroShotGPTClassifier):
+    """Few-shot multi-label classifier."""
+
+    def __init__(
+        self,
+        openai_key: Optional[str] = None,
+        openai_org: Optional[str] = None,
+        openai_model: str = "gpt-3.5-turbo",
+        default_label: Optional[Union[List[str], Literal["Random"]]] = "Random",
+        max_labels: int = 3,
+    ):
+        super().__init__(openai_key, openai_org, openai_model, default_label)
+        if max_labels < 2:
+            raise ValueError("max_labels should be at least 2")
+        if isinstance(default_label, str) and default_label != "Random":
+            raise ValueError("default_label should be a list of strings or 'Random'")
+        self.max_labels = max_labels
+
+    def _extract_labels(self, y) -> List[str]:
+        """Flattens the per-sample label collections into a single list.
+
+        Parameters
+        ----------
+        y : Any
+
+        Returns
+        -------
+        List[str]
+        """
+        labels = []
+        for l in y:
+            for j in l:
+                labels.append(j)
+        return labels
+
+    def fit(
+        self,
+        X: Union[np.ndarray, pd.Series, List[str]],
+        y: List[List[str]],
+    ):
+        """Stores the training data so it can be embedded into the prompts.
+
+        Parameters
+        ----------
+        X : Union[np.ndarray, pd.Series, List[str]]
+            training data
+        y : List[List[str]]
+            training labels
+
+        Returns
+        -------
+        MultiLabelFewShotGPTClassifier
+            self
+        """
+        if not len(X) == len(y):
+            raise ValueError("X and y must have the same length.")
+        X = _to_numpy(X)
+        y = _to_numpy(y)
+        self.training_data_ = (X, y)
+        self.classes_, self.probabilities_ = self._get_unique_targets(y)
+        return self
+
+    def _get_prompt(self, x) -> str:
+        training_data = []
+        for xt, yt in zip(*self.training_data_):
+            training_data.append(
+                _TRAINING_SAMPLE_PROMPT_TEMPLATE.format(x=xt, label=yt)
+            )
+
+        training_data_str = "\n".join(training_data)
+
+        return build_few_shot_prompt_mlc(
+            x=x,
+            training_data=training_data_str,
+            labels=repr(self.classes_),
+            max_cats=self.max_labels,
+        )
+
+    def _predict_single(self, x):
+        """Predicts at most ``max_labels`` labels for a single sample."""
+        completion = self._get_chat_completion(x)
+        try:
+            labels = extract_json_key(
+                completion["choices"][0]["message"]["content"], "label"
+            )
+            if not isinstance(labels, list):
+                labels = labels.split(",")  # comma-separated string fallback
+                labels = [l.strip() for l in labels]
+        except Exception as e:
+            print(completion)
+            print(f"Could not extract the label from the completion: {str(e)}")
+            labels = []
+
+        labels = list(filter(lambda l: l in self.classes_, labels))
+        if len(labels) == 0:
+            labels = self._get_default_label()
+        if labels is not None and len(labels) > self.max_labels:
+            labels = labels[: self.max_labels]  # keep exactly max_labels
+        return labels
diff --git a/skllm/prompts/builders.py b/skllm/prompts/builders.py
@@ -2,6 +2,7 @@
 
 from skllm.prompts.templates import (
     FEW_SHOT_CLF_PROMPT_TEMPLATE,
+    FEW_SHOT_MLCLF_PROMPT_TEMPLATE,
     FOCUSED_SUMMARY_PROMPT_TEMPLATE,
     SUMMARY_PROMPT_TEMPLATE,
     TRANSLATION_PROMPT_TEMPLATE,
@@ -61,6 +62,38 @@ def build_few_shot_prompt_slc(
     return template.format(x=x, labels=labels, training_data=training_data)
 
 
+def build_few_shot_prompt_mlc(
+    x: str,
+    labels: str,
+    training_data: str,
+    max_cats: Union[int, str],
+    template: str = FEW_SHOT_MLCLF_PROMPT_TEMPLATE,
+) -> str:
+    """Builds a prompt for few-shot multi-label classification.
+
+    Parameters
+    ----------
+    x : str
+        sample to classify
+    labels : str
+        candidate labels in a list-like representation
+    max_cats : Union[int,str]
+        maximum number of categories to assign
+    training_data : str
+        training data to be used for few-shot learning
+    template : str
+        prompt template to use, must contain placeholders for all variables, by default FEW_SHOT_MLCLF_PROMPT_TEMPLATE
+
+    Returns
+    -------
+    str
+        prepared prompt
+    """
+    return template.format(
+        x=x, labels=labels, training_data=training_data, max_cats=max_cats
+    )
+
+
 def build_zero_shot_prompt_mlc(
     x: str,
     labels: str,
diff --git a/skllm/prompts/templates.py b/skllm/prompts/templates.py