added readme + minor code refactoring + demo data

OKUA1 · OKUA1 · commit 157548dcee01 · 2023-05-13T19:55:54.000+02:00
diff --git a/README.md b/README.md
@@ -1 +1,116 @@
-# scikit-llm
+<p align="center">
+  <img src="https://github.com/iryna-kondr/scikit-llm/blob/main/logo.png?raw=true" height="200"/>
+</p>
+
+# Scikit-LLM: Sklearn Meets Large Language Models
+
+Seamlessly integrate powerful language models like ChatGPT into scikit-learn for enhanced text analysis tasks.
+
+## Installation 💾 
+
+```bash 
+pip install scikit-llm
+```
+
+## Documentation 📚
+
+### Configuring OpenAI API Key
+At the moment Scikit-LLM is only compatible with some of the OpenAI models. Hence, a user-provided OpenAI API key is required.
+
+```python
+from skllm.config import SKLLMConfig
+SKLLMConfig.set_openai_key("<YOUR_KEY>")
+SKLLMConfig.set_openai_org("<YOUR_ORGANISATION>")
+```
+
+### Zero-Shot Text Classification
+
+One of ChatGPT's powerful features is the ability to perform text classification without being re-trained. The only requirement is that the labels must be descriptive.
+
+We provide a class `ZeroShotGPTClassifier` that allows you to create such a model as a regular scikit-learn classifier.
+
+Example 1: Training as a regular classifier
+```python
+from skllm import ZeroShotGPTClassifier
+from skllm.datasets import get_classification_dataset
+
+# demo sentiment analysis dataset
+# labels: positive, negative, neutral
+X, y = get_classification_dataset() 
+
+clf = ZeroShotGPTClassifier(openai_model = "gpt-3.5-turbo")
+clf.fit(X, y)
+labels = clf.predict(X)
+```
+Scikit-LLM will automatically query the OpenAI API and transform the response into a regular list of labels.
+
+Additionally, Scikit-LLM will ensure that the obtained response contains a valid label. If this is not the case, a label will be selected randomly (label probabilities are proportional to label occurrences in the training set).
+
+Example 2: Training without labeled data
+
+Since the training data is not strictly required, it can be fully omitted. The only thing that has to be provided is the list of candidate labels.
+
+```python
+from skllm import ZeroShotGPTClassifier
+from skllm.datasets import get_classification_dataset
+
+X, _ = get_classification_dataset()
+
+clf = ZeroShotGPTClassifier()
+clf.fit(None, ['positive', 'negative', 'neutral'])
+labels = clf.predict(X)
+
+```
+
+### Multi-Label Zero-Shot Text Classification
+
+With the class `MultiLabelZeroShotGPTClassifier` it is possible to perform classification in a multi-label setting, which means that each sample might be assigned to one or several distinct classes.
+
+Example: 
+
+```python
+from skllm import MultiLabelZeroShotGPTClassifier
+from skllm.datasets import get_multilabel_classification_dataset
+
+X, y = get_multilabel_classification_dataset()
+
+clf = MultiLabelZeroShotGPTClassifier(max_labels=3)
+clf.fit(X, y)
+labels = clf.predict(X)
+```
+
+Similarly to the `ZeroShotGPTClassifier`, it is sufficient if only candidate labels are provided. However, this time the classifier expects `y` of type `List[List[str]]`.
+
+```python
+from skllm import MultiLabelZeroShotGPTClassifier
+from skllm.datasets import get_multilabel_classification_dataset
+
+X, _ = get_multilabel_classification_dataset()
+candidate_labels = [
+    "Quality", 
+    "Price", 
+    "Delivery", 
+    "Service", 
+    "Product Variety", 
+    "Customer Support", 
+    "Packaging", 
+    "User Experience", 
+    "Return Policy", 
+    "Product Information"
+]
+clf = MultiLabelZeroShotGPTClassifier(max_labels=3)
+clf.fit(None, [candidate_labels])
+labels = clf.predict(X)
+```
+
+## Roadmap 🧭
+
+- [x] Zero-Shot Classification with OpenAI GPT 3/4
+    - [x] Multiclass classification
+    - [x] Multi-label classification
+    - [x] ChatGPT models
+    - [ ] InstructGPT models
+- [ ] Few shot classifier
+- [ ] GPT Vectorizer
+- [ ] GPT Fine-tuning (optional)
+- [ ] Integration of other LLMs
diff --git a/skllm/config.py b/skllm/config.py
@@ -1,14 +1,23 @@
 import os
 from typing import Optional
 
-_OPENAI_KEY_VAR = 'SLLM_CONFIG_OPENAI_KEY'
+_OPENAI_KEY_VAR = "SKLLM_CONFIG_OPENAI_KEY"
+_OPENAI_ORG_VAR = "SKLLM_CONFIG_OPENAI_ORG"
 
-class SLLMConfig():
+class SKLLMConfig():
     
     @staticmethod
     def set_openai_key(key: str) -> None:
         os.environ[_OPENAI_KEY_VAR] = key
 
     @staticmethod
     def get_openai_key() -> Optional[str]:
-        return os.environ.get(_OPENAI_KEY_VAR, None)
+        return os.environ.get(_OPENAI_KEY_VAR, None)
+    
+    @staticmethod
+    def set_openai_org(key: str) -> None:
+        os.environ[_OPENAI_ORG_VAR] = key
+
+    @staticmethod
+    def get_openai_org() -> Optional[str]:
+        return os.environ.get(_OPENAI_ORG_VAR, None)
diff --git a/skllm/datasets/__init__.py b/skllm/datasets/__init__.py
@@ -0,0 +1,2 @@
+from skllm.datasets.multi_class import get_classification_dataset
+from skllm.datasets.multi_label import get_multilabel_classification_dataset
diff --git a/skllm/datasets/multi_class.py b/skllm/datasets/multi_class.py
@@ -0,0 +1,42 @@
+def get_classification_dataset():
+    X = [
+        r"I was absolutely blown away by the performances in 'Summer's End'. The acting was top-notch, and the plot had me gripped from start to finish. A truly captivating cinematic experience that I would highly recommend.",
+        r"The special effects in 'Star Battles: Nebula Conflict' were out of this world. I felt like I was actually in space. The storyline was incredibly engaging and left me wanting more. Excellent film.",
+        r"'The Lost Symphony' was a masterclass in character development and storytelling. The score was hauntingly beautiful and complimented the intense, emotional scenes perfectly. Kudos to the director and cast for creating such a masterpiece.",
+        r"I was pleasantly surprised by 'Love in the Time of Cholera'. The romantic storyline was heartwarming and the characters were incredibly realistic. The cinematography was also top-notch. A must-watch for all romance lovers.",
+        r"I went into 'Marble Street' with low expectations, but I was pleasantly surprised. The suspense was well-maintained throughout, and the twist at the end was something I did not see coming. Bravo!",
+        r"'The Great Plains' is a touching portrayal of life in rural America. The performances were heartfelt and the scenery was breathtaking. I was moved to tears by the end. It's a story that will stay with me for a long time.",
+        r"The screenwriting in 'Under the Willow Tree' was superb. The dialogue felt real and the characters were well-rounded. The performances were also fantastic. I haven't enjoyed a movie this much in a while.",
+        r"'Nightshade' is a brilliant take on the superhero genre. The protagonist was relatable and the villain was genuinely scary. The action sequences were thrilling and the storyline was engaging. I can't wait for the sequel.",
+        r"The cinematography in 'Awakening' was nothing short of spectacular. The visuals alone are worth the ticket price. The storyline was unique and the performances were solid. An overall fantastic film.",
+        r"'Eternal Embers' was a cinematic delight. The storytelling was original and the performances were exceptional. The director's vision was truly brought to life on the big screen. A must-see for all movie lovers.",
+        r"I was thoroughly disappointed with 'Silver Shadows'. The plot was confusing and the performances were lackluster. I wouldn't recommend wasting your time on this one.",
+        r"'The Darkened Path' was a disaster. The storyline was unoriginal, the acting was wooden and the special effects were laughably bad. Save your money and skip this one.",
+        r"I had high hopes for 'The Final Frontier', but it failed to deliver. The plot was full of holes and the characters were poorly developed. It was a disappointing experience.",
+        r"'The Fall of the Phoenix' was a letdown. The storyline was confusing and the characters were one-dimensional. I found myself checking my watch multiple times throughout the movie.",
+        r"I regret wasting my time on 'Emerald City'. The plot was nonsensical and the performances were uninspired. It was a major disappointment.",
+        r"I found 'Hollow Echoes' to be a complete mess. The plot was non-existent, the performances were overdone, and the pacing was all over the place. Definitely not worth the hype.",
+        r"'Underneath the Stars' was a huge disappointment. The storyline was predictable and the acting was mediocre at best. I was expecting so much more.",
+        r"I was left unimpressed by 'River's Edge'. The plot was convoluted, the characters were uninteresting, and the ending was unsatisfying. It's a pass for me.",
+        r"The acting in 'Desert Mirage' was subpar, and the plot was boring. I found myself yawning multiple times throughout the movie. Save your time and skip this one.",
+        r"'Crimson Dawn' was a major letdown. The plot was cliched and the characters were flat. The special effects were also poorly executed. I wouldn't recommend it.",
+        r"'Remember the Days' was utterly forgettable. The storyline was dull, the performances were bland, and the dialogue was cringeworthy. A big disappointment.",
+        r"'The Last Frontier' was simply okay. The plot was decent and the performances were acceptable. However, it lacked a certain spark to make it truly memorable.",
+        r"'Through the Storm' was not bad, but it wasn't great either. The storyline was somewhat predictable, and the characters were somewhat stereotypical. It was an average movie at best.",
+        r"I found 'After the Rain' to be pretty average. The plot was okay and the performances were decent, but it didn't leave a lasting impression on me.",
+        r"'Beyond the Horizon' was neither good nor bad. The plot was interesting enough, but the characters were not very well developed. It was an okay watch.",
+        r"'The Silent Echo' was a mediocre movie. The storyline was passable and the performances were fair, but it didn't stand out in any way.",
+        r"I thought 'The Scent of Roses' was pretty average. The plot was somewhat engaging, and the performances were okay, but it didn't live up to my expectations.",
+        r"'Under the Same Sky' was an okay movie. The plot was decent, and the performances were fine, but it lacked depth and originality. It's not a movie I would watch again.",
+        r"'Chasing Shadows' was fairly average. The plot was not bad, and the performances were passable, but it lacked a certain spark. It was just okay.",
+        r"'Beneath the Surface' was pretty run-of-the-mill. The plot was decent, the performances were okay, but it wasn't particularly memorable. It was an okay movie.",
+    ]
+
+
+    y = (
+        ["positive" for _ in range(10)]
+        + ["negative" for _ in range(10)]
+        + ["neutral" for _ in range(10)]
+    )
+
+    return X, y
diff --git a/skllm/datasets/multi_label.py b/skllm/datasets/multi_label.py
@@ -0,0 +1,28 @@
+def get_multilabel_classification_dataset():
+    X = [
+    "The product was of excellent quality, and the packaging was also very good. Highly recommend!",
+    "The delivery was super fast, but the product did not match the information provided on the website.",
+    "Great variety of products, but the customer support was quite unresponsive.",
+    "Affordable prices and an easy-to-use website. A great shopping experience overall.",
+    "The delivery was delayed, and the packaging was damaged. Not a good experience.",
+    "Excellent customer support, but the return policy is quite complicated.",
+    "The product was not as described. However, the return process was easy and quick.",
+    "Great service and fast delivery. The product was also of high quality.",
+    "The prices are a bit high. However, the product quality and user experience are worth it.",
+    "The website provides detailed information about products. The delivery was also very fast."
+    ]
+
+    y = [
+        ["Quality", "Packaging"],
+        ["Delivery", "Product Information"],
+        ["Product Variety", "Customer Support"],
+        ["Price", "User Experience"],
+        ["Delivery", "Packaging"],
+        ["Customer Support", "Return Policy"],
+        ["Product Information", "Return Policy"],
+        ["Service", "Delivery", "Quality"],
+        ["Price", "Quality", "User Experience"],
+        ["Product Information", "Delivery"],
+    ]
+
+    return X, y
diff --git a/skllm/models/gpt_zero_shot_clf.py b/skllm/models/gpt_zero_shot_clf.py
@@ -7,7 +7,12 @@
 from abc import ABC, abstractmethod
 from sklearn.base import BaseEstimator, ClassifierMixin
 from skllm.openai.prompts import get_zero_shot_prompt_slc, get_zero_shot_prompt_mlc
-from skllm.openai.chatgpt import construct_message, get_chat_completion, extract_json_key
+from skllm.openai.chatgpt import (
+    construct_message,
+    get_chat_completion,
+    extract_json_key,
+)
+from skllm.config import SKLLMConfig as _Config
 
 
 class _BaseZeroShotGPTClassifier(ABC, BaseEstimator, ClassifierMixin):
@@ -23,20 +28,37 @@ def __init__(
 
     def fit(
         self,
-        X: Union[np.ndarray, pd.Series, List[str]],
+        X: Optional[Union[np.ndarray, pd.Series, List[str]]],
         y: Union[np.ndarray, pd.Series, List[str], List[List[str]]],
     ):
+        if isinstance(X, np.ndarray):
+            X = np.squeeze(X)
         self.classes_, self.probabilities_ = self._get_unique_targets(y)
         return self
 
-    def predict(self, X):
+    def predict(self, X: Union[np.ndarray, pd.Series, List[str]]):
+        if isinstance(X, np.ndarray):
+            X = np.squeeze(X)
         predictions = []
         for i in tqdm(range(len(X))):
             predictions.append(self._predict_single(X[i]))
         return predictions
 
-    def _get_openai_keys(self):
-        return self.openai_key, self.openai_org
+    def _get_openai_key(self):
+        key = self.openai_key
+        if key is None:
+            key = _Config.get_openai_key()
+        if key is None:
+            raise RuntimeError("OpenAI key was not found")
+        return key
+
+    def _get_openai_org(self):
+        key = self.openai_org
+        if key is None:
+            key = _Config.get_openai_org()
+        if key is None:
+            raise RuntimeError("OpenAI organization was not found")
+        return key
 
     @abstractmethod
     def _extract_labels(self, y: Any) -> List[str]:
@@ -56,13 +78,13 @@ def _get_unique_targets(self, y):
 
         return classes, probs
 
-    def _get_completion(self, x):
+    def _get_chat_completion(self, x):
         prompt = self._get_prompt(x)
         msgs = []
         msgs.append(construct_message("system", "You are a text classification model."))
         msgs.append(construct_message("user", prompt))
         completion = get_chat_completion(
-            msgs, self.openai_key, self.openai_org, self.openai_model
+            msgs, self._get_openai_key(), self._get_openai_org(), self.openai_model
         )
         return completion
 
@@ -87,7 +109,7 @@ def _get_prompt(self, x) -> str:
         return get_zero_shot_prompt_slc(x, self.classes_)
 
     def _predict_single(self, x):
-        completion = self._get_completion(x)
+        completion = self._get_chat_completion(x)
         try:
             label = str(
                 extract_json_key(completion.choices[0].message["content"], "label")
@@ -99,6 +121,15 @@ def _predict_single(self, x):
             label = random.choices(self.classes_, self.probabilities_)[0]
         return label
 
+    def fit(
+        self,
+        X: Optional[Union[np.ndarray, pd.Series, List[str]]],
+        y: Union[np.ndarray, pd.Series, List[str]],
+    ):
+        if isinstance(y, np.ndarray):
+            y = np.squeeze(y)
+        return super().fit(X, y)
+
 
 class MultiLabelZeroShotGPTClassifier(_BaseZeroShotGPTClassifier):
     def __init__(
@@ -125,7 +156,7 @@ def _get_prompt(self, x) -> str:
         return get_zero_shot_prompt_mlc(x, self.classes_, self.max_labels)
 
     def _predict_single(self, x):
-        completion = self._get_completion(x)
+        completion = self._get_chat_completion(x)
         try:
             labels = extract_json_key(completion.choices[0].message["content"], "label")
             if not isinstance(labels, list):
@@ -139,4 +170,11 @@ def _predict_single(self, x):
             labels = labels[: self.max_labels - 1]
         elif len(labels) < 1:
             labels = [random.choices(self.classes_, self.probabilities_)[0]]
-        return labels
+        return labels
+
+    def fit(
+        self,
+        X: Optional[Union[np.ndarray, pd.Series, List[str]]],
+        y: List[List[str]],
+    ):
+        return super().fit(X, y)
diff --git a/skllm/openai/chatgpt.py b/skllm/openai/chatgpt.py
@@ -6,14 +6,17 @@ def construct_message(role, content):
         raise ValueError("Invalid role")
     return {"role": role, "content": content}
 
-def get_chat_completion(messages, key, org, model="gpt-3.5-turbo"):
+def get_chat_completion(messages, key, org, model="gpt-3.5-turbo", max_retries = 3):
     openai.api_key = key
     openai.organization = org
-    completion = openai.ChatCompletion.create(
-        model=model, temperature=0., messages=messages
-    )
-
-    return completion
+    for _ in range(max_retries):
+        try:
+            completion = openai.ChatCompletion.create(
+                model=model, temperature=0., messages=messages
+            )
+            return completion
+        except Exception:
+            continue
 
 def extract_json_key(json_, key):
     try: 

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+from skllm.datasets.multi_class import get_classification_dataset`
	`2`	`+from skllm.datasets.multi_label import get_multilabel_classification_dataset`