v0.1.0 beta2

OKUA1 · OKUA1 · commit 3d4620b15dd8 · 2023-05-19T22:05:38.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -159,3 +159,4 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 test.py
+tmp.ipynb
diff --git a/README.md b/README.md
@@ -72,6 +72,9 @@ labels = clf.predict(X)
 
 ```
 
+**Note:** Unlike in a typical supervised setting, the performance of a zero-shot classifier greatly depends on how the label itself is structured. The label has to be expressed in natural language and be descriptive and self-explanatory. For example, in the previous semantic classification task, it could be beneficial to transform a label from `"<semantics>"` to `"the semantics of the provided text is <semantics>"`.
+
+
 ### Multi-Label Zero-Shot Text Classification
 
 With a class `MultiLabelZeroShotGPTClassifier` it is possible to perform the classification in multi-label setting, which means that each sample might be assigned to one or several distinct classes.
@@ -113,6 +116,34 @@ clf.fit(None, [candidate_labels])
 labels = clf.predict(X)
 ```
 
+### Text Vectorization
+
+As an alternative to using GPT as a classifier, it can be used solely for data preprocessing. `GPTVectorizer` embeds a chunk of text of arbitrary length into a fixed-dimensional vector that can be used with virtually any classification or regression model.
+
+Example 1: Embedding the text
+```python
+from skllm.preprocessing import GPTVectorizer
+
+model = GPTVectorizer()
+vectors = model.fit_transform(X)
+```
+
+Example 2: Combining the Vectorizer with the XGBoost Classifier in a Sklearn Pipeline
+```python
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import LabelEncoder
+from xgboost import XGBClassifier
+
+from skllm.preprocessing import GPTVectorizer
+
+le = LabelEncoder()
+y_train_encoded = le.fit_transform(y_train)
+y_test_encoded = le.transform(y_test)
+
+steps = [('GPT', GPTVectorizer()), ('Clf', XGBClassifier())]
+clf = Pipeline(steps)
+clf.fit(X_train, y_train_encoded)
+yh = clf.predict(X_test)
+```
+
 ## Roadmap 🧭
 
 - [x] Zero-Shot Classification with OpenAI GPT 3/4
@@ -121,6 +152,6 @@ labels = clf.predict(X)
     - [x] ChatGPT models
     - [ ] InstructGPT models
 - [ ] Few shot classifier
-- [ ] GPT Vectorizer
+- [x] GPT Vectorizer
 - [ ] GPT Fine-tuning (optional)
 - [ ] Integration of other LLMs
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,7 +11,7 @@ dependencies = [
 "tqdm>=4.60.0",
 ]
 name = "scikit-llm"
-version = "0.1.0b1"
+version = "0.1.0b2"
 authors = [
   { name="Oleg Kostromin", email="kostromin97@gmail.com" },
   { name="Iryna Kondrashchenko", email="iryna230520@gmail.com" },
diff --git a/skllm/models/gpt_zero_shot_clf.py b/skllm/models/gpt_zero_shot_clf.py
@@ -13,53 +13,38 @@
     extract_json_key,
 )
 from skllm.config import SKLLMConfig as _Config
+from skllm.utils import to_numpy as _to_numpy
+from skllm.openai.mixin import OpenAIMixin as _OAIMixin
 
-
-class _BaseZeroShotGPTClassifier(ABC, BaseEstimator, ClassifierMixin):
+class _BaseZeroShotGPTClassifier(ABC, BaseEstimator, ClassifierMixin, _OAIMixin):
     def __init__(
         self,
         openai_key: Optional[str] = None,
         openai_org: Optional[str] = None,
         openai_model: str = "gpt-3.5-turbo",
     ):
-        self.openai_key = openai_key
-        self.openai_org = openai_org
+        self._set_keys(openai_key, openai_org)
         self.openai_model = openai_model
 
+    def _to_np(self, X):
+        return _to_numpy(X)
+
     def fit(
         self,
         X: Optional[Union[np.ndarray, pd.Series, List[str]]],
         y: Union[np.ndarray, pd.Series, List[str], List[List[str]]],
     ):
-        if isinstance(X, np.ndarray):
-            X = np.squeeze(X)
+        X = self._to_np(X)        
         self.classes_, self.probabilities_ = self._get_unique_targets(y)
         return self
 
     def predict(self, X: Union[np.ndarray, pd.Series, List[str]]):
-        if isinstance(X, np.ndarray):
-            X = np.squeeze(X)
+        X = self._to_np(X)
         predictions = []
         for i in tqdm(range(len(X))):
             predictions.append(self._predict_single(X[i]))
         return predictions
 
-    def _get_openai_key(self):
-        key = self.openai_key
-        if key is None:
-            key = _Config.get_openai_key()
-        if key is None:
-            raise RuntimeError("OpenAI key was not found")
-        return key
-
-    def _get_openai_org(self):
-        key = self.openai_org
-        if key is None:
-            key = _Config.get_openai_org()
-        if key is None:
-            raise RuntimeError("OpenAI organization was not found")
-        return key
-
     @abstractmethod
     def _extract_labels(self, y: Any) -> List[str]:
         pass
@@ -126,8 +111,7 @@ def fit(
         X: Optional[Union[np.ndarray, pd.Series, List[str]]],
         y: Union[np.ndarray, pd.Series, List[str]],
     ):
-        if isinstance(y, np.ndarray):
-            y = np.squeeze(y)
+        y = self._to_np(y)
         return super().fit(X, y)
 
 
diff --git a/skllm/openai/chatgpt.py b/skllm/openai/chatgpt.py
@@ -1,14 +1,14 @@
 import openai
 import json
+from skllm.openai.credentials import set_credentials
 
 def construct_message(role, content):
     """Build a single chat message dictionary.

     Returns a dict with the keys "role" and "content". Raises ValueError
     when `role` is not one of "system", "user" or "assistant".
     """
     if role not in ("system", "user", "assistant"):
         raise ValueError("Invalid role")
     return {"role": role, "content": content}
 
 def get_chat_completion(messages, key, org, model="gpt-3.5-turbo", max_retries = 3):
-    openai.api_key = key
-    openai.organization = org
+    set_credentials(key, org)
     for _ in range(max_retries):
         try:
             completion = openai.ChatCompletion.create(
diff --git a/skllm/openai/credentials.py b/skllm/openai/credentials.py
@@ -0,0 +1,5 @@
+import openai
+
+def set_credentials(key: str, org: str):
+    """Store the given API key and organization on the `openai` module.
+
+    Both values are assigned to module-level attributes of `openai`, so the
+    change is visible to every later use of that module in the process.
+    """
+    openai.api_key = key
+    openai.organization = org
diff --git a/skllm/openai/embeddings.py b/skllm/openai/embeddings.py
@@ -0,0 +1,23 @@
+import openai
+from skllm.openai.credentials import set_credentials
+
+def get_embedding(
+    text, key: str, org: str, model="text-embedding-ada-002", max_retries=3
+):
+    """Request the embedding of `text`, retrying on failure.
+
+    Parameters
+    ----------
+    text : str
+        Text to embed; newlines are replaced by spaces before the request.
+    key : str
+        OpenAI API key, installed via `set_credentials`.
+    org : str
+        OpenAI organization, installed via `set_credentials`.
+    model : str
+        Name of the embedding model passed to `openai.Embedding.create`.
+    max_retries : int
+        Maximum number of attempts before giving up.
+
+    Returns
+    -------
+    list
+        The "embedding" entry of the first item in the response data.
+
+    Raises
+    ------
+    RuntimeError
+        If no attempt succeeded. The last captured exception is attached
+        as `__cause__` so its type and traceback are not lost.
+    """
+    set_credentials(key, org)
+    text = text.replace("\n", " ")
+    last_error = None
+    for _ in range(max_retries):
+        try:
+            emb = openai.Embedding.create(input=[text], model=model)["data"][0][
+                "embedding"
+            ]
+            if not isinstance(emb, list):
+                raise ValueError(f"Encountered unknown embedding format. Expected list, got {type(emb)}")
+            return emb
+        except Exception as e:
+            # Best-effort retry: remember the failure and try again.
+            last_error = e
+    # Keep the original message text; `error_msg` stays None if no attempt ran.
+    error_msg = None if last_error is None else str(last_error)
+    raise RuntimeError(
+        f"Could not obtain the embedding after retrying {max_retries} times. \nLast captured error: `{error_msg}`"
+    ) from last_error
diff --git a/skllm/openai/mixin.py b/skllm/openai/mixin.py
@@ -0,0 +1,25 @@
+from typing import Optional
+from skllm.config import SKLLMConfig as _Config
+
+class OpenAIMixin:
+    """Mixin that stores OpenAI credentials and resolves them on demand.
+
+    Values set on the instance take precedence; when one is None the
+    corresponding `SKLLMConfig` getter is consulted instead.
+    """
+
+    def _set_keys(self, key: Optional[str] = None, org: Optional[str] = None) -> None:
+        """Store the API key and organization on the instance."""
+        self.openai_org = org
+        self.openai_key = key
+
+    def _get_openai_key(self) -> str:
+        """Return the API key, raising RuntimeError when none is available."""
+        resolved = (
+            self.openai_key
+            if self.openai_key is not None
+            else _Config.get_openai_key()
+        )
+        if resolved is None:
+            raise RuntimeError("OpenAI key was not found")
+        return resolved
+
+    def _get_openai_org(self) -> str:
+        """Return the organization, raising RuntimeError when none is available."""
+        resolved = (
+            self.openai_org
+            if self.openai_org is not None
+            else _Config.get_openai_org()
+        )
+        if resolved is None:
+            raise RuntimeError("OpenAI organization was not found")
+        return resolved
+ 
diff --git a/skllm/preprocessing/__init__.py b/skllm/preprocessing/__init__.py
@@ -0,0 +1 @@
+from skllm.preprocessing.gpt_vectorizer import GPTVectorizer
diff --git a/skllm/preprocessing/gpt_vectorizer.py b/skllm/preprocessing/gpt_vectorizer.py
@@ -0,0 +1,39 @@
+from sklearn.base import (
+    BaseEstimator as _BaseEstimator,
+    TransformerMixin as _TransformerMixin,
+)
+from typing import Any, Optional, Union, List
+from tqdm import tqdm
+import numpy as np
+from numpy import ndarray
+import pandas as pd
+from skllm.openai.mixin import OpenAIMixin as _OAIMixin
+from skllm.openai.embeddings import get_embedding as _get_embedding
+from skllm.utils import to_numpy as _to_numpy
+
+
+class GPTVectorizer(_BaseEstimator, _TransformerMixin, _OAIMixin):
+    """Scikit-learn style transformer that embeds texts with an OpenAI model.
+
+    Parameters
+    ----------
+    openai_embedding_model : str
+        Name of the embedding model used for every embedding request.
+    openai_key : str, optional
+        OpenAI API key; when None it is resolved through `SKLLMConfig`.
+    openai_org : str, optional
+        OpenAI organization; when None it is resolved through `SKLLMConfig`.
+    """
+
+    def __init__(
+        self,
+        openai_embedding_model: str = "text-embedding-ada-002",
+        openai_key: Optional[str] = None,
+        openai_org: Optional[str] = None,
+    ):
+        self.openai_embedding_model = openai_embedding_model
+        self._set_keys(openai_key, openai_org)
+
+    def fit(self, X: Any = None, y: Any = None, **kwargs):
+        """Do nothing and return `self`; present for pipeline compatibility."""
+        return self
+
+    def transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]]) -> ndarray:
+        """Embed every text in `X` and return the embeddings as an array.
+
+        One embedding request is made per element of `X`, using the model
+        configured in `openai_embedding_model`.
+        """
+        X = _to_numpy(X)
+        embeddings = []
+        for i in tqdm(range(len(X))):
+            embeddings.append(
+                _get_embedding(
+                    X[i],
+                    self._get_openai_key(),
+                    self._get_openai_org(),
+                    # Forward the configured model; it was previously ignored.
+                    model=self.openai_embedding_model,
+                )
+            )
+        embeddings = np.asarray(embeddings)
+        return embeddings
+
+    def fit_transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]], y=None, **fit_params) -> ndarray:
+        """Fit (a no-op) and then transform `X`."""
+        return self.fit(X, y).transform(X)
diff --git a/skllm/utils.py b/skllm/utils.py
@@ -0,0 +1,11 @@
+import numpy as np 
+import pandas as pd
+
+def to_numpy(X):
+    """Convert a list or pandas Series to a squeezed NumPy array.
+
+    Lists and Series are converted to arrays, and any array then has its
+    singleton dimensions removed. At least one dimension is always kept,
+    so a single-element input such as ["text"] still supports `len()` and
+    indexing. Values of any other type (including None) are returned
+    unchanged.
+    """
+    if isinstance(X, pd.Series):
+        X = X.to_numpy()
+    elif isinstance(X, list):
+        X = np.asarray(X)
+    if isinstance(X, np.ndarray):
+        # squeeze() alone collapses a one-element array to 0-d.
+        X = np.atleast_1d(np.squeeze(X))
+    return X

Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@ dependencies = [`
`11`	`11`	`"tqdm>=4.60.0",`
`12`	`12`	`]`
`13`	`13`	`name = "scikit-llm"`
`14`		`-version = "0.1.0b1"`
	`14`	`+version = "0.1.0b2"`
`15`	`15`	`authors = [`
`16`	`16`	`{ name="Oleg Kostromin", email="kostromin97@gmail.com" },`
`17`	`17`	`{ name="Iryna Kondrashchenko", email="iryna230520@gmail.com" },`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from skllm.preprocessing.gpt_vectorizer import GPTVectorizer`