Skip to content

Commit 3d4620b

Browse files
committed
v0.1.0 beta2
1 parent 37fd03d commit 3d4620b

File tree

11 files changed

+150
-30
lines changed

11 files changed

+150
-30
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,4 @@ cython_debug/
159159
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160160
#.idea/
161161
test.py
162+
tmp.ipynb

README.md

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ labels = clf.predict(X)
7272

7373
```
7474

75+
**Note:** unlike in a typical supervised setting, the performance of a zero-shot classifier depends greatly on how the labels themselves are phrased. Each label has to be expressed in natural language and be descriptive and self-explanatory. For example, in the previous semantic classification task, it could be beneficial to transform a label from `"<semantics>"` to `"the semantics of the provided text is <semantics>"`.
76+
77+
7578
### Multi-Label Zero-Shot Text Classification
7679

7780
The `MultiLabelZeroShotGPTClassifier` class performs classification in a multi-label setting, which means that each sample might be assigned to one or several distinct classes.
@@ -113,6 +116,34 @@ clf.fit(None, [candidate_labels])
113116
labels = clf.predict(X)
114117
```
115118

119+
### Text Vectorization
120+
121+
As an alternative to using GPT as a classifier, it can be used solely for data preprocessing. `GPTVectorizer` embeds a chunk of text of arbitrary length into a fixed-dimensional vector that can be used with virtually any classification or regression model.
122+
123+
Example 1: Embedding the text
124+
```python
125+
from skllm.preprocessing import GPTVectorizer
126+
127+
model = GPTVectorizer()
128+
vectors = model.fit_transform(X)
129+
```
130+
131+
Example 2: Combining the Vectorizer with the XGBoost Classifier in a Sklearn Pipeline
132+
```python
133+
from sklearn.pipeline import Pipeline
134+
from sklearn.preprocessing import LabelEncoder
135+
from xgboost import XGBClassifier
136+
137+
le = LabelEncoder()
138+
y_train_encoded = le.fit_transform(y_train)
139+
y_test_encoded = le.transform(y_test)
140+
141+
steps = [('GPT', GPTVectorizer()), ('Clf', XGBClassifier())]
142+
clf = Pipeline(steps)
143+
clf.fit(X_train, y_train_encoded)
144+
yh = clf.predict(X_test)
145+
```
146+
116147
## Roadmap 🧭
117148

118149
- [x] Zero-Shot Classification with OpenAI GPT 3/4
@@ -121,6 +152,6 @@ labels = clf.predict(X)
121152
- [x] ChatGPT models
122153
- [ ] InstructGPT models
123154
- [ ] Few shot classifier
124-
- [ ] GPT Vectorizer
155+
- [x] GPT Vectorizer
125156
- [ ] GPT Fine-tuning (optional)
126157
- [ ] Integration of other LLMs

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ dependencies = [
1111
"tqdm>=4.60.0",
1212
]
1313
name = "scikit-llm"
14-
version = "0.1.0b1"
14+
version = "0.1.0b2"
1515
authors = [
1616
{ name="Oleg Kostromin", email="[email protected]" },
1717
{ name="Iryna Kondrashchenko", email="[email protected]" },

skllm/models/gpt_zero_shot_clf.py

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -13,53 +13,38 @@
1313
extract_json_key,
1414
)
1515
from skllm.config import SKLLMConfig as _Config
16+
from skllm.utils import to_numpy as _to_numpy
17+
from skllm.openai.mixin import OpenAIMixin as _OAIMixin
1618

17-
18-
class _BaseZeroShotGPTClassifier(ABC, BaseEstimator, ClassifierMixin):
19+
class _BaseZeroShotGPTClassifier(ABC, BaseEstimator, ClassifierMixin, _OAIMixin):
1920
def __init__(
2021
self,
2122
openai_key: Optional[str] = None,
2223
openai_org: Optional[str] = None,
2324
openai_model: str = "gpt-3.5-turbo",
2425
):
25-
self.openai_key = openai_key
26-
self.openai_org = openai_org
26+
self._set_keys(openai_key, openai_org)
2727
self.openai_model = openai_model
2828

29+
def _to_np(self, X):
30+
return _to_numpy(X)
31+
2932
def fit(
3033
self,
3134
X: Optional[Union[np.ndarray, pd.Series, List[str]]],
3235
y: Union[np.ndarray, pd.Series, List[str], List[List[str]]],
3336
):
34-
if isinstance(X, np.ndarray):
35-
X = np.squeeze(X)
37+
X = self._to_np(X)
3638
self.classes_, self.probabilities_ = self._get_unique_targets(y)
3739
return self
3840

3941
def predict(self, X: Union[np.ndarray, pd.Series, List[str]]):
40-
if isinstance(X, np.ndarray):
41-
X = np.squeeze(X)
42+
X = self._to_np(X)
4243
predictions = []
4344
for i in tqdm(range(len(X))):
4445
predictions.append(self._predict_single(X[i]))
4546
return predictions
4647

47-
def _get_openai_key(self):
48-
key = self.openai_key
49-
if key is None:
50-
key = _Config.get_openai_key()
51-
if key is None:
52-
raise RuntimeError("OpenAI key was not found")
53-
return key
54-
55-
def _get_openai_org(self):
56-
key = self.openai_org
57-
if key is None:
58-
key = _Config.get_openai_org()
59-
if key is None:
60-
raise RuntimeError("OpenAI organization was not found")
61-
return key
62-
6348
@abstractmethod
6449
def _extract_labels(self, y: Any) -> List[str]:
6550
pass
@@ -126,8 +111,7 @@ def fit(
126111
X: Optional[Union[np.ndarray, pd.Series, List[str]]],
127112
y: Union[np.ndarray, pd.Series, List[str]],
128113
):
129-
if isinstance(y, np.ndarray):
130-
y = np.squeeze(y)
114+
y = self._to_np(y)
131115
return super().fit(X, y)
132116

133117

skllm/openai/chatgpt.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
import openai
22
import json
3+
from skllm.openai.credentials import set_credentials
34

45
def construct_message(role, content):
56
if role not in ("system", "user", "assistant"):
67
raise ValueError("Invalid role")
78
return {"role": role, "content": content}
89

910
def get_chat_completion(messages, key, org, model="gpt-3.5-turbo", max_retries = 3):
10-
openai.api_key = key
11-
openai.organization = org
11+
set_credentials(key, org)
1212
for _ in range(max_retries):
1313
try:
1414
completion = openai.ChatCompletion.create(

skllm/openai/credentials.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
import openai
2+
3+
def set_credentials(key: str, org: str):
    """Apply OpenAI credentials to the global ``openai`` module state.

    Args:
        key: Value assigned to ``openai.api_key``.
        org: Value assigned to ``openai.organization``.
    """
    # Both values live on the module itself, so this affects every
    # subsequent call made through ``openai`` in this process.
    openai.organization = org
    openai.api_key = key

skllm/openai/embeddings.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import openai
2+
from skllm.openai.credentials import set_credentials
3+
4+
def get_embedding(
    text, key: str, org: str, model="text-embedding-ada-002", max_retries=3
):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: Text to embed. Newlines are replaced with spaces before the
            request is sent.
        key: OpenAI API key, applied via ``set_credentials``.
        org: OpenAI organization, applied via ``set_credentials``.
        model: Name of the embedding model passed to ``openai.Embedding.create``.
        max_retries: Number of attempts made before giving up.

    Returns:
        The ``"embedding"`` entry of the first item in the response's
        ``"data"`` field (a ``list``).

    Raises:
        RuntimeError: If no attempt produced a list embedding. The last
            captured exception is attached as ``__cause__``.
    """
    set_credentials(key, org)
    text = text.replace("\n", " ")
    last_error = None
    for _ in range(max_retries):
        try:
            emb = openai.Embedding.create(input=[text], model=model)["data"][0][
                "embedding"
            ]
            if not isinstance(emb, list):
                raise ValueError(f"Encountered unknown embedding format. Expected list, got {type(emb)}")
            return emb
        except Exception as e:
            # Deliberate best-effort retry: any failure (including the format
            # check above) triggers another attempt. Keep the exception object
            # itself so the final error can be chained to it.
            last_error = e
    # Chain the last failure so its type and traceback are not lost; the
    # message text is the same as before (str() of the exception, or "None").
    raise RuntimeError(
        f"Could not obtain the embedding after retrying {max_retries} times. \nLast captured error: `{last_error}`"
    ) from last_error

skllm/openai/mixin.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
from typing import Optional
2+
from skllm.config import SKLLMConfig as _Config
3+
4+
class OpenAIMixin:
    """Mixin that stores OpenAI credentials and resolves them on demand.

    Values given explicitly take precedence; when one is ``None`` the
    corresponding ``SKLLMConfig`` getter is consulted instead.
    """

    def _set_keys(self, key: Optional[str] = None, org: Optional[str] = None) -> None:
        """Store the given key and organization on the instance.

        Args:
            key: Stored as ``self.openai_key``.
            org: Stored as ``self.openai_org``.
        """
        self.openai_org = org
        self.openai_key = key

    def _get_openai_key(self) -> str:
        """Return the instance's OpenAI key, falling back to the config.

        Raises:
            RuntimeError: If neither the instance nor the config has a key.
        """
        explicit = self.openai_key
        resolved = _Config.get_openai_key() if explicit is None else explicit
        if resolved is None:
            raise RuntimeError("OpenAI key was not found")
        return resolved

    def _get_openai_org(self) -> str:
        """Return the instance's OpenAI organization, falling back to the config.

        Raises:
            RuntimeError: If neither the instance nor the config has an
                organization.
        """
        explicit = self.openai_org
        resolved = _Config.get_openai_org() if explicit is None else explicit
        if resolved is None:
            raise RuntimeError("OpenAI organization was not found")
        return resolved
25+

skllm/preprocessing/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from skllm.preprocessing.gpt_vectorizer import GPTVectorizer
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
from sklearn.base import (
2+
BaseEstimator as _BaseEstimator,
3+
TransformerMixin as _TransformerMixin,
4+
)
5+
from typing import Any, Optional, Union, List
6+
from tqdm import tqdm
7+
import numpy as np
8+
from numpy import ndarray
9+
import pandas as pd
10+
from skllm.openai.mixin import OpenAIMixin as _OAIMixin
11+
from skllm.openai.embeddings import get_embedding as _get_embedding
12+
from skllm.utils import to_numpy as _to_numpy
13+
14+
15+
class GPTVectorizer(_BaseEstimator, _TransformerMixin, _OAIMixin):
    """Scikit-learn style transformer that embeds texts with an OpenAI model.

    Args:
        openai_embedding_model: Name of the embedding model used by
            ``transform``.
        openai_key: Optional OpenAI API key; stored via ``_set_keys``.
        openai_org: Optional OpenAI organization; stored via ``_set_keys``.
    """

    def __init__(
        self,
        openai_embedding_model: str = "text-embedding-ada-002",
        openai_key: Optional[str] = None,
        openai_org: Optional[str] = None,
    ):
        self.openai_embedding_model = openai_embedding_model
        self._set_keys(openai_key, openai_org)

    def fit(self, X: Any = None, y: Any = None, **kwargs):
        """Do nothing and return ``self``; present for pipeline compatibility."""
        return self

    def transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]]) -> ndarray:
        """Embed every element of ``X`` and return the embeddings as an array.

        Args:
            X: Texts to embed; converted with ``to_numpy`` before iteration.

        Returns:
            ``np.asarray`` of the per-element embeddings, in input order
            (presumably shaped ``(n_samples, embedding_dim)`` - confirm
            against the embedding model in use).
        """
        X = _to_numpy(X)
        embeddings = []
        for i in tqdm(range(len(X))):
            embeddings.append(
                _get_embedding(
                    X[i],
                    self._get_openai_key(),
                    self._get_openai_org(),
                    # Forward the configured model; without this the
                    # constructor argument was ignored and the helper's
                    # default model was always used.
                    model=self.openai_embedding_model,
                )
            )
        embeddings = np.asarray(embeddings)
        return embeddings

    def fit_transform(self, X: Optional[Union[np.ndarray, pd.Series, List[str]]], y=None, **fit_params) -> ndarray:
        """Call ``fit`` and then ``transform`` on ``X``; ``fit_params`` are unused."""
        return self.fit(X, y).transform(X)

0 commit comments

Comments
 (0)