Skip to content

Commit d6a2791

Browse files
authored
Merge pull request #1131 from parea-ai/update-num_tokens_from_messages
add new openai models to tokenizer
2 parents 5d3a879 + b9c77c0 commit d6a2791

File tree

2 files changed

+22
-5
lines changed

2 files changed

+22
-5
lines changed

parea/wrapper/utils.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,18 +66,18 @@ def _num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613", is_azure: bo
6666
if (
6767
model
6868
in {
69-
"gpt-3.5-turbo",
70-
"gpt-3.5-turbo-1106",
7169
"gpt-3.5-turbo-0125",
70+
"gpt-3.5-turbo-1106",
7271
"gpt-3.5-turbo-0613",
7372
"gpt-3.5-turbo-16k-0613",
7473
"gpt-4-0314",
7574
"gpt-4-32k-0314",
7675
"gpt-4-0613",
7776
"gpt-4-32k-0613",
78-
"gpt-4-turbo-preview",
7977
"gpt-4-1106-preview",
8078
"gpt-4-0125-preview",
79+
"gpt-4o-mini-2024-07-18",
80+
"gpt-4o-2024-08-06",
8181
}
8282
or is_azure
8383
):
@@ -86,6 +86,22 @@ def _num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613", is_azure: bo
8686
elif model == "gpt-3.5-turbo-0301":
8787
tokens_per_message = 4 # every message follows <|start|>{role/name}\n{content}<|end|>\n
8888
tokens_per_name = -1 # if there's a name, the role is omitted
89+
elif "gpt-3.5-turbo" in model:
90+
print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0125.")
91+
return _num_tokens_from_messages(messages, model="gpt-3.5-turbo-0125")
92+
elif model in ["gpt-4o-mini", "o1-mini", "o1-mini-2024-09-12"]:
93+
print("Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18.")
94+
return _num_tokens_from_messages(messages, model="gpt-4o-mini-2024-07-18")
95+
elif model in [
96+
"gpt-4o",
97+
"chatgpt-4o-latest",
98+
"o1-preview",
99+
"o1-preview-2024-09-12",
100+
"o1",
101+
"o1-2024-12-17",
102+
]:
103+
print("Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.")
104+
return _num_tokens_from_messages(messages, model="gpt-4o-2024-08-06")
89105
elif "gpt-4" in model:
90106
print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
91107
return _num_tokens_from_messages(messages, model="gpt-4-0613")
@@ -100,7 +116,8 @@ def _num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613", is_azure: bo
100116
for message in messages:
101117
num_tokens += tokens_per_message
102118
for key, value in message.items():
103-
num_tokens += _safe_encode(encoding, value)
119+
value_str = value if isinstance(value, str) else json.dumps(value)
120+
num_tokens += _safe_encode(encoding, value_str)
104121
if key == "name":
105122
num_tokens += tokens_per_name
106123
num_tokens += 3 # every reply is primed with <|start|>assistant<|message|>

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"
66
[tool.poetry]
77
name = "parea-ai"
88
packages = [{ include = "parea" }]
9-
version = "0.2.218"
9+
version = "0.2.219"
1010
description = "Parea python sdk"
1111
readme = "README.md"
1212
authors = ["joel-parea-ai <[email protected]>"]

0 commit comments

Comments (0)