From 446cb49affe9955eba1eb687f2eb9580c501a3e5 Mon Sep 17 00:00:00 2001
From: Shantanu Jain
Date: Thu, 16 Mar 2023 18:11:50 -0700
Subject: [PATCH] Bump version, sync codebase

---
 CHANGELOG.md      | 3 +++
 Cargo.toml        | 2 +-
 README.md         | 4 ++--
 src/lib.rs        | 2 +-
 tiktoken/model.py | 4 +++-
 5 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a7dce9d9..d0365b8a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.3.2]
+- Add encoding for GPT-4
+
 ## [v0.3.1]
 - Build aarch64 wheels
 - Make `blobfile` an optional dependency
diff --git a/Cargo.toml b/Cargo.toml
index 912af00f..07182cd4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.3.1"
+version = "0.3.2"
 edition = "2021"
 rust-version = "1.57.0"
 
diff --git a/README.md b/README.md
index 6a5c5f25..c96f1b4d 100644
--- a/README.md
+++ b/README.md
@@ -5,11 +5,11 @@ OpenAI's models.
 
 ```python
 import tiktoken
-enc = tiktoken.get_encoding("gpt2")
+enc = tiktoken.get_encoding("cl100k_base")
 assert enc.decode(enc.encode("hello world")) == "hello world"
 
 # To get the tokeniser corresponding to a specific model in the OpenAI API:
-enc = tiktoken.encoding_for_model("text-davinci-003")
+enc = tiktoken.encoding_for_model("gpt-4")
 ```
 
 The open source version of `tiktoken` can be installed from PyPI:
diff --git a/src/lib.rs b/src/lib.rs
index f391005c..70009d28 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -34,7 +34,7 @@ fn _byte_pair_merge(
         }
     };
 
-    // We look up the ranks once in the beggining and iteratively update
+    // We look up the ranks once in the beginning and iteratively update
     // them during each merge, which reduces the number of rank lookups.
     for i in 0..parts.len() - 2 {
         match get_rank(&parts, i, 0) {
diff --git a/tiktoken/model.py b/tiktoken/model.py
index 33da3901..b8af7875 100644
--- a/tiktoken/model.py
+++ b/tiktoken/model.py
@@ -6,11 +6,13 @@
 # TODO: these will likely be replaced by an API endpoint
 MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
     # chat
-    "gpt-3.5-turbo-": "cl100k_base"  # e.g, gpt-3.5-turbo-0301, -0401, etc.
+    "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
+    "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
 }
 
 MODEL_TO_ENCODING: dict[str, str] = {
     # chat
+    "gpt-4": "cl100k_base",
     "gpt-3.5-turbo": "cl100k_base",
     # text
     "text-davinci-003": "p50k_base",
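
For context, a minimal sketch of how the remapped lookups behave once this patch is applied (assumes tiktoken >= 0.3.2 is installed; the `gpt-4-0314` snapshot name is taken from the patch's own comment):

```python
import tiktoken

# MODEL_TO_ENCODING now maps the base "gpt-4" model name to cl100k_base.
enc = tiktoken.encoding_for_model("gpt-4")
assert enc.name == "cl100k_base"

# Dated snapshots such as gpt-4-0314 resolve via the new "gpt-4-" entry
# in MODEL_PREFIX_TO_ENCODING.
assert tiktoken.encoding_for_model("gpt-4-0314").name == "cl100k_base"

# The updated README example round-trips through the same encoding.
enc = tiktoken.get_encoding("cl100k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"
```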