diff --git a/CHANGELOG.md b/CHANGELOG.md
index 114ffff8..27904210 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,14 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.3.0]
+- Improve performance by 5-20%; thank you to @nistath!
+- Add `gpt-3.5-turbo` models to `encoding_for_model`
+- Add prefix matching to `encoding_for_model` to better support future model versions
+- Fix a bug in the README instructions on extending tiktoken
+- Update the set of available encodings
+- Add packaging metadata
+
 ## [v0.2.0]
 - Add `tiktoken.encoding_for_model` to get the encoding for a specific model
 - Improve portability of caching logic
diff --git a/Cargo.toml b/Cargo.toml
index 1fb806bc..40a72b94 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.2.0"
+version = "0.3.0"
 edition = "2021"
 rust-version = "1.57.0"
 
diff --git a/README.md b/README.md
index d707e881..6a5c5f25 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ Example code using `tiktoken` can be found in the
 ![image](./perf.svg)
 
 Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2TokenizerFast` from
-`tokenizers==0.13.2` and `transformers==4.24.0`.
+`tokenizers==0.13.2`, `transformers==4.24.0` and `tiktoken==0.2.0`.
 
 
 ## Getting help
diff --git a/pyproject.toml b/pyproject.toml
index 771b72ac..791e3c7a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "tiktoken"
-version = "0.2.0"
+version = "0.3.0"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
 license = {file = "LICENSE"}
diff --git a/tests/test_simple_public.py b/tests/test_simple_public.py
index ce10da46..a7d70b50 100644
--- a/tests/test_simple_public.py
+++ b/tests/test_simple_public.py
@@ -26,3 +26,5 @@ def test_encoding_for_model():
     assert enc.name == "p50k_base"
     enc = tiktoken.encoding_for_model("text-davinci-edit-001")
     assert enc.name == "p50k_edit"
+    enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
+    assert enc.name == "cl100k_base"
diff --git a/tiktoken/model.py b/tiktoken/model.py
index 66e9e046..33da3901 100644
--- a/tiktoken/model.py
+++ b/tiktoken/model.py
@@ -3,8 +3,15 @@
 from .core import Encoding
 from .registry import get_encoding
 
-# TODO: this will likely be replaced by an API endpoint
+# TODO: these will likely be replaced by an API endpoint
+MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
+    # chat
+    "gpt-3.5-turbo-": "cl100k_base"  # e.g., gpt-3.5-turbo-0301, -0401, etc.
+}
+
 MODEL_TO_ENCODING: dict[str, str] = {
+    # chat
+    "gpt-3.5-turbo": "cl100k_base",
     # text
     "text-davinci-003": "p50k_base",
     "text-davinci-002": "p50k_base",
@@ -45,11 +52,22 @@
 
 
 def encoding_for_model(model_name: str) -> Encoding:
-    try:
+    """Returns the encoding used by a model."""
+    encoding_name = None
+    if model_name in MODEL_TO_ENCODING:
         encoding_name = MODEL_TO_ENCODING[model_name]
-    except KeyError:
+    else:
+        # Check if the model matches a known prefix
+        # Prefix matching avoids needing library updates for every model version release
+        # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
+        for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
+            if model_name.startswith(model_prefix):
+                return get_encoding(model_encoding_name)
+
+    if encoding_name is None:
         raise KeyError(
             f"Could not automatically map {model_name} to a tokeniser. "
             "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
         ) from None
+
     return get_encoding(encoding_name)
diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py
index 522d58fb..16a6ec50 100644
--- a/tiktoken_ext/openai_public.py
+++ b/tiktoken_ext/openai_public.py
@@ -83,6 +83,6 @@ def cl100k_base():
     "gpt2": gpt2,
     "r50k_base": r50k_base,
     "p50k_base": p50k_base,
-    "cl100k_base": cl100k_base,
     "p50k_edit": p50k_edit,
+    "cl100k_base": cl100k_base,
 }
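
For reviewers, a minimal sketch of the lookup behaviour this change introduces, mirroring the assertions in `tests/test_simple_public.py` and the comments in `tiktoken/model.py`. It assumes a build of this diff (`tiktoken==0.3.0`); `gpt-3.5-turbo-FAKE` and `unknown-model` are made-up names used only to illustrate the fallthrough paths:

    import tiktoken

    # Exact model names resolve via MODEL_TO_ENCODING.
    enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
    assert enc.name == "cl100k_base"

    # Dated releases fall through to the new MODEL_PREFIX_TO_ENCODING table,
    # so future model versions work without a library update.
    enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
    assert enc.name == "cl100k_base"

    # As the code comment notes, prefix matching also accepts model names
    # that do not actually exist (hypothetical name for illustration).
    enc = tiktoken.encoding_for_model("gpt-3.5-turbo-FAKE")
    assert enc.name == "cl100k_base"

    # Anything that matches neither table still raises KeyError.
    try:
        tiktoken.encoding_for_model("unknown-model")  # hypothetical name
    except KeyError:
        pass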