From 6cc3a46c8dd43a913612b9c839a51936a2c6bd41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C5=91rinc=20Pap?= <1841944+paplorinc@users.noreply.github.com>
Date: Fri, 9 Feb 2024 03:17:22 +0100
Subject: [PATCH] Optimize regular expressions used for splitting by ~20% (#234)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

By combining the contractions into a single non-capturing group prefixed
by `'`, we can speed up matches by roughly 20%.

By using possessive quantifiers in the `cl100k_base` word and punctuation
groups, we avoid some backtracking.

The trailing whitespace group in `cl100k_base` can also be simplified to
match a single newline explicitly, since the preceding `\s*` already
consumes any additional whitespace.

Overall, the regexes match exactly the same sequences of characters as
before, for any casing and for Unicode sequences.

Co-authored-by: Lőrinc
---
 tiktoken_ext/openai_public.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py
index 4e847b0e..330ecabb 100644
--- a/tiktoken_ext/openai_public.py
+++ b/tiktoken_ext/openai_public.py
@@ -17,7 +17,10 @@ def gpt2():
     return {
         "name": "gpt2",
         "explicit_n_vocab": 50257,
-        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        # The pattern in the original GPT-2 release is:
+        # r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+        # This is equivalent, but executes faster:
+        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
         "mergeable_ranks": mergeable_ranks,
         "special_tokens": {ENDOFTEXT: 50256},
     }
@@ -31,7 +34,7 @@ def r50k_base():
     return {
         "name": "r50k_base",
         "explicit_n_vocab": 50257,
-        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
         "mergeable_ranks": mergeable_ranks,
         "special_tokens": {ENDOFTEXT: 50256},
     }
@@ -45,7 +48,7 @@ def p50k_base():
     return {
         "name": "p50k_base",
         "explicit_n_vocab": 50281,
-        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
         "mergeable_ranks": mergeable_ranks,
         "special_tokens": {ENDOFTEXT: 50256},
     }
@@ -59,7 +62,7 @@ def p50k_edit():
     special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
     return {
         "name": "p50k_edit",
-        "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
         "mergeable_ranks": mergeable_ranks,
         "special_tokens": special_tokens,
     }
@@ -79,7 +82,7 @@ def cl100k_base():
     }
     return {
         "name": "cl100k_base",
-        "pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
+        "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""",
         "mergeable_ranks": mergeable_ranks,
         "special_tokens": special_tokens,
     }
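
Not part of the patch: a minimal sketch, assuming the third-party `regex`
module and an arbitrary sample text, that checks the old and new
gpt2/r50k/p50k patterns split input identically and gives a rough feel for
the speedup. tiktoken itself evaluates these patterns in its Rust core, so
the Python timings here are only indicative.

import timeit

import regex  # third-party module; supports \p{..} and possessive quantifiers

# Old and new gpt2/r50k/p50k split patterns, copied from the diff above.
OLD_PAT = r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
NEW_PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

old_re = regex.compile(OLD_PAT)
new_re = regex.compile(NEW_PAT)

# Arbitrary sample text (an assumption for illustration), repeated so the
# timing is measurable.
text = "I'll we've they'd it's  you're\nnaïve café 123 4567 -- hello!!\n" * 1_000

# Both patterns should split the text into exactly the same pieces.
assert old_re.findall(text) == new_re.findall(text)

for name, pattern in (("old", old_re), ("new", new_re)):
    secs = timeit.timeit(lambda p=pattern: p.findall(text), number=50)
    print(f"{name}: {secs:.3f}s")

The same kind of check works for the `cl100k_base` pattern as well, since
the `regex` module also understands the possessive `?+` and `++`
quantifiers used there.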