From 87b673e6cea6abb26cdc88aa124a4ee9ec3ebef8 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 1 Dec 2023 14:34:56 +0800 Subject: [PATCH] Use language codes in `HEAD_TAG_RE` pattern This bug was introduced in PR #393; fixes #405. --- src/wiktextract/extractor/en/page.py | 18 +++++++++--------- tests/test_head.py | 12 ++++++++++++ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index b1d0ee9e..62bfcb6a 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -46,7 +46,7 @@ from .unsupported_titles import unsupported_title_map # Matches head tag -head_tag_re = None +HEAD_TAG_RE = None FLOATING_TABLE_TEMPLATES = { # az-suffix-form creates a style=floatright div that is otherwise @@ -535,14 +535,15 @@ def parse_sense_linkage(wxr, data, name, ht): data_append(data, field, dt) -def init_head_tag_re(wxr): - global head_tag_re - if head_tag_re is None: - head_tag_re = re.compile( +def init_head_tag_re(): + global HEAD_TAG_RE + if HEAD_TAG_RE is None: + HEAD_TAG_RE = re.compile( r"^(head|Han char|arabic-noun|arabic-noun-form|" r"hangul-symbol|syllable-hangul)$|" + r"^(latin|" + - "|".join(lang_name for _, lang_name in get_all_names("en")) + r")-(" + + "|".join(lang_code for lang_code, *_ in get_all_names("en")) + + r")-(" + "|".join([ "abbr", "adj", @@ -720,7 +721,7 @@ def parse_language(wxr, langnode, language, lang_code): assert isinstance(lang_code, str) # print("parse_language", language) - init_head_tag_re(wxr) + init_head_tag_re() is_reconstruction = False word = wxr.wtp.title unsupported_prefix = "Unsupported titles/" @@ -879,8 +880,7 @@ def head_post_template_fn(name, ht, expansion): data_append(pos_data, "tags", "Pinyin") elif t == "romanization": data_append(pos_data, "tags", "romanization") - m = re.search(head_tag_re, name) - if m: + if HEAD_TAG_RE.fullmatch(name) is not None: args_ht = clean_template_args(wxr, ht) cleaned_expansion 
= clean_node(wxr, None, expansion) dt = {"name": name, "args": args_ht, "expansion": cleaned_expansion} diff --git a/tests/test_head.py b/tests/test_head.py index 31056618..b402eb03 100644 --- a/tests/test_head.py +++ b/tests/test_head.py @@ -699,3 +699,15 @@ def test_head35(self): "plural" ] }) + + + def test_head_templates_regex(self): + # GitHub issue 405 + import re + + from wiktextract.extractor.en.page import init_head_tag_re + + init_head_tag_re() + from wiktextract.extractor.en.page import HEAD_TAG_RE + + self.assertTrue(HEAD_TAG_RE.fullmatch("ru-noun+") is not None)