From 60559a3aa57eb7e1ff5b4ab41a486c59aae6b30d Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 21 Jan 2025 11:24:14 +0800 Subject: [PATCH 1/5] [th] fix "col" template code for some Chinese word pages --- src/wiktextract/extractor/th/linkage.py | 16 +++++++++---- tests/test_th_linkage.py | 31 +++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/src/wiktextract/extractor/th/linkage.py b/src/wiktextract/extractor/th/linkage.py index e08070ce..9037e109 100644 --- a/src/wiktextract/extractor/th/linkage.py +++ b/src/wiktextract/extractor/th/linkage.py @@ -46,15 +46,21 @@ def extract_col_template( wxr.wtp.node_to_wikitext(t_node), expand_all=True ) for li_tag in expanded_node.find_html_recursively("li"): - l_data = Linkage(word="", source=source) + l_data = [] for span_tag in li_tag.find_html("span"): span_class = span_tag.attrs.get("class", "") if "Latn" in span_class: - l_data.roman = clean_node(wxr, None, span_tag) + for data in l_data: + data.roman = clean_node(wxr, None, span_tag) elif "lang" in span_tag.attrs: - l_data.word = clean_node(wxr, None, span_tag) - if l_data.word != "": - getattr(word_entry, linkage_type).append(l_data) + word = clean_node(wxr, None, span_tag) + if word != "": + l_data.append(Linkage(word=word, source=source)) + if span_class == "Hant": + l_data[-1].tags.append("Traditional Chinese") + elif span_class == "Hans": + l_data[-1].tags.append("Simplified Chinese") + getattr(word_entry, linkage_type).extend(l_data) def extract_linkage_lite_item( diff --git a/tests/test_th_linkage.py b/tests/test_th_linkage.py index a9c06651..782c69ad 100644 --- a/tests/test_th_linkage.py +++ b/tests/test_th_linkage.py @@ -97,3 +97,34 @@ def test_syn_template(self): page_data[0]["synonyms"], [{"word": "ทีวี"}, {"word": "โทรภาพ"}], ) + + def test_col3_zh_pinyin(self): + self.wxr.wtp.add_page( + "แม่แบบ:col3", + 10, + """
  • [[電腦遊戲#ภาษาจีน|電腦遊戲]] / [[电脑游戏#ภาษาจีน|电脑游戏]] (diànnǎo yóuxì)
""", + ) + page_data = parse_page( + self.wxr, + "電腦", + """== ภาษาจีน == +=== คำนาม === +# [[คอมพิวเตอร์]] +==== ลูกคำ ==== +{{col3|zh|電腦遊戲}}""", + ) + self.assertEqual( + page_data[0]["derived"], + [ + { + "word": "電腦遊戲", + "roman": "diànnǎo yóuxì", + "tags": ["Traditional Chinese"], + }, + { + "word": "电脑游戏", + "roman": "diànnǎo yóuxì", + "tags": ["Simplified Chinese"], + }, + ], + ) From 657201013ccb5c74467daba276dd96694c37725c Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 21 Jan 2025 12:03:13 +0800 Subject: [PATCH 2/5] [th] fix alternate forms added to wrong WordEntry bug --- src/wiktextract/extractor/th/alt_form.py | 20 ++++++++++++++++++-- src/wiktextract/extractor/th/page.py | 4 +--- tests/test_th_gloss.py | 22 ++++++++++++++++++++++ 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/src/wiktextract/extractor/th/alt_form.py b/src/wiktextract/extractor/th/alt_form.py index fff7052e..b0ea8d66 100644 --- a/src/wiktextract/extractor/th/alt_form.py +++ b/src/wiktextract/extractor/th/alt_form.py @@ -17,6 +17,11 @@ def extract_alt_form_section( and node.template_name == "alt" ): extract_alt_template(wxr, word_entry, node) + elif isinstance(node, TemplateNode) and node.template_name in [ + "l", + "link", + ]: + extract_l_template(wxr, word_entry, node) for t_node in level_node.find_child(NodeKind.TEMPLATE): if t_node.template_name == "lo-alt": @@ -52,8 +57,9 @@ def extract_alt_expanded_nodes( span_lang = span_tag.attrs.get("lang", "") if span_lang == lang_code: form = Form(form=clean_node(wxr, None, span_tag), raw_tags=raw_tags) - translate_raw_tags(form) - word_entry.forms.append(form) + if form.form != "": + translate_raw_tags(form) + word_entry.forms.append(form) elif span_lang.endswith("-Latn") and len(word_entry.forms) > 0: word_entry.forms[-1].roman = clean_node(wxr, None, span_tag) @@ -69,3 +75,13 @@ def extract_lo_alt_template( for list_node in expanded_node.find_child(NodeKind.LIST): for list_item in list_node.find_child(NodeKind.LIST_ITEM): extract_alt_expanded_nodes(wxr, word_entry, list_item, "lo") + + +def extract_l_template( + wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode +) -> None: + form = Form( + form=clean_node(wxr, None, t_node.template_parameters.get(2, "")) + ) + if form.form != "": + word_entry.forms.append(form) diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py index 0fd76816..962c80c8 100644 --- a/src/wiktextract/extractor/th/page.py +++ b/src/wiktextract/extractor/th/page.py @@ -50,9 +50,7 @@ def parse_section( elif title_text.startswith(("การออกเสียง", "การอ่านออกเสียง")): extract_sound_section(wxr, base_data, level_node) elif title_text == "รูปแบบอื่น": - extract_alt_form_section( - wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node - ) + extract_alt_form_section(wxr, base_data, level_node) elif title_text == "การใช้": extract_note_section( wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node diff --git a/tests/test_th_gloss.py b/tests/test_th_gloss.py index 1b106d01..f44dc587 100644 --- a/tests/test_th_gloss.py +++ b/tests/test_th_gloss.py @@ -258,3 +258,25 @@ def test_alt_form_template(self): "tags": ["form-of"], }, ) + + def test_alt_form_second_language_section(self): + self.wxr.wtp.add_page( + "แม่แบบ:alt", + 10, + """(''เลิกใช้'') [[เดอร#ภาษาไทย|เดอร]], [[เดิร#ภาษาไทย|เดิร]]""", + ) + page_data = parse_page( + self.wxr, + "ข้าว", + """== ภาษาไทย == +=== คำกริยา === +# [[ชื่อ]] + +== ภาษาญ้อ == +=== รูปแบบอื่น === +* {{l|nyw|เข้า}} +=== คำนาม === +# [[ข้าว]]""", + ) + self.assertTrue("forms" not in page_data[0]) + self.assertEqual(page_data[1]["forms"], [{"form": "เข้า"}]) From 9acd1a2a0396f1be64445151139708f46e43e91f Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 21 Jan 2025 15:07:55 +0800 Subject: [PATCH 3/5] [th] handle translation pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit there are just two "*/คำแปลภาษาอื่น" pages --- src/wiktextract/extractor/th/page.py | 4 +++ src/wiktextract/extractor/th/translation.py | 38 +++++++++++++++++++-- src/wiktextract/wiktionary.py | 6 ++-- 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py index 962c80c8..074434fa 100644 --- a/src/wiktextract/extractor/th/page.py +++ b/src/wiktextract/extractor/th/page.py @@ -67,6 +67,10 @@ def parse_page( ) -> list[dict[str, Any]]: # page layout # https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน + + # skip translation pages + if page_title.endswith("/คำแปลภาษาอื่น"): + return [] wxr.wtp.start_page(page_title) tree = wxr.wtp.parse(page_text, pre_expand=True) page_data: list[WordEntry] = [] diff --git a/src/wiktextract/extractor/th/translation.py b/src/wiktextract/extractor/th/translation.py index 93883fa5..51866f6d 100644 --- a/src/wiktextract/extractor/th/translation.py +++ b/src/wiktextract/extractor/th/translation.py @@ -34,12 +34,16 @@ def extract_translation_list_item( clean_node(wxr, None, list_item.children[:index]) + node[: node.index(":")].strip() ) - lang_code = name_to_code(lang_name, "th") - if lang_code == "": - lang_code = "unknown" + if lang_name == "": + lang_name = "unknown" + if lang_name != "unknown": + lang_code = name_to_code(lang_name, "th") + if lang_code == "": + lang_code = "unknown" elif isinstance(node, TemplateNode) and node.template_name in [ "t", "t+", + "t-simple", ]: extract_t_template(wxr, word_entry, node, lang_name, sense) elif ( @@ -62,6 +66,11 @@ def extract_translation_list_item( extract_translation_list_item( wxr, word_entry, child_list_item, sense ) + elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC: + for link_node in node.find_child(NodeKind.LINK): + link_str = clean_node(wxr, None, link_node) + if link_str.endswith("/คำแปลภาษาอื่น"): + extract_translation_page(wxr, word_entry, link_str) def extract_t_template( @@ -99,3 +108,26 @@ def extract_t_template( word_entry.translations.append(tr_data) for link_node in expanded_node.find_child(NodeKind.LINK): clean_node(wxr, word_entry, link_node) + + +def extract_translation_page( + wxr: WiktextractContext, + word_entry: WordEntry, + page_title: str, +) -> None: + page = wxr.wtp.get_page(page_title, 0) + if page is None or page.body is None: + return + root = wxr.wtp.parse(page.body) + for level2_node in root.find_child(NodeKind.LEVEL2): + lang_name = clean_node(wxr, None, level2_node.largs).removeprefix( + "ภาษา" + ) + if lang_name != word_entry.lang: + continue + for level3_node in level2_node.find_child(NodeKind.LEVEL3): + pos_title = clean_node(wxr, None, level3_node.largs) + if pos_title != word_entry.pos_title: + continue + for tr_level_node in level3_node.find_child(NodeKind.LEVEL4): + extract_translation_section(wxr, word_entry, tr_level_node) diff --git a/src/wiktextract/wiktionary.py b/src/wiktextract/wiktionary.py index fe1b1031..347265d0 100644 --- a/src/wiktextract/wiktionary.py +++ b/src/wiktextract/wiktionary.py @@ -301,7 +301,7 @@ def check_str_fields( "mandatory field): {}".format( field, "" if empty_ok else " non-empty", - json.dumps(item, sort_keys=True), + json.dumps(item, sort_keys=True, ensure_ascii=False), ), ) continue @@ -315,7 +315,7 @@ def check_str_fields( "{!r} should be a{} string: {}".format( field, "" if empty_ok else " non-empty", - json.dumps(item, sort_keys=True), + json.dumps(item, sort_keys=True, ensure_ascii=False), ), ) if not v and not empty_ok: @@ -326,7 +326,7 @@ def check_str_fields( lang, pos, "{!r} should contain a non-empty string: {}".format( - field, json.dumps(item, sort_keys=True) + field, json.dumps(item, sort_keys=True, ensure_ascii=False) ), ) From 2577d25fabe50d1eab5539ef958ff528c2a25422 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 21 Jan 2025 16:10:28 +0800 Subject: [PATCH 4/5] [th] handle alt forms section after pos layout --- src/wiktextract/extractor/th/page.py | 10 +++++++++- src/wiktextract/extractor/th/pos.py | 1 + tests/test_th_gloss.py | 24 ++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py index 074434fa..14393f74 100644 --- a/src/wiktextract/extractor/th/page.py +++ b/src/wiktextract/extractor/th/page.py @@ -50,7 +50,15 @@ def parse_section( elif title_text.startswith(("การออกเสียง", "การอ่านออกเสียง")): extract_sound_section(wxr, base_data, level_node) elif title_text == "รูปแบบอื่น": - extract_alt_form_section(wxr, base_data, level_node) + extract_alt_form_section( + wxr, + page_data[-1] + if len(page_data) > 0 + and page_data[-1].lang_code == base_data.lang_code + and page_data[-1].pos == base_data.pos + else base_data, + level_node, + ) elif title_text == "การใช้": extract_note_section( wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node diff --git a/src/wiktextract/extractor/th/pos.py b/src/wiktextract/extractor/th/pos.py index 5e08e9c8..ca6c3fa3 100644 --- a/src/wiktextract/extractor/th/pos.py +++ b/src/wiktextract/extractor/th/pos.py @@ -26,6 +26,7 @@ def extract_pos_section( page_data[-1].pos_title = pos_title pos_data = POS_DATA[pos_title] page_data[-1].pos = pos_data["pos"] + base_data.pos = pos_data["pos"] page_data[-1].tags.extend(pos_data.get("tags", [])) gloss_list_index = len(level_node.children) diff --git a/tests/test_th_gloss.py b/tests/test_th_gloss.py index f44dc587..c95fb919 100644 --- a/tests/test_th_gloss.py +++ b/tests/test_th_gloss.py @@ -280,3 +280,27 @@ def test_alt_form_second_language_section(self): ) self.assertTrue("forms" not in page_data[0]) self.assertEqual(page_data[1]["forms"], [{"form": "เข้า"}]) + + def test_alt_form_after_pos(self): + self.wxr.wtp.add_page( + "แม่แบบ:lo-alt", + 10, + """* (''ล้าสมัย'') [[ໄທຍ໌#ภาษาลาว|ໄທຍ໌]] (ไทย์)""", + ) + page_data = parse_page( + self.wxr, + "ໄທ", + """== ภาษาลาว == +=== คำนาม === +# [[ไทย]] + +=== คำวิสามานยนาม === +# [[ไทย]] +==== รูปแบบอื่น ==== +{{lo-alt|d=ໄທຍ}}""", + ) + self.assertTrue("forms" not in page_data[0]) + self.assertEqual( + page_data[1]["forms"], + [{"form": "ໄທຍ໌", "raw_tags": ["ล้าสมัย"], "roman": "ไทย์"}], + ) From 3649fb64f17e9d8351f466ccbdc24e25b4ad65c8 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 21 Jan 2025 16:45:24 +0800 Subject: [PATCH 5/5] [th] extract "lo-pron" sound template --- src/wiktextract/extractor/th/models.py | 2 + src/wiktextract/extractor/th/sound.py | 58 +++++++++++++++++++++++++- tests/test_th_sound.py | 36 ++++++++++++++++ 3 files changed, 95 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py index 0eef519a..e9065c51 100644 --- a/src/wiktextract/extractor/th/models.py +++ b/src/wiktextract/extractor/th/models.py @@ -97,6 +97,7 @@ class Sound(ThaiBaseModel): homophone: str = "" other: str = "" roman: str = "" + rhymes: str = "" class WordEntry(ThaiBaseModel): @@ -126,3 +127,4 @@ class WordEntry(ThaiBaseModel): idioms: list[Linkage] = [] coordinate_terms: list[Linkage] = [] sounds: list[Sound] = [] + hyphenation: list[str] = [] diff --git a/src/wiktextract/extractor/th/sound.py b/src/wiktextract/extractor/th/sound.py index 6c532909..fab693aa 100644 --- a/src/wiktextract/extractor/th/sound.py +++ b/src/wiktextract/extractor/th/sound.py @@ -1,7 +1,13 @@ import re from dataclasses import dataclass -from wikitextprocessor import LevelNode, NodeKind, TemplateNode +from wikitextprocessor import ( + HTMLNode, + LevelNode, + NodeKind, + TemplateNode, + WikiNode, +) from ...page import clean_node from ...wxr_context import WiktextractContext @@ -18,6 +24,8 @@ def extract_sound_section( for t_node in level_node.find_child(NodeKind.TEMPLATE): if t_node.template_name == "th-pron": extract_th_pron_template(wxr, base_data, t_node) + elif t_node.template_name == "lo-pron": + extract_lo_pron_template(wxr, base_data, t_node) @dataclass @@ -86,3 +94,51 @@ def extract_th_pron_template( base_data.sounds.append(sound) clean_node(wxr, base_data, expanded_node) + + +def extract_lo_pron_template( + wxr: WiktextractContext, + base_data: WordEntry, + t_node: TemplateNode, +) -> None: + # https://th.wiktionary.org/wiki/แม่แบบ:lo-pron + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(t_node), expand_all=True + ) + for list_node in expanded_node.find_child(NodeKind.LIST): + for list_item in list_node.find_child(NodeKind.LIST_ITEM): + field = "other" + raw_tag = "" + for node in list_item.children: + if isinstance(node, HTMLNode) and node.tag == "span": + span_class = node.attrs.get("class", "") + if "qualifier-content" in span_class: + raw_tag = clean_node(wxr, None, node) + elif span_class == "IPA": + ipa = clean_node(wxr, None, node) + if ipa != "": + sound = Sound(ipa=ipa) + if raw_tag != "": + sound.raw_tags.append(raw_tag) + translate_raw_tags(sound) + base_data.sounds.append(sound) + else: + span_lang = node.attrs.get("lang", "") + if span_lang == "lo" and field == "hyphenation": + span_str = clean_node(wxr, None, node) + if span_str != "": + base_data.hyphenation.append(span_str) + elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK: + link_str = clean_node(wxr, None, node) + if link_str == "สัทอักษรสากล": + field = "ipa" + elif link_str != "" and field == "rhymes": + base_data.sounds.append(Sound(rhymes=link_str)) + elif isinstance(node, str) and node.strip().endswith(":"): + node = node.strip() + if node == "การแบ่งพยางค์:": + field = "hyphenation" + elif node == "สัมผัส:": + field = "rhymes" + + clean_node(wxr, base_data, expanded_node) diff --git a/tests/test_th_sound.py b/tests/test_th_sound.py index 808826bf..969ab378 100644 --- a/tests/test_th_sound.py +++ b/tests/test_th_sound.py @@ -68,3 +68,39 @@ def test_th_pron(self): "ศัพท์ภาษาไทยที่มี 1 พยางค์", ], ) + + def test_lo_pron(self): + self.wxr.wtp.add_page( + "แม่แบบ:lo-pron", + 10, + """* (เวียงจันทน์) [[วิกิพจนานุกรม:สัทอักษรสากล|สัทอักษรสากล]]([[wikipedia:ระบบเสียงภาษาลาว|คำอธิบาย]]): [tʰaj˧˥][[Category:ศัพท์ภาษาลาวที่มีการออกเสียงไอพีเอ|ໄທ]][[Category:ศัพท์ภาษาลาวที่มี 1 พยางค์|ໄທ]] +* (หลวงพระบาง) [[วิกิพจนานุกรม:สัทอักษรสากล|สัทอักษรสากล]]([[wikipedia:ระบบเสียงภาษาลาว|คำอธิบาย]]): [tʰaj˩˨][[Category:ศัพท์ภาษาลาวที่มีการออกเสียงไอพีเอ|ໄທ]][[Category:ศัพท์ภาษาลาวที่มี 1 พยางค์|ໄທ]] +* การแบ่งพยางค์: ໄທ +* สัมผัส: [[:หมวดหมู่:สัมผัส:ภาษาลาว/aj|-aj]][[Category:สัมผัส:ภาษาลาว/aj|ໄທ]]""", + ) + data = parse_page( + self.wxr, + "ໄທ", + """== ภาษาลาว == +=== การออกเสียง === +{{lo-pron}} +=== คำนาม === +# [[ไทย]]""", + ) + self.assertEqual(data[0]["hyphenation"], ["ໄທ"]) + self.assertEqual( + data[0]["sounds"], + [ + {"ipa": "[tʰaj˧˥]", "raw_tags": ["เวียงจันทน์"]}, + {"ipa": "[tʰaj˩˨]", "raw_tags": ["หลวงพระบาง"]}, + {"rhymes": "-aj"}, + ], + ) + self.assertEqual( + data[0]["categories"], + [ + "ศัพท์ภาษาลาวที่มีการออกเสียงไอพีเอ", + "ศัพท์ภาษาลาวที่มี 1 พยางค์", + "สัมผัส:ภาษาลาว/aj", + ], + )