Merge pull request #1001 from xxyzz/th
[th] improve linkage, sound, alt form, translation section code
xxyzz authored Jan 21, 2025
2 parents c47ebf4 + 3649fb6 commit 0c0c1f1
Showing 11 changed files with 251 additions and 15 deletions.
20 changes: 18 additions & 2 deletions src/wiktextract/extractor/th/alt_form.py
@@ -17,6 +17,11 @@ def extract_alt_form_section(
and node.template_name == "alt"
):
extract_alt_template(wxr, word_entry, node)
elif isinstance(node, TemplateNode) and node.template_name in [
"l",
"link",
]:
extract_l_template(wxr, word_entry, node)

for t_node in level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name == "lo-alt":
@@ -52,8 +57,9 @@ def extract_alt_expanded_nodes(
span_lang = span_tag.attrs.get("lang", "")
if span_lang == lang_code:
form = Form(form=clean_node(wxr, None, span_tag), raw_tags=raw_tags)
translate_raw_tags(form)
word_entry.forms.append(form)
if form.form != "":
translate_raw_tags(form)
word_entry.forms.append(form)
elif span_lang.endswith("-Latn") and len(word_entry.forms) > 0:
word_entry.forms[-1].roman = clean_node(wxr, None, span_tag)

@@ -69,3 +75,13 @@ def extract_lo_alt_template(
for list_node in expanded_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
extract_alt_expanded_nodes(wxr, word_entry, list_item, "lo")


def extract_l_template(
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
form = Form(
form=clean_node(wxr, None, t_node.template_parameters.get(2, ""))
)
if form.form != "":
word_entry.forms.append(form)
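
A minimal, self-contained sketch of what the new `extract_l_template` pulls out of an `{{l|...}}` / `{{link|...}}` call: the second positional parameter becomes the alternative form, and empty results are dropped. The real extractor reads `template_parameters.get(2, "")` from a parsed `TemplateNode`; the regex helper below (`second_positional_param`, an invented name) only illustrates the same idea on raw wikitext.

```python
import re

def second_positional_param(wikitext: str) -> str:
    # Illustration only: the extractor works on a parsed TemplateNode,
    # not on raw wikitext.
    m = re.fullmatch(r"\{\{(?:l|link)\|([^|{}]*)\|([^|{}]*)\}\}", wikitext.strip())
    return m.group(2).strip() if m else ""

print(second_positional_param("{{l|nyw|เข้า}}"))  # เข้า (non-empty, a Form is added)
print(second_positional_param("{{l|nyw|}}"))      # ""   (empty, no Form is added)
```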
16 changes: 11 additions & 5 deletions src/wiktextract/extractor/th/linkage.py
@@ -46,15 +46,21 @@ def extract_col_template(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
for li_tag in expanded_node.find_html_recursively("li"):
l_data = Linkage(word="", source=source)
l_data = []
for span_tag in li_tag.find_html("span"):
span_class = span_tag.attrs.get("class", "")
if "Latn" in span_class:
l_data.roman = clean_node(wxr, None, span_tag)
for data in l_data:
data.roman = clean_node(wxr, None, span_tag)
elif "lang" in span_tag.attrs:
l_data.word = clean_node(wxr, None, span_tag)
if l_data.word != "":
getattr(word_entry, linkage_type).append(l_data)
word = clean_node(wxr, None, span_tag)
if word != "":
l_data.append(Linkage(word=word, source=source))
if span_class == "Hant":
l_data[-1].tags.append("Traditional Chinese")
elif span_class == "Hans":
l_data[-1].tags.append("Simplified Chinese")
getattr(word_entry, linkage_type).extend(l_data)


def extract_linkage_lite_item(
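A rough sketch of the reworked col-template handling, with a simplified stand-in for the `Linkage` model: each language span in an `<li>` now becomes its own entry (so a traditional/simplified Chinese pair is no longer collapsed into one word), `Hant`/`Hans` classes turn into script tags, and a trailing romanization span is applied to every entry collected so far. The `split_li` helper and its token format are invented for the illustration.

```python
from dataclasses import dataclass, field

@dataclass
class Linkage:  # simplified stand-in for the real model
    word: str
    tags: list[str] = field(default_factory=list)
    roman: str = ""

def split_li(spans: list[tuple[str, str]]) -> list[Linkage]:
    """spans: (css_class, text) pairs for the language and romanization
    spans of one <li>, in document order."""
    items: list[Linkage] = []
    for css, text in spans:
        if "Latn" in css:
            # romanization span: applies to every word already collected
            for item in items:
                item.roman = text
        elif text != "":
            item = Linkage(word=text)
            if css == "Hant":
                item.tags.append("Traditional Chinese")
            elif css == "Hans":
                item.tags.append("Simplified Chinese")
            items.append(item)
    return items

print(split_li([("Hant", "電腦遊戲"), ("Hans", "电脑游戏"), ("tr Latn", "diànnǎo yóuxì")]))
```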
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/th/models.py
@@ -97,6 +97,7 @@ class Sound(ThaiBaseModel):
homophone: str = ""
other: str = ""
roman: str = ""
rhymes: str = ""


class WordEntry(ThaiBaseModel):
@@ -126,3 +127,4 @@ class WordEntry(ThaiBaseModel):
idioms: list[Linkage] = []
coordinate_terms: list[Linkage] = []
sounds: list[Sound] = []
hyphenation: list[str] = []
12 changes: 11 additions & 1 deletion src/wiktextract/extractor/th/page.py
@@ -51,7 +51,13 @@ def parse_section(
extract_sound_section(wxr, base_data, level_node)
elif title_text == "รูปแบบอื่น":
extract_alt_form_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr,
page_data[-1]
if len(page_data) > 0
and page_data[-1].lang_code == base_data.lang_code
and page_data[-1].pos == base_data.pos
else base_data,
level_node,
)
elif title_text == "การใช้":
extract_note_section(
@@ -69,6 +75,10 @@ def parse_page(
) -> list[dict[str, Any]]:
# page layout
# https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน

# skip translation pages
if page_title.endswith("/คำแปลภาษาอื่น"):
return []
wxr.wtp.start_page(page_title)
tree = wxr.wtp.parse(page_text, pre_expand=True)
page_data: list[WordEntry] = []
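The reworked dispatch only attaches รูปแบบอื่น (alternative forms) data to the last parsed entry when that entry belongs to the same language section and part of speech; otherwise it falls back to `base_data` (which is why `extract_pos_section` below now also records the POS on `base_data`). A small sketch of that selection, with `SimpleNamespace` standing in for `WordEntry` and `pick_target` being an invented helper name:

```python
from types import SimpleNamespace

def pick_target(page_data, base_data):
    # attach to the last entry only if it comes from the same language
    # section and POS as the section currently being parsed
    if (
        len(page_data) > 0
        and page_data[-1].lang_code == base_data.lang_code
        and page_data[-1].pos == base_data.pos
    ):
        return page_data[-1]
    return base_data

thai_verb = SimpleNamespace(lang_code="th", pos="verb")
nyaw_base = SimpleNamespace(lang_code="nyw", pos="unknown")
print(pick_target([thai_verb], nyaw_base) is nyaw_base)  # True: different language section
```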
1 change: 1 addition & 0 deletions src/wiktextract/extractor/th/pos.py
@@ -26,6 +26,7 @@ def extract_pos_section(
page_data[-1].pos_title = pos_title
pos_data = POS_DATA[pos_title]
page_data[-1].pos = pos_data["pos"]
base_data.pos = pos_data["pos"]
page_data[-1].tags.extend(pos_data.get("tags", []))

gloss_list_index = len(level_node.children)
58 changes: 57 additions & 1 deletion src/wiktextract/extractor/th/sound.py
@@ -1,7 +1,13 @@
import re
from dataclasses import dataclass

from wikitextprocessor import LevelNode, NodeKind, TemplateNode
from wikitextprocessor import (
HTMLNode,
LevelNode,
NodeKind,
TemplateNode,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
@@ -18,6 +24,8 @@ def extract_sound_section(
for t_node in level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name == "th-pron":
extract_th_pron_template(wxr, base_data, t_node)
elif t_node.template_name == "lo-pron":
extract_lo_pron_template(wxr, base_data, t_node)


@dataclass
@@ -86,3 +94,51 @@ def extract_th_pron_template(
base_data.sounds.append(sound)

clean_node(wxr, base_data, expanded_node)


def extract_lo_pron_template(
wxr: WiktextractContext,
base_data: WordEntry,
t_node: TemplateNode,
) -> None:
# https://th.wiktionary.org/wiki/แม่แบบ:lo-pron
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
for list_node in expanded_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
field = "other"
raw_tag = ""
for node in list_item.children:
if isinstance(node, HTMLNode) and node.tag == "span":
span_class = node.attrs.get("class", "")
if "qualifier-content" in span_class:
raw_tag = clean_node(wxr, None, node)
elif span_class == "IPA":
ipa = clean_node(wxr, None, node)
if ipa != "":
sound = Sound(ipa=ipa)
if raw_tag != "":
sound.raw_tags.append(raw_tag)
translate_raw_tags(sound)
base_data.sounds.append(sound)
else:
span_lang = node.attrs.get("lang", "")
if span_lang == "lo" and field == "hyphenation":
span_str = clean_node(wxr, None, node)
if span_str != "":
base_data.hyphenation.append(span_str)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
link_str = clean_node(wxr, None, node)
if link_str == "สัทอักษรสากล":
field = "ipa"
elif link_str != "" and field == "rhymes":
base_data.sounds.append(Sound(rhymes=link_str))
elif isinstance(node, str) and node.strip().endswith(":"):
node = node.strip()
if node == "การแบ่งพยางค์:":
field = "hyphenation"
elif node == "สัมผัส:":
field = "rhymes"

clean_node(wxr, base_data, expanded_node)
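
In outline, `extract_lo_pron_template` walks each expanded list item and switches a `field` marker whenever it meets a plain-text label ending in a colon; IPA spans become `Sound` objects (with any accent qualifier as a raw tag), Lao spans after "การแบ่งพยางค์:" ("syllabification") feed the new `hyphenation` list, and links after "สัมผัส:" ("rhymes") become `Sound(rhymes=...)` entries. A simplified, self-contained model of that state machine; the token format, `parse_lo_pron`, and the `Sound` stand-in are invented for the sketch:

```python
from dataclasses import dataclass, field

@dataclass
class Sound:  # simplified stand-in for the real model
    ipa: str = ""
    rhymes: str = ""
    raw_tags: list[str] = field(default_factory=list)

LABELS = {"การแบ่งพยางค์:": "hyphenation", "สัมผัส:": "rhymes"}

def parse_lo_pron(lines):
    """lines: one list of (kind, text) tokens per list item, in order."""
    sounds, hyphenation = [], []
    for line in lines:
        current, raw_tag = "other", ""
        for kind, text in line:
            if kind == "qualifier":          # accent label, e.g. เวียงจันทน์
                raw_tag = text
            elif kind == "ipa":
                sounds.append(Sound(ipa=text, raw_tags=[raw_tag] if raw_tag else []))
            elif kind == "label":            # plain text ending in ":"
                current = LABELS.get(text, "other")
            elif kind == "lao" and current == "hyphenation":
                hyphenation.append(text)
            elif kind == "link" and current == "rhymes":
                sounds.append(Sound(rhymes=text))
    return sounds, hyphenation

sounds, hyph = parse_lo_pron([
    [("qualifier", "เวียงจันทน์"), ("ipa", "[tʰaj˧˥]")],
    [("label", "การแบ่งพยางค์:"), ("lao", "ໄທ")],
    [("label", "สัมผัส:"), ("link", "-aj")],
])
print(hyph)                                    # ['ໄທ']
print([s.rhymes for s in sounds if s.rhymes])  # ['-aj']
```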
38 changes: 35 additions & 3 deletions src/wiktextract/extractor/th/translation.py
@@ -34,12 +34,16 @@ def extract_translation_list_item(
clean_node(wxr, None, list_item.children[:index])
+ node[: node.index(":")].strip()
)
lang_code = name_to_code(lang_name, "th")
if lang_code == "":
lang_code = "unknown"
if lang_name == "":
lang_name = "unknown"
if lang_name != "unknown":
lang_code = name_to_code(lang_name, "th")
if lang_code == "":
lang_code = "unknown"
elif isinstance(node, TemplateNode) and node.template_name in [
"t",
"t+",
"t-simple",
]:
extract_t_template(wxr, word_entry, node, lang_name, sense)
elif (
@@ -62,6 +66,11 @@
extract_translation_list_item(
wxr, word_entry, child_list_item, sense
)
elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
for link_node in node.find_child(NodeKind.LINK):
link_str = clean_node(wxr, None, link_node)
if link_str.endswith("/คำแปลภาษาอื่น"):
extract_translation_page(wxr, word_entry, link_str)


def extract_t_template(
@@ -99,3 +108,26 @@ def extract_t_template(
word_entry.translations.append(tr_data)
for link_node in expanded_node.find_child(NodeKind.LINK):
clean_node(wxr, word_entry, link_node)


def extract_translation_page(
wxr: WiktextractContext,
word_entry: WordEntry,
page_title: str,
) -> None:
page = wxr.wtp.get_page(page_title, 0)
if page is None or page.body is None:
return
root = wxr.wtp.parse(page.body)
for level2_node in root.find_child(NodeKind.LEVEL2):
lang_name = clean_node(wxr, None, level2_node.largs).removeprefix(
"ภาษา"
)
if lang_name != word_entry.lang:
continue
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
pos_title = clean_node(wxr, None, level3_node.largs)
if pos_title != word_entry.pos_title:
continue
for tr_level_node in level3_node.find_child(NodeKind.LEVEL4):
extract_translation_section(wxr, word_entry, tr_level_node)
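
Two small conventions drive the new subpage handling: a translation page is recognized by the "/คำแปลภาษาอื่น" suffix of a link inside an italicized "see also" line, and its level-2 headings read "ภาษา<language>", so the prefix is stripped before comparing against `word_entry.lang`. Two throwaway helpers (names and example titles invented) just to make those checks concrete:

```python
TRANSLATION_SUBPAGE_SUFFIX = "/คำแปลภาษาอื่น"  # "translations into other languages"

def is_translation_subpage(title: str) -> bool:
    return title.endswith(TRANSLATION_SUBPAGE_SUFFIX)

def heading_language(heading: str) -> str:
    # level-2 headings on the subpage read "ภาษา<language name>"
    return heading.removeprefix("ภาษา")

print(is_translation_subpage("ข้าว/คำแปลภาษาอื่น"))  # True
print(heading_language("ภาษาลาว"))                    # ลาว
```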
6 changes: 3 additions & 3 deletions src/wiktextract/wiktionary.py
@@ -301,7 +301,7 @@ def check_str_fields(
"mandatory field): {}".format(
field,
"" if empty_ok else " non-empty",
json.dumps(item, sort_keys=True),
json.dumps(item, sort_keys=True, ensure_ascii=False),
),
)
continue
@@ -315,7 +315,7 @@ def check_str_fields(
"{!r} should be a{} string: {}".format(
field,
"" if empty_ok else " non-empty",
json.dumps(item, sort_keys=True),
json.dumps(item, sort_keys=True, ensure_ascii=False),
),
)
if not v and not empty_ok:
@@ -326,7 +326,7 @@
lang,
pos,
"{!r} should contain a non-empty string: {}".format(
field, json.dumps(item, sort_keys=True)
field, json.dumps(item, sort_keys=True, ensure_ascii=False)
),
)

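The `wiktionary.py` change is cosmetic but worth a note: without `ensure_ascii=False`, `json.dumps` escapes every non-ASCII character, which makes Thai text in the validation error messages unreadable.

```python
import json

item = {"word": "ไทย"}
print(json.dumps(item, sort_keys=True))                      # {"word": "\u0e44\u0e17\u0e22"}
print(json.dumps(item, sort_keys=True, ensure_ascii=False))  # {"word": "ไทย"}
```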
46 changes: 46 additions & 0 deletions tests/test_th_gloss.py
@@ -258,3 +258,49 @@ def test_alt_form_template(self):
"tags": ["form-of"],
},
)

def test_alt_form_second_language_section(self):
self.wxr.wtp.add_page(
"แม่แบบ:alt",
10,
"""(''เลิกใช้'') <span class="Thai" lang="th">[[เดอร#ภาษาไทย|เดอร]]</span>, <span class="Thai" lang="th">[[เดิร#ภาษาไทย|เดิร]]</span>""",
)
page_data = parse_page(
self.wxr,
"ข้าว",
"""== ภาษาไทย ==
=== คำกริยา ===
# [[ชื่อ]]
== ภาษาญ้อ ==
=== รูปแบบอื่น ===
* {{l|nyw|เข้า}}
=== คำนาม ===
# [[ข้าว]]""",
)
self.assertTrue("forms" not in page_data[0])
self.assertEqual(page_data[1]["forms"], [{"form": "เข้า"}])

def test_alt_form_after_pos(self):
self.wxr.wtp.add_page(
"แม่แบบ:lo-alt",
10,
"""* (''ล้าสมัย'') <span class="Laoo" lang="lo">[[ໄທຍ໌#ภาษาลาว|ໄທຍ໌]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span lang="lo-Latn" class="tr Latn">ไทย์</span><span class="mention-gloss-paren annotation-paren">)</span>""",
)
page_data = parse_page(
self.wxr,
"ໄທ",
"""== ภาษาลาว ==
=== คำนาม ===
# [[ไทย]]
=== คำวิสามานยนาม ===
# [[ไทย]]
==== รูปแบบอื่น ====
{{lo-alt|d=ໄທຍ}}""",
)
self.assertTrue("forms" not in page_data[0])
self.assertEqual(
page_data[1]["forms"],
[{"form": "ໄທຍ໌", "raw_tags": ["ล้าสมัย"], "roman": "ไทย์"}],
)
31 changes: 31 additions & 0 deletions tests/test_th_linkage.py
@@ -97,3 +97,34 @@ def test_syn_template(self):
page_data[0]["synonyms"],
[{"word": "ทีวี"}, {"word": "โทรภาพ"}],
)

def test_col3_zh_pinyin(self):
self.wxr.wtp.add_page(
"แม่แบบ:col3",
10,
"""<div><div><ul><li><span class="Hant" lang="zh">[[電腦遊戲#ภาษาจีน|電腦遊戲]]</span><span class="Zsym mention">&nbsp;/ </span><span class="Hans" lang="zh">[[电脑游戏#ภาษาจีน|电脑游戏]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span lang="zh-Latn" class="tr Latn">diànnǎo yóuxì</span><span class="mention-gloss-paren annotation-paren">)</span></li></ul></div></div>""",
)
page_data = parse_page(
self.wxr,
"電腦",
"""== ภาษาจีน ==
=== คำนาม ===
# [[คอมพิวเตอร์]]
==== ลูกคำ ====
{{col3|zh|電腦遊戲}}""",
)
self.assertEqual(
page_data[0]["derived"],
[
{
"word": "電腦遊戲",
"roman": "diànnǎo yóuxì",
"tags": ["Traditional Chinese"],
},
{
"word": "电脑游戏",
"roman": "diànnǎo yóuxì",
"tags": ["Simplified Chinese"],
},
],
)
36 changes: 36 additions & 0 deletions tests/test_th_sound.py
@@ -68,3 +68,39 @@ def test_th_pron(self):
"ศัพท์ภาษาไทยที่มี 1 พยางค์",
],
)

def test_lo_pron(self):
self.wxr.wtp.add_page(
"แม่แบบ:lo-pron",
10,
"""* <span class="ib-brac qualifier-brac">(</span><span class="ib-content qualifier-content"><span class="usage-label-accent">เวียงจันทน์</span></span><span class="ib-brac qualifier-brac">)</span> [[วิกิพจนานุกรม:สัทอักษรสากล|สัทอักษรสากล]]<sup>([[wikipedia:ระบบเสียงภาษาลาว|คำอธิบาย]])</sup>:&#32;<span class="IPA">[tʰaj˧˥]</span>[[Category:ศัพท์ภาษาลาวที่มีการออกเสียงไอพีเอ|ໄທ]][[Category:ศัพท์ภาษาลาวที่มี 1 พยางค์|ໄທ]]
* <span class="ib-brac qualifier-brac">(</span><span class="ib-content qualifier-content"><span class="usage-label-accent">หลวงพระบาง</span></span><span class="ib-brac qualifier-brac">)</span> [[วิกิพจนานุกรม:สัทอักษรสากล|สัทอักษรสากล]]<sup>([[wikipedia:ระบบเสียงภาษาลาว|คำอธิบาย]])</sup>:&#32;<span class="IPA">[tʰaj˩˨]</span>[[Category:ศัพท์ภาษาลาวที่มีการออกเสียงไอพีเอ|ໄທ]][[Category:ศัพท์ภาษาลาวที่มี 1 พยางค์|ໄທ]]
* การแบ่งพยางค์: <span class='Laoo lo-reading' lang='lo'>ໄທ</span>
* สัมผัส: [[:หมวดหมู่:สัมผัส:ภาษาลาว/aj|<span class="IPA">-aj</span>]][[Category:สัมผัส:ภาษาลาว/aj|ໄທ]]""",
)
data = parse_page(
self.wxr,
"ໄທ",
"""== ภาษาลาว ==
=== การออกเสียง ===
{{lo-pron}}
=== คำนาม ===
# [[ไทย]]""",
)
self.assertEqual(data[0]["hyphenation"], ["ໄທ"])
self.assertEqual(
data[0]["sounds"],
[
{"ipa": "[tʰaj˧˥]", "raw_tags": ["เวียงจันทน์"]},
{"ipa": "[tʰaj˩˨]", "raw_tags": ["หลวงพระบาง"]},
{"rhymes": "-aj"},
],
)
self.assertEqual(
data[0]["categories"],
[
"ศัพท์ภาษาลาวที่มีการออกเสียงไอพีเอ",
"ศัพท์ภาษาลาวที่มี 1 พยางค์",
"สัมผัส:ภาษาลาว/aj",
],
)
