Merge pull request #1001 from xxyzz/th
[th] improve linkage, sound, alt form, translation section code
xxyzz authored Jan 21, 2025
2 parents c47ebf4 + 3649fb6 commit 0c0c1f1
Showing 11 changed files with 251 additions and 15 deletions.
20 changes: 18 additions & 2 deletions src/wiktextract/extractor/th/alt_form.py
@@ -17,6 +17,11 @@ def extract_alt_form_section(
and node.template_name == "alt"
):
extract_alt_template(wxr, word_entry, node)
elif isinstance(node, TemplateNode) and node.template_name in [
"l",
"link",
]:
extract_l_template(wxr, word_entry, node)

for t_node in level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name == "lo-alt":
@@ -52,8 +57,9 @@ def extract_alt_expanded_nodes(
span_lang = span_tag.attrs.get("lang", "")
if span_lang == lang_code:
form = Form(form=clean_node(wxr, None, span_tag), raw_tags=raw_tags)
translate_raw_tags(form)
word_entry.forms.append(form)
if form.form != "":
translate_raw_tags(form)
word_entry.forms.append(form)
elif span_lang.endswith("-Latn") and len(word_entry.forms) > 0:
word_entry.forms[-1].roman = clean_node(wxr, None, span_tag)

@@ -69,3 +75,13 @@ def extract_lo_alt_template(
for list_node in expanded_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
extract_alt_expanded_nodes(wxr, word_entry, list_item, "lo")


def extract_l_template(
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
form = Form(
form=clean_node(wxr, None, t_node.template_parameters.get(2, ""))
)
if form.form != "":
word_entry.forms.append(form)
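
A minimal, self-contained sketch of what the new `extract_l_template` pulls out of an `{{l|...}}` / `{{link|...}}` call: the second positional parameter becomes the alternative form, and empty results are dropped. The real extractor reads `template_parameters.get(2, "")` from a parsed `TemplateNode`; the regex helper below (`second_positional_param`, an invented name) only illustrates the same idea on raw wikitext.

```python
import re

def second_positional_param(wikitext: str) -> str:
    # Illustration only: the extractor works on a parsed TemplateNode,
    # not on raw wikitext.
    m = re.fullmatch(r"\{\{(?:l|link)\|([^|{}]*)\|([^|{}]*)\}\}", wikitext.strip())
    return m.group(2).strip() if m else ""

print(second_positional_param("{{l|nyw|เข้า}}"))  # เข้า (non-empty, a Form is added)
print(second_positional_param("{{l|nyw|}}"))      # ""   (empty, no Form is added)
```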
16 changes: 11 additions & 5 deletions src/wiktextract/extractor/th/linkage.py
@@ -46,15 +46,21 @@ def extract_col_template(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
for li_tag in expanded_node.find_html_recursively("li"):
l_data = Linkage(word="", source=source)
l_data = []
for span_tag in li_tag.find_html("span"):
span_class = span_tag.attrs.get("class", "")
if "Latn" in span_class:
l_data.roman = clean_node(wxr, None, span_tag)
for data in l_data:
data.roman = clean_node(wxr, None, span_tag)
elif "lang" in span_tag.attrs:
l_data.word = clean_node(wxr, None, span_tag)
if l_data.word != "":
getattr(word_entry, linkage_type).append(l_data)
word = clean_node(wxr, None, span_tag)
if word != "":
l_data.append(Linkage(word=word, source=source))
if span_class == "Hant":
l_data[-1].tags.append("Traditional Chinese")
elif span_class == "Hans":
l_data[-1].tags.append("Simplified Chinese")
getattr(word_entry, linkage_type).extend(l_data)


def extract_linkage_lite_item(
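A rough sketch of the reworked col-template handling, with a simplified stand-in for the `Linkage` model: each language span in an `<li>` now becomes its own entry (so a traditional/simplified Chinese pair is no longer collapsed into one word), `Hant`/`Hans` classes turn into script tags, and a trailing romanization span is applied to every entry collected so far. The `split_li` helper and its token format are invented for the illustration.

```python
from dataclasses import dataclass, field

@dataclass
class Linkage:  # simplified stand-in for the real model
    word: str
    tags: list[str] = field(default_factory=list)
    roman: str = ""

def split_li(spans: list[tuple[str, str]]) -> list[Linkage]:
    """spans: (css_class, text) pairs for the language and romanization
    spans of one <li>, in document order."""
    items: list[Linkage] = []
    for css, text in spans:
        if "Latn" in css:
            # romanization span: applies to every word already collected
            for item in items:
                item.roman = text
        elif text != "":
            item = Linkage(word=text)
            if css == "Hant":
                item.tags.append("Traditional Chinese")
            elif css == "Hans":
                item.tags.append("Simplified Chinese")
            items.append(item)
    return items

print(split_li([("Hant", "電腦遊戲"), ("Hans", "电脑游戏"), ("tr Latn", "diànnǎo yóuxì")]))
```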
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/th/models.py
@@ -97,6 +97,7 @@ class Sound(ThaiBaseModel):
homophone: str = ""
other: str = ""
roman: str = ""
rhymes: str = ""


class WordEntry(ThaiBaseModel):
@@ -126,3 +127,4 @@ class WordEntry(ThaiBaseModel):
idioms: list[Linkage] = []
coordinate_terms: list[Linkage] = []
sounds: list[Sound] = []
hyphenation: list[str] = []
12 changes: 11 additions & 1 deletion src/wiktextract/extractor/th/page.py
@@ -51,7 +51,13 @@ def parse_section(
extract_sound_section(wxr, base_data, level_node)
elif title_text == "รูปแบบอื่น":
extract_alt_form_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr,
page_data[-1]
if len(page_data) > 0
and page_data[-1].lang_code == base_data.lang_code
and page_data[-1].pos == base_data.pos
else base_data,
level_node,
)
elif title_text == "การใช้":
extract_note_section(
@@ -69,6 +75,10 @@ def parse_page(
) -> list[dict[str, Any]]:
# page layout
# https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน

# skip translation pages
if page_title.endswith("/คำแปลภาษาอื่น"):
return []
wxr.wtp.start_page(page_title)
tree = wxr.wtp.parse(page_text, pre_expand=True)
page_data: list[WordEntry] = []
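The reworked dispatch only attaches รูปแบบอื่น (alternative forms) data to the last parsed entry when that entry belongs to the same language section and part of speech; otherwise it falls back to `base_data` (which is why `extract_pos_section` below now also records the POS on `base_data`). A small sketch of that selection, with `SimpleNamespace` standing in for `WordEntry` and `pick_target` being an invented helper name:

```python
from types import SimpleNamespace

def pick_target(page_data, base_data):
    # attach to the last entry only if it comes from the same language
    # section and POS as the section currently being parsed
    if (
        len(page_data) > 0
        and page_data[-1].lang_code == base_data.lang_code
        and page_data[-1].pos == base_data.pos
    ):
        return page_data[-1]
    return base_data

thai_verb = SimpleNamespace(lang_code="th", pos="verb")
nyaw_base = SimpleNamespace(lang_code="nyw", pos="unknown")
print(pick_target([thai_verb], nyaw_base) is nyaw_base)  # True: different language section
```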
1 change: 1 addition & 0 deletions src/wiktextract/extractor/th/pos.py
@@ -26,6 +26,7 @@ def extract_pos_section(
page_data[-1].pos_title = pos_title
pos_data = POS_DATA[pos_title]
page_data[-1].pos = pos_data["pos"]
base_data.pos = pos_data["pos"]
page_data[-1].tags.extend(pos_data.get("tags", []))

gloss_list_index = len(level_node.children)
58 changes: 57 additions & 1 deletion src/wiktextract/extractor/th/sound.py
@@ -1,7 +1,13 @@
import re
from dataclasses import dataclass

from wikitextprocessor import LevelNode, NodeKind, TemplateNode
from wikitextprocessor import (
HTMLNode,
LevelNode,
NodeKind,
TemplateNode,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
@@ -18,6 +24,8 @@ def extract_sound_section(
for t_node in level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name == "th-pron":
extract_th_pron_template(wxr, base_data, t_node)
elif t_node.template_name == "lo-pron":
extract_lo_pron_template(wxr, base_data, t_node)


@dataclass
@@ -86,3 +94,51 @@ def extract_th_pron_template(
base_data.sounds.append(sound)

clean_node(wxr, base_data, expanded_node)


def extract_lo_pron_template(
wxr: WiktextractContext,
base_data: WordEntry,
t_node: TemplateNode,
) -> None:
# https://th.wiktionary.org/wiki/แม่แบบ:lo-pron
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
for list_node in expanded_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
field = "other"
raw_tag = ""
for node in list_item.children:
if isinstance(node, HTMLNode) and node.tag == "span":
span_class = node.attrs.get("class", "")
if "qualifier-content" in span_class:
raw_tag = clean_node(wxr, None, node)
elif span_class == "IPA":
ipa = clean_node(wxr, None, node)
if ipa != "":
sound = Sound(ipa=ipa)
if raw_tag != "":
sound.raw_tags.append(raw_tag)
translate_raw_tags(sound)
base_data.sounds.append(sound)
else:
span_lang = node.attrs.get("lang", "")
if span_lang == "lo" and field == "hyphenation":
span_str = clean_node(wxr, None, node)
if span_str != "":
base_data.hyphenation.append(span_str)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
link_str = clean_node(wxr, None, node)
if link_str == "สัทอักษรสากล":
field = "ipa"
elif link_str != "" and field == "rhymes":
base_data.sounds.append(Sound(rhymes=link_str))
elif isinstance(node, str) and node.strip().endswith(":"):
node = node.strip()
if node == "การแบ่งพยางค์:":
field = "hyphenation"
elif node == "สัมผัส:":
field = "rhymes"

clean_node(wxr, base_data, expanded_node)
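
In outline, `extract_lo_pron_template` walks each expanded list item and switches a `field` marker whenever it meets a plain-text label ending in a colon; IPA spans become `Sound` objects (with any accent qualifier as a raw tag), Lao spans after "การแบ่งพยางค์:" ("syllabification") feed the new `hyphenation` list, and links after "สัมผัส:" ("rhymes") become `Sound(rhymes=...)` entries. A simplified, self-contained model of that state machine; the token format, `parse_lo_pron`, and the `Sound` stand-in are invented for the sketch:

```python
from dataclasses import dataclass, field

@dataclass
class Sound:  # simplified stand-in for the real model
    ipa: str = ""
    rhymes: str = ""
    raw_tags: list[str] = field(default_factory=list)

LABELS = {"การแบ่งพยางค์:": "hyphenation", "สัมผัส:": "rhymes"}

def parse_lo_pron(lines):
    """lines: one list of (kind, text) tokens per list item, in order."""
    sounds, hyphenation = [], []
    for line in lines:
        current, raw_tag = "other", ""
        for kind, text in line:
            if kind == "qualifier":          # accent label, e.g. เวียงจันทน์
                raw_tag = text
            elif kind == "ipa":
                sounds.append(Sound(ipa=text, raw_tags=[raw_tag] if raw_tag else []))
            elif kind == "label":            # plain text ending in ":"
                current = LABELS.get(text, "other")
            elif kind == "lao" and current == "hyphenation":
                hyphenation.append(text)
            elif kind == "link" and current == "rhymes":
                sounds.append(Sound(rhymes=text))
    return sounds, hyphenation

sounds, hyph = parse_lo_pron([
    [("qualifier", "เวียงจันทน์"), ("ipa", "[tʰaj˧˥]")],
    [("label", "การแบ่งพยางค์:"), ("lao", "ໄທ")],
    [("label", "สัมผัส:"), ("link", "-aj")],
])
print(hyph)                                    # ['ໄທ']
print([s.rhymes for s in sounds if s.rhymes])  # ['-aj']
```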
38 changes: 35 additions & 3 deletions src/wiktextract/extractor/th/translation.py
@@ -34,12 +34,16 @@ def extract_translation_list_item(
clean_node(wxr, None, list_item.children[:index])
+ node[: node.index(":")].strip()
)
lang_code = name_to_code(lang_name, "th")
if lang_code == "":
lang_code = "unknown"
if lang_name == "":
lang_name = "unknown"
if lang_name != "unknown":
lang_code = name_to_code(lang_name, "th")
if lang_code == "":
lang_code = "unknown"
elif isinstance(node, TemplateNode) and node.template_name in [
"t",
"t+",
"t-simple",
]:
extract_t_template(wxr, word_entry, node, lang_name, sense)
elif (
@@ -62,6 +66,11 @@
extract_translation_list_item(
wxr, word_entry, child_list_item, sense
)
elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
for link_node in node.find_child(NodeKind.LINK):
link_str = clean_node(wxr, None, link_node)
if link_str.endswith("/คำแปลภาษาอื่น"):
extract_translation_page(wxr, word_entry, link_str)


def extract_t_template(
@@ -99,3 +108,26 @@ def extract_t_template(
word_entry.translations.append(tr_data)
for link_node in expanded_node.find_child(NodeKind.LINK):
clean_node(wxr, word_entry, link_node)


def extract_translation_page(
wxr: WiktextractContext,
word_entry: WordEntry,
page_title: str,
) -> None:
page = wxr.wtp.get_page(page_title, 0)
if page is None or page.body is None:
return
root = wxr.wtp.parse(page.body)
for level2_node in root.find_child(NodeKind.LEVEL2):
lang_name = clean_node(wxr, None, level2_node.largs).removeprefix(
"ภาษา"
)
if lang_name != word_entry.lang:
continue
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
pos_title = clean_node(wxr, None, level3_node.largs)
if pos_title != word_entry.pos_title:
continue
for tr_level_node in level3_node.find_child(NodeKind.LEVEL4):
extract_translation_section(wxr, word_entry, tr_level_node)
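
Two small conventions drive the new subpage handling: a translation page is recognized by the "/คำแปลภาษาอื่น" suffix of a link inside an italicized "see also" line, and its level-2 headings read "ภาษา<language>", so the prefix is stripped before comparing against `word_entry.lang`. Two throwaway helpers (names and example titles invented) just to make those checks concrete:

```python
TRANSLATION_SUBPAGE_SUFFIX = "/คำแปลภาษาอื่น"  # "translations into other languages"

def is_translation_subpage(title: str) -> bool:
    return title.endswith(TRANSLATION_SUBPAGE_SUFFIX)

def heading_language(heading: str) -> str:
    # level-2 headings on the subpage read "ภาษา<language name>"
    return heading.removeprefix("ภาษา")

print(is_translation_subpage("ข้าว/คำแปลภาษาอื่น"))  # True
print(heading_language("ภาษาลาว"))                    # ลาว
```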
6 changes: 3 additions & 3 deletions src/wiktextract/wiktionary.py
@@ -301,7 +301,7 @@ def check_str_fields(
"mandatory field): {}".format(
field,
"" if empty_ok else " non-empty",
json.dumps(item, sort_keys=True),
json.dumps(item, sort_keys=True, ensure_ascii=False),
),
)
continue
@@ -315,7 +315,7 @@ def check_str_fields(
"{!r} should be a{} string: {}".format(
field,
"" if empty_ok else " non-empty",
json.dumps(item, sort_keys=True),
json.dumps(item, sort_keys=True, ensure_ascii=False),
),
)
if not v and not empty_ok:
@@ -326,7 +326,7 @@
lang,
pos,
"{!r} should contain a non-empty string: {}".format(
field, json.dumps(item, sort_keys=True)
field, json.dumps(item, sort_keys=True, ensure_ascii=False)
),
)

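The `wiktionary.py` change is cosmetic but worth a note: without `ensure_ascii=False`, `json.dumps` escapes every non-ASCII character, which makes Thai text in the validation error messages unreadable.

```python
import json

item = {"word": "ไทย"}
print(json.dumps(item, sort_keys=True))                      # {"word": "\u0e44\u0e17\u0e22"}
print(json.dumps(item, sort_keys=True, ensure_ascii=False))  # {"word": "ไทย"}
```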
46 changes: 46 additions & 0 deletions tests/test_th_gloss.py
@@ -258,3 +258,49 @@ def test_alt_form_template(self):
"tags": ["form-of"],
},
)

def test_alt_form_second_language_section(self):
self.wxr.wtp.add_page(
"แม่แบบ:alt",
10,
"""(''เลิกใช้'') <span class="Thai" lang="th">[[เดอร#ภาษาไทย|เดอร]]</span>, <span class="Thai" lang="th">[[เดิร#ภาษาไทย|เดิร]]</span>""",
)
page_data = parse_page(
self.wxr,
"ข้าว",
"""== ภาษาไทย ==
=== คำกริยา ===
# [[ชื่อ]]
== ภาษาญ้อ ==
=== รูปแบบอื่น ===
* {{l|nyw|เข้า}}
=== คำนาม ===
# [[ข้าว]]""",
)
self.assertTrue("forms" not in page_data[0])
self.assertEqual(page_data[1]["forms"], [{"form": "เข้า"}])

def test_alt_form_after_pos(self):
self.wxr.wtp.add_page(
"แม่แบบ:lo-alt",
10,
"""* (''ล้าสมัย'') <span class="Laoo" lang="lo">[[ໄທຍ໌#ภาษาลาว|ໄທຍ໌]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span lang="lo-Latn" class="tr Latn">ไทย์</span><span class="mention-gloss-paren annotation-paren">)</span>""",
)
page_data = parse_page(
self.wxr,
"ໄທ",
"""== ภาษาลาว ==
=== คำนาม ===
# [[ไทย]]
=== คำวิสามานยนาม ===
# [[ไทย]]
==== รูปแบบอื่น ====
{{lo-alt|d=ໄທຍ}}""",
)
self.assertTrue("forms" not in page_data[0])
self.assertEqual(
page_data[1]["forms"],
[{"form": "ໄທຍ໌", "raw_tags": ["ล้าสมัย"], "roman": "ไทย์"}],
)
31 changes: 31 additions & 0 deletions tests/test_th_linkage.py
@@ -97,3 +97,34 @@ def test_syn_template(self):
page_data[0]["synonyms"],
[{"word": "ทีวี"}, {"word": "โทรภาพ"}],
)

def test_col3_zh_pinyin(self):
self.wxr.wtp.add_page(
"แม่แบบ:col3",
10,
"""<div><div><ul><li><span class="Hant" lang="zh">[[電腦遊戲#ภาษาจีน|電腦遊戲]]</span><span class="Zsym mention">&nbsp;/ </span><span class="Hans" lang="zh">[[电脑游戏#ภาษาจีน|电脑游戏]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span lang="zh-Latn" class="tr Latn">diànnǎo yóuxì</span><span class="mention-gloss-paren annotation-paren">)</span></li></ul></div></div>""",
)
page_data = parse_page(
self.wxr,
"電腦",
"""== ภาษาจีน ==
=== คำนาม ===
# [[คอมพิวเตอร์]]
==== ลูกคำ ====
{{col3|zh|電腦遊戲}}""",
)
self.assertEqual(
page_data[0]["derived"],
[
{
"word": "電腦遊戲",
"roman": "diànnǎo yóuxì",
"tags": ["Traditional Chinese"],
},
{
"word": "电脑游戏",
"roman": "diànnǎo yóuxì",
"tags": ["Simplified Chinese"],
},
],
)
36 changes: 36 additions & 0 deletions tests/test_th_sound.py
@@ -68,3 +68,39 @@ def test_th_pron(self):
"ศัพท์ภาษาไทยที่มี 1 พยางค์",
],
)

def test_lo_pron(self):
self.wxr.wtp.add_page(
"แม่แบบ:lo-pron",
10,
"""* <span class="ib-brac qualifier-brac">(</span><span class="ib-content qualifier-content"><span class="usage-label-accent">เวียงจันทน์</span></span><span class="ib-brac qualifier-brac">)</span> [[วิกิพจนานุกรม:สัทอักษรสากล|สัทอักษรสากล]]<sup>([[wikipedia:ระบบเสียงภาษาลาว|คำอธิบาย]])</sup>:&#32;<span class="IPA">[tʰaj˧˥]</span>[[Category:ศัพท์ภาษาลาวที่มีการออกเสียงไอพีเอ|ໄທ]][[Category:ศัพท์ภาษาลาวที่มี 1 พยางค์|ໄທ]]
* <span class="ib-brac qualifier-brac">(</span><span class="ib-content qualifier-content"><span class="usage-label-accent">หลวงพระบาง</span></span><span class="ib-brac qualifier-brac">)</span> [[วิกิพจนานุกรม:สัทอักษรสากล|สัทอักษรสากล]]<sup>([[wikipedia:ระบบเสียงภาษาลาว|คำอธิบาย]])</sup>:&#32;<span class="IPA">[tʰaj˩˨]</span>[[Category:ศัพท์ภาษาลาวที่มีการออกเสียงไอพีเอ|ໄທ]][[Category:ศัพท์ภาษาลาวที่มี 1 พยางค์|ໄທ]]
* การแบ่งพยางค์: <span class='Laoo lo-reading' lang='lo'>ໄທ</span>
* สัมผัส: [[:หมวดหมู่:สัมผัส:ภาษาลาว/aj|<span class="IPA">-aj</span>]][[Category:สัมผัส:ภาษาลาว/aj|ໄທ]]""",
)
data = parse_page(
self.wxr,
"ໄທ",
"""== ภาษาลาว ==
=== การออกเสียง ===
{{lo-pron}}
=== คำนาม ===
# [[ไทย]]""",
)
self.assertEqual(data[0]["hyphenation"], ["ໄທ"])
self.assertEqual(
data[0]["sounds"],
[
{"ipa": "[tʰaj˧˥]", "raw_tags": ["เวียงจันทน์"]},
{"ipa": "[tʰaj˩˨]", "raw_tags": ["หลวงพระบาง"]},
{"rhymes": "-aj"},
],
)
self.assertEqual(
data[0]["categories"],
[
"ศัพท์ภาษาลาวที่มีการออกเสียงไอพีเอ",
"ศัพท์ภาษาลาวที่มี 1 พยางค์",
"สัมผัส:ภาษาลาว/aj",
],
)
