Skip to content

Commit

Permalink
Merge pull request #1008 from xxyzz/ku
Browse files Browse the repository at this point in the history
[ku] extract other forms sections
  • Loading branch information
xxyzz authored Jan 27, 2025
2 parents a938dcd + cadb098 commit 506d578
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 0 deletions.
59 changes: 59 additions & 0 deletions src/wiktextract/extractor/ku/linkage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from wikitextprocessor import NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, WordEntry


def extract_other_form_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: WikiNode,
) -> None:
    """Collect forms from the templates listed under a section node.

    Every template found inside the list items of the lists returned by
    ``level_node.find_child`` is inspected by name: ``g`` is handed to
    ``extract_g_template``, names starting with ``ku-`` are handed to
    ``extract_ku_form_template``, and anything else is ignored. Extracted
    forms are appended to ``word_entry.forms`` by those helpers.
    """
    # Lazily flatten list -> list item -> template into a single stream.
    templates = (
        template
        for list_node in level_node.find_child(NodeKind.LIST)
        for list_item in list_node.find_child(NodeKind.LIST_ITEM)
        for template in list_item.find_child(NodeKind.TEMPLATE)
    )
    for template in templates:
        name = template.template_name
        # The two cases are mutually exclusive: "g" never starts with "ku-".
        if name == "g":
            extract_g_template(wxr, word_entry, template)
        elif name.startswith("ku-"):
            extract_ku_form_template(wxr, word_entry, template)


def extract_ku_form_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Extract one form from a ``ku-*`` template and add it to the entry.

    The template is converted back to wikitext, expanded and re-parsed.
    Among the ``<span>`` elements yielded by ``find_html("span")``, the
    cleaned text of the first is stored as a raw tag and the cleaned text
    of the second becomes the form text. Spans after the second are not
    used. The form is appended to ``word_entry.forms`` only when its text
    is non-empty.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    form = Form(form="")
    for index, span_tag in enumerate(expanded_node.find_html("span")):
        span_text = clean_node(wxr, None, span_tag)
        if index == 0:
            # An empty label carries no information; never store "" as a tag.
            if span_text != "":
                form.raw_tags.append(span_text)
        else:
            # index == 1: the second span holds the form itself.
            form.form = span_text
            break  # remaining spans are irrelevant, stop early
    if form.form != "":
        word_entry.forms.append(form)


def extract_g_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Build a form from a ``g`` template and add it to the entry.

    The form text comes from positional parameter ``2``, falling back to
    the ``cuda`` parameter when ``2`` is absent. The ``tr`` parameter
    supplies the romanization and ``w`` the translation. Missing
    parameters default to an empty string. The form is appended to
    ``word_entry.forms`` only when its text is non-empty after cleaning.
    """
    params = t_node.template_parameters
    word_arg = params.get(2, params.get("cuda", ""))
    form_text = clean_node(wxr, None, word_arg)
    roman_text = clean_node(wxr, None, params.get("tr", ""))
    translation_text = clean_node(wxr, None, params.get("w", ""))
    form = Form(
        form=form_text, roman=roman_text, translation=translation_text
    )
    if form.form == "":
        return
    word_entry.forms.append(form)
1 change: 1 addition & 0 deletions src/wiktextract/extractor/ku/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class Form(KurdishBaseModel):
tags: list[str] = []
raw_tags: list[str] = []
roman: str = ""
translation: str = ""


class Translation(KurdishBaseModel):
Expand Down
5 changes: 5 additions & 0 deletions src/wiktextract/extractor/ku/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section
from .linkage import extract_other_form_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA
Expand All @@ -31,6 +32,10 @@ def parse_section(
extract_translation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text in ["Bi alfabeyên din", "Herwiha", "Bide ber"]:
extract_other_form_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
Expand Down
43 changes: 43 additions & 0 deletions tests/test_ku_linkage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.ku.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestKuLinkage(TestCase):
    """Tests for extracting forms from Kurdish linkage-style sections."""

    maxDiff = None

    def setUp(self) -> None:
        # Build the processor first, then the config, then the context.
        wtp = Wtp(lang_code="ku")
        config = WiktionaryConfig(
            dump_file_lang_code="ku", capture_language_codes=None
        )
        self.wxr = WiktextractContext(wtp, config)

    def tearDown(self):
        self.wxr.wtp.close_db_conn()

    def test_ku_ar(self):
        """A ``ku-ar`` template yields a form tagged with its label."""
        # Template pages (namespace id 10) needed to expand the entry.
        template_pages = {
            "Şablon:ziman": "Kurmancî",
            "Şablon:ku-ar": """<span class="Latn" lang="ku">[[kurdî-erebî#Kurmancî|kurdî-erebî]]</span>: <span class="Arab" lang="ku">[[کووچک#Kurmancî|کووچک]]</span>&lrm;""",
        }
        for title, body in template_pages.items():
            self.wxr.wtp.add_page(title, 10, body)

        wikitext = """== {{ziman|ku}} ==
=== Navdêr ===
# [[heywan|Heywanek]]
==== Bi alfabeyên din ====
* {{ku-ar|کووچک}}"""
        page_data = parse_page(self.wxr, "kûçik", wikitext)

        expected_forms = [{"form": "کووچک", "raw_tags": ["kurdî-erebî"]}]
        self.assertEqual(page_data[0]["forms"], expected_forms)

0 comments on commit 506d578

Please sign in to comment.