Skip to content

Commit

Permalink
Merge pull request #1010 from xxyzz/ku
Browse files Browse the repository at this point in the history
[ku] extract linkage section
  • Loading branch information
xxyzz authored Jan 28, 2025
2 parents 506d578 + d9a474b commit f3ca094
Show file tree
Hide file tree
Showing 6 changed files with 419 additions and 33 deletions.
267 changes: 238 additions & 29 deletions src/wiktextract/extractor/ku/linkage.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,20 @@
import re
from itertools import count

from wikitextprocessor import NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, WordEntry


def extract_other_form_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: WikiNode,
) -> None:
    """Collect forms from the templates found in a section's list items.

    Every template that is a direct child of a list item is inspected:
    templates whose name starts with ``ku-`` go to
    ``extract_ku_form_template`` and ``g`` templates go to
    ``extract_g_template``. All other templates are ignored.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for template in list_item.find_child(NodeKind.TEMPLATE):
                name = template.template_name
                if name.startswith("ku-"):
                    handler = extract_ku_form_template
                elif name == "g":
                    handler = extract_g_template
                else:
                    continue
                handler(wxr, word_entry, template)
from .models import Form, Linkage, WordEntry
from .tags import translate_raw_tags


def extract_ku_form_template(
wxr: WiktextractContext,
word_entry: WordEntry,
t_node: TemplateNode,
linkage_type: str = "",
sense: str = "",
) -> None:
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
Expand All @@ -34,26 +26,243 @@ def extract_ku_form_template(
elif index == 1:
form.form = clean_node(wxr, None, span_tag)
if form.form != "":
word_entry.forms.append(form)
if linkage_type == "":
word_entry.forms.append(form)
else:
getattr(word_entry, linkage_type).append(
Linkage(
word=form.form,
raw_tags=form.raw_tags,
sense=sense,
)
)


def extract_g_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str = "",
    sense: str = "",
    raw_tags: list[str] | None = None,
) -> None:
    """Extract the word carried by a ``g`` template.

    The template is expanded and the text of every ``<abbr>`` element inside
    a ``<span class="gender">`` is collected as a raw tag (empty strings and
    ``"?"`` are skipped). The word is read from positional parameter ``2``
    (falling back to ``cuda``), the romanization from ``tr`` and the
    translation from ``w``.

    Args:
        wxr: Extraction context used for parsing and node cleaning.
        word_entry: Entry that receives the extracted data.
        t_node: The ``g`` template node.
        linkage_type: Name of the ``word_entry`` list attribute to append a
            ``Linkage`` to. When empty, a ``Form`` is appended to
            ``word_entry.forms`` instead.
        sense: Sense text stored on the ``Linkage`` (unused for forms).
        raw_tags: Tags collected so far by the caller. When given, the list
            is extended in place with the gender tags found here; when
            omitted, a new list is used for this call only.
    """
    # The previous default of ``[]`` was one list shared by every call that
    # omitted the argument, so gender tags leaked from one word to the next.
    if raw_tags is None:
        raw_tags = []

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="gender"
    ):
        for abbr_tag in span_tag.find_html("abbr"):
            raw_tag = clean_node(wxr, None, abbr_tag)
            if raw_tag not in ["", "?"]:
                raw_tags.append(raw_tag)

    # Both output shapes are built from the same three template parameters.
    word = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            2, t_node.template_parameters.get("cuda", "")
        ),
    )
    roman = clean_node(wxr, None, t_node.template_parameters.get("tr", ""))
    translation = clean_node(
        wxr, None, t_node.template_parameters.get("w", "")
    )

    if linkage_type == "":
        form = Form(
            form=word,
            roman=roman,
            translation=translation,
            raw_tags=raw_tags,
        )
        if form.form != "":
            translate_raw_tags(form)
            word_entry.forms.append(form)
    else:
        l_data = Linkage(
            word=word,
            roman=roman,
            translation=translation,
            sense=sense,
            raw_tags=raw_tags,
        )
        if l_data.word != "":
            translate_raw_tags(l_data)
            getattr(word_entry, linkage_type).append(l_data)


def extract_hw_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str = "",
    sense: str = "",
) -> None:
    """Extract words from a ``hw``/``herwiha`` template.

    Positional parameters from ``5`` upwards are read as raw tags until the
    first missing index. The template is then expanded; each ``<span>`` whose
    ``lang`` attribute equals the language code in parameter ``1`` yields a
    form, and a following span whose ``lang`` ends in ``-Latn`` supplies the
    romanization of the most recent form.

    Args:
        wxr: Extraction context used for parsing and node cleaning.
        word_entry: Entry that receives the extracted data.
        t_node: The template node.
        linkage_type: Name of the ``word_entry`` list attribute to extend
            with ``Linkage`` objects. When empty, the forms are added to
            ``word_entry.forms`` instead.
        sense: Sense text stored on each ``Linkage`` (unused for forms).
    """
    # https://ku.wiktionary.org/wiki/Şablon:hw
    raw_tags = []
    forms = []
    for arg in count(5):
        if arg not in t_node.template_parameters:
            break
        raw_tag = clean_node(wxr, None, t_node.template_parameters[arg])
        if raw_tag != "":
            raw_tags.append(raw_tag)

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang == lang_code:
            form_str = clean_node(wxr, None, span_tag)
            if form_str != "":
                forms.append(Form(form=form_str, raw_tags=raw_tags))
        elif span_lang.endswith("-Latn") and len(forms) > 0:
            # A transliteration span belongs to the form seen just before it.
            forms[-1].roman = clean_node(wxr, None, span_tag)

    if linkage_type == "":
        word_entry.forms.extend(forms)
    else:
        getattr(word_entry, linkage_type).extend(
            [
                Linkage(
                    word=f.form,
                    roman=f.roman,
                    sense=sense,
                    raw_tags=f.raw_tags,
                )
                for f in forms
            ]
        )


def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: WikiNode,
    linkage_type: str,
) -> None:
    """Dispatch the direct list and template children of a linkage section.

    ``kol``/``kol<digits>`` templates go to ``extract_kol_template``,
    ``stûn`` templates go to ``extract_stûn_template``, and every item of a
    list node is passed to ``extract_linkage_list_item`` with an empty sense.
    Other templates are ignored.
    """
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode):
            template_name = child.template_name
            if re.fullmatch(r"kol(?:\d+)?", template_name) is not None:
                extract_kol_template(wxr, word_entry, child, linkage_type)
                continue
            if template_name == "stûn":
                extract_stûn_template(wxr, word_entry, child, linkage_type)
                continue
        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr, word_entry, item, linkage_type, ""
                )


def extract_kol_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract the words listed as positional arguments of a ``kol`` template.

    The ``sernav`` parameter is used as the sense of every extracted word.
    Words start at positional parameter ``3`` for ``kol`` and at ``2`` for
    the other template names routed here; reading stops at the first missing
    index. Plain-string arguments are stored directly, anything else is
    re-parsed and handled by ``extract_linkage_list_item``.
    """
    # https://ku.wiktionary.org/wiki/Şablon:kol
    sense = clean_node(wxr, None, t_node.template_parameters.get("sernav", ""))
    first_index = 3 if t_node.template_name == "kol" else 2
    for index in count(first_index):
        if index not in t_node.template_parameters:
            break
        value = t_node.template_parameters[index]

        if isinstance(value, str):
            word = value.strip()
            if word == "":
                continue
            if linkage_type == "":
                word_entry.forms.append(Form(form=word))
            else:
                getattr(word_entry, linkage_type).append(
                    Linkage(word=word, sense=sense)
                )
            continue

        nodes = value if isinstance(value, list) else [value]
        starts_with_blank = (
            len(nodes) > 0
            and isinstance(nodes[0], str)
            and nodes[0].strip() == ""
        )
        if starts_with_blank:
            nodes.pop(0)  # not preformatted node
        parsed_value = wxr.wtp.parse(wxr.wtp.node_to_wikitext(nodes))
        extract_linkage_list_item(
            wxr, word_entry, parsed_value, linkage_type, sense
        )


def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
) -> None:
    """Extract words from the children of one list item (or parsed root).

    Children are processed in document order:

    * link nodes become a ``Linkage`` (or a ``Form`` when ``linkage_type`` is
      empty) carrying the raw tags collected so far;
    * ``g``, ``ku-*`` and ``herwiha``/``hw`` templates are delegated to their
      dedicated extractors;
    * ``mj`` templates contribute a raw tag (surrounding parentheses and
      spaces stripped) for the nodes that follow.

    Args:
        wxr: Extraction context used for node cleaning.
        word_entry: Entry that receives the extracted data.
        list_item: Node whose ``children`` are scanned.
        linkage_type: Name of the ``word_entry`` list attribute to append
            to; empty string means ``word_entry.forms``.
        sense: Sense text attached to every extracted linkage.
    """
    raw_tags = []
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if word != "":
                if linkage_type != "":
                    l_data = Linkage(word=word, sense=sense, raw_tags=raw_tags)
                    translate_raw_tags(l_data)
                    getattr(word_entry, linkage_type).append(l_data)
                else:
                    form = Form(form=word, raw_tags=raw_tags)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
        elif isinstance(node, TemplateNode):
            if node.template_name == "g":
                # Pass the sense along like every other branch does, so
                # words written with the g template keep it too.
                extract_g_template(
                    wxr,
                    word_entry,
                    node,
                    linkage_type=linkage_type,
                    sense=sense,
                    raw_tags=raw_tags,
                )
            elif node.template_name.startswith("ku-"):
                extract_ku_form_template(
                    wxr,
                    word_entry,
                    node,
                    linkage_type=linkage_type,
                    sense=sense,
                )
            elif node.template_name in ["herwiha", "hw"]:
                extract_hw_template(
                    wxr,
                    word_entry,
                    node,
                    linkage_type=linkage_type,
                    sense=sense,
                )
            elif node.template_name == "mj":
                raw_tag = clean_node(wxr, None, node).strip("() ")
                if raw_tag != "":
                    raw_tags.append(raw_tag)


def extract_stûn_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract linkage words from the lists inside a ``stûn`` template.

    The first positional parameter is re-parsed as wikitext and each item of
    each list directly under the parsed root is handed to
    ``extract_linkage_list_item`` with an empty sense. Nothing happens when
    the parameter is absent.
    """
    column_content = t_node.template_parameters.get(1)
    if column_content is not None:
        parsed_root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(column_content))
        list_items = (
            item
            for list_node in parsed_root.find_child(NodeKind.LIST)
            for item in list_node.find_child(NodeKind.LIST_ITEM)
        )
        for item in list_items:
            extract_linkage_list_item(wxr, word_entry, item, linkage_type, "")
18 changes: 18 additions & 0 deletions src/wiktextract/extractor/ku/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class Form(KurdishBaseModel):
raw_tags: list[str] = []
roman: str = ""
translation: str = ""
sense: str = ""


class Translation(KurdishBaseModel):
Expand All @@ -56,6 +57,15 @@ class Translation(KurdishBaseModel):
source: str = ""


# A term linked to a word entry. Instances are stored in the WordEntry list
# fields typed `list[Linkage]`.
class Linkage(KurdishBaseModel):
    # The linked term itself; the only field without a default.
    word: str
    # NOTE(review): presumably filled from raw_tags by translate_raw_tags();
    # confirm in tags.py.
    tags: list[str] = []
    # Labels exactly as extracted from the page (gender abbreviations,
    # "mj" template text, extra "hw" template arguments).
    raw_tags: list[str] = []
    # Romanization ("tr" template parameter or a "-Latn" span).
    roman: str = ""
    # Taken from the "w" parameter of the "g" template.
    translation: str = ""
    # Sense the term is linked under, e.g. the "sernav" parameter of "kol".
    sense: str = ""


class WordEntry(KurdishBaseModel):
model_config = ConfigDict(title="Kurdish Wiktionary")
word: str = Field(description="Word string")
Expand All @@ -70,3 +80,11 @@ class WordEntry(KurdishBaseModel):
forms: list[Form] = []
etymology_text: str = ""
translations: list[Translation] = []
synonyms: list[Linkage] = []
antonyms: list[Linkage] = []
derived: list[Linkage] = []
related: list[Linkage] = []
hypernyms: list[Linkage] = []
hyponyms: list[Linkage] = []
anagrams: list[Linkage] = []
rhymes: list[Linkage] = []
18 changes: 14 additions & 4 deletions src/wiktextract/extractor/ku/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section
from .linkage import extract_other_form_section
from .linkage import extract_linkage_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .translation import extract_translation_section, is_translation_page


Expand All @@ -33,8 +33,18 @@ def parse_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text in ["Bi alfabeyên din", "Herwiha", "Bide ber"]:
extract_other_form_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
extract_linkage_section(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
"",
)
elif title_text in LINKAGE_SECTIONS:
extract_linkage_section(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
LINKAGE_SECTIONS[title_text],
)

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
Expand Down
13 changes: 13 additions & 0 deletions src/wiktextract/extractor/ku/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,16 @@
"Formeke cînavê": {"pos": "pron", "tags": ["form-of"]},
"Formeke hokerê": {"pos": "adv", "tags": ["form-of"]},
}

# Maps a section title to the name of the WordEntry list attribute that
# receives the Linkage objects extracted from that section. Several titles
# may share one attribute.
LINKAGE_SECTIONS = {
    "Hevmane": "synonyms",
    "Dijmane": "antonyms",
    "Jê": "derived",
    "Nêzîk": "synonyms",
    "Têkildar": "related",
    "Jornav": "hypernyms",
    "Jêrnav": "hyponyms",
    "Anagram": "anagrams",
    "Binêre herwiha": "related",
    "Qafiye": "rhymes",
}
Loading

0 comments on commit f3ca094

Please sign in to comment.