From e3767541d7453190d78c517b0d1f0aaa3ddb6ab3 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 28 Jan 2025 10:37:37 +0800 Subject: [PATCH 1/4] [ku] extract "hw" form template and link node in lists --- src/wiktextract/extractor/ku/linkage.py | 49 ++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/src/wiktextract/extractor/ku/linkage.py b/src/wiktextract/extractor/ku/linkage.py index d7e802fb..7f100a5c 100644 --- a/src/wiktextract/extractor/ku/linkage.py +++ b/src/wiktextract/extractor/ku/linkage.py @@ -1,3 +1,5 @@ +from itertools import count + from wikitextprocessor import NodeKind, TemplateNode, WikiNode from ...page import clean_node @@ -12,11 +14,18 @@ def extract_other_form_section( ) -> None: for list_node in level_node.find_child(NodeKind.LIST): for list_item in list_node.find_child(NodeKind.LIST_ITEM): - for t_node in list_item.find_child(NodeKind.TEMPLATE): - if t_node.template_name.startswith("ku-"): - extract_ku_form_template(wxr, word_entry, t_node) - elif t_node.template_name == "g": - extract_g_template(wxr, word_entry, t_node) + for node in list_item.find_child(NodeKind.TEMPLATE | NodeKind.LINK): + if isinstance(node, TemplateNode): + if node.template_name.startswith("ku-"): + extract_ku_form_template(wxr, word_entry, node) + elif node.template_name == "g": + extract_g_template(wxr, word_entry, node) + elif node.template_name in ["herwiha", "hw"]: + extract_hw_template(wxr, word_entry, node) + elif node.kind == NodeKind.LINK: + form = clean_node(wxr, None, node) + if form != "": + word_entry.forms.append(Form(form=form)) def extract_ku_form_template( @@ -57,3 +66,33 @@ def extract_g_template( ) if form.form != "": word_entry.forms.append(form) + + +def extract_hw_template( + wxr: WiktextractContext, + word_entry: WordEntry, + t_node: TemplateNode, +) -> None: + # https://ku.wiktionary.org/wiki/Şablon:hw + raw_tags = [] + forms = [] + for arg in count(5): + if arg not in t_node.template_parameters: + break + raw_tag = clean_node(wxr, None, t_node.template_parameters[arg]) + if raw_tag != "": + raw_tags.append(raw_tag) + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(t_node), expand_all=True + ) + lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, "")) + for span_tag in expanded_node.find_html("span"): + span_lang = span_tag.attrs.get("lang", "") + if span_lang == lang_code: + form_str = clean_node(wxr, None, span_tag) + if form_str != "": + forms.append(Form(form=form_str, raw_tags=raw_tags)) + elif span_lang.endswith("-Latn") and len(forms) > 0: + forms[-1].roman = clean_node(wxr, None, span_tag) + + word_entry.forms.extend(forms) From 5e592577c1cb6aab47646a6686389180340c414d Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 28 Jan 2025 12:33:32 +0800 Subject: [PATCH 2/4] [ku] extract "kol*" template in linkage sections --- src/wiktextract/extractor/ku/linkage.py | 126 +++++++++++++++--- src/wiktextract/extractor/ku/models.py | 17 +++ src/wiktextract/extractor/ku/page.py | 11 +- .../extractor/ku/section_titles.py | 13 ++ tests/test_ku_linkage.py | 15 +++ 5 files changed, 163 insertions(+), 19 deletions(-) diff --git a/src/wiktextract/extractor/ku/linkage.py b/src/wiktextract/extractor/ku/linkage.py index 7f100a5c..b4a543f3 100644 --- a/src/wiktextract/extractor/ku/linkage.py +++ b/src/wiktextract/extractor/ku/linkage.py @@ -1,10 +1,11 @@ +import re from itertools import count from wikitextprocessor import NodeKind, TemplateNode, WikiNode from ...page import clean_node from ...wxr_context import WiktextractContext -from .models import Form, WordEntry +from .models import Form, Linkage, WordEntry def extract_other_form_section( @@ -32,6 +33,8 @@ def extract_ku_form_template( wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode, + linkage_type: str = "", + sense: str = "", ) -> None: expanded_node = wxr.wtp.parse( wxr.wtp.node_to_wikitext(t_node), expand_all=True @@ -43,35 +46,70 @@ def extract_ku_form_template( elif index == 1: form.form = clean_node(wxr, None, span_tag) if form.form != "": - word_entry.forms.append(form) + if linkage_type == "": + word_entry.forms.append(form) + else: + getattr(word_entry, linkage_type).append( + Linkage( + word=form.form, + raw_tags=form.raw_tags, + sense=sense, + ) + ) def extract_g_template( wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode, + linkage_type: str = "", + sense: str = "", ) -> None: - form = Form( - form=clean_node( - wxr, - None, - t_node.template_parameters.get( - 2, t_node.template_parameters.get("cuda", "") + if linkage_type == "": + form = Form( + form=clean_node( + wxr, + None, + t_node.template_parameters.get( + 2, t_node.template_parameters.get("cuda", "") + ), ), - ), - roman=clean_node(wxr, None, t_node.template_parameters.get("tr", "")), - translation=clean_node( - wxr, None, t_node.template_parameters.get("w", "") - ), - ) - if form.form != "": - word_entry.forms.append(form) + roman=clean_node( + wxr, None, t_node.template_parameters.get("tr", "") + ), + translation=clean_node( + wxr, None, t_node.template_parameters.get("w", "") + ), + ) + if form.form != "": + word_entry.forms.append(form) + else: + l_data = Linkage( + word=clean_node( + wxr, + None, + t_node.template_parameters.get( + 2, t_node.template_parameters.get("cuda", "") + ), + ), + roman=clean_node( + wxr, None, t_node.template_parameters.get("tr", "") + ), + translation=clean_node( + wxr, None, t_node.template_parameters.get("w", "") + ), + sense=sense, + ) + if l_data.word != "": + getattr(word_entry, linkage_type).append(l_data) def extract_hw_template( wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode, + linkage_type: str = "", + sense: str = "", ) -> None: # https://ku.wiktionary.org/wiki/Şablon:hw raw_tags = [] @@ -95,4 +133,58 @@ def extract_hw_template( elif span_lang.endswith("-Latn") and len(forms) > 0: forms[-1].roman = clean_node(wxr, None, span_tag) - word_entry.forms.extend(forms) + if linkage_type == "": + word_entry.forms.extend(forms) + else: + getattr(word_entry, linkage_type).extend( + [ + Linkage( + word=f.form, + roman=f.roman, + sense=sense, + raw_tags=f.raw_tags, + ) + for f in forms + ] + ) + + +def extract_linkage_section( + wxr: WiktextractContext, + word_entry: WordEntry, + level_node: WikiNode, + linkage_type: str, +) -> None: + for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE): + if ( + isinstance(node, TemplateNode) + and re.fullmatch(r"kol(?:\d+)?", node.template_name) is not None + ): + extract_kol_template(wxr, word_entry, node, linkage_type) + elif isinstance(node, TemplateNode) and node.template_name == "stûn": + pass + elif node.kind == NodeKind.LIST: + pass + + +def extract_kol_template( + wxr: WiktextractContext, + word_entry: WordEntry, + t_node: TemplateNode, + linkage_type: str, +) -> None: + # https://ku.wiktionary.org/wiki/Şablon:kol + sense = clean_node(wxr, None, t_node.template_parameters.get("sernav", "")) + for arg in count(3 if t_node.template_name == "kol" else 2): + if arg not in t_node.template_parameters: + break + arg_value = t_node.template_parameters[arg] + if isinstance(arg_value, str) and arg_value.strip() != "": + getattr(word_entry, linkage_type).append( + Linkage(word=arg_value.strip(), sense=sense) + ) + elif ( + isinstance(arg_value, TemplateNode) + and arg_value.template_name == "g" + ): + extract_g_template(wxr, word_entry, arg_value, linkage_type) diff --git a/src/wiktextract/extractor/ku/models.py b/src/wiktextract/extractor/ku/models.py index c331aa8b..a3a29c31 100644 --- a/src/wiktextract/extractor/ku/models.py +++ b/src/wiktextract/extractor/ku/models.py @@ -56,6 +56,15 @@ class Translation(KurdishBaseModel): source: str = "" +class Linkage(KurdishBaseModel): + word: str + tags: list[str] = [] + raw_tags: list[str] = [] + roman: str = "" + translation: str = "" + sense: str = "" + + class WordEntry(KurdishBaseModel): model_config = ConfigDict(title="Kurdish Wiktionary") word: str = Field(description="Word string") @@ -70,3 +79,11 @@ class WordEntry(KurdishBaseModel): forms: list[Form] = [] etymology_text: str = "" translations: list[Translation] = [] + synonyms: list[Linkage] = [] + antonyms: list[Linkage] = [] + derived: list[Linkage] = [] + related: list[Linkage] = [] + hypernyms: list[Linkage] = [] + hyponyms: list[Linkage] = [] + anagrams: list[Linkage] = [] + rhymes: list[Linkage] = [] diff --git a/src/wiktextract/extractor/ku/page.py b/src/wiktextract/extractor/ku/page.py index 535d5255..5b72f80a 100644 --- a/src/wiktextract/extractor/ku/page.py +++ b/src/wiktextract/extractor/ku/page.py @@ -6,10 +6,10 @@ from ...page import clean_node from ...wxr_context import WiktextractContext from .etymology import extract_etymology_section -from .linkage import extract_other_form_section +from .linkage import extract_linkage_section, extract_other_form_section from .models import Sense, WordEntry from .pos import extract_pos_section -from .section_titles import POS_DATA +from .section_titles import LINKAGE_SECTIONS, POS_DATA from .translation import extract_translation_section, is_translation_page @@ -36,6 +36,13 @@ def parse_section( extract_other_form_section( wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node ) + elif title_text in LINKAGE_SECTIONS: + extract_linkage_section( + wxr, + page_data[-1] if len(page_data) > 0 else base_data, + level_node, + LINKAGE_SECTIONS[title_text], + ) for next_level in level_node.find_child(LEVEL_KIND_FLAGS): parse_section(wxr, page_data, base_data, next_level) diff --git a/src/wiktextract/extractor/ku/section_titles.py b/src/wiktextract/extractor/ku/section_titles.py index aec902ce..d85010f9 100644 --- a/src/wiktextract/extractor/ku/section_titles.py +++ b/src/wiktextract/extractor/ku/section_titles.py @@ -32,3 +32,16 @@ "Formeke cînavê": {"pos": "pron", "tags": ["form-of"]}, "Formeke hokerê": {"pos": "adv", "tags": ["form-of"]}, } + +LINKAGE_SECTIONS = { + "Hevmane": "synonyms", + "Dijmane": "antonyms", + "Jê": "derived", + "Nêzîk": "synonyms", + "Têkildar": "related", + "Jornav": "hypernyms", + "Jêrnav": "hyponyms", + "Anagram": "anagrams", + "Binêre herwiha": "related", + "Qafiye": "rhymes", +} diff --git a/tests/test_ku_linkage.py b/tests/test_ku_linkage.py index 56e5e178..193c975c 100644 --- a/tests/test_ku_linkage.py +++ b/tests/test_ku_linkage.py @@ -41,3 +41,18 @@ def test_ku_ar(self): page_data[0]["forms"], [{"form": "کووچک", "raw_tags": ["kurdî-erebî"]}], ) + + def test_kol_text(self): + self.wxr.wtp.add_page("Şablon:ziman", 10, "Kurmancî") + page_data = parse_page( + self.wxr, + "av", + """== {{ziman|ku}} == +=== Navdêr === +# [[vexwarin|Vexwarin]]a bê[[reng]] +==== Jê ==== +{{kol3|ku|cure=Jê +|kêmav +}}""", + ) + self.assertEqual(page_data[0]["derived"], [{"word": "kêmav"}]) From 6084e4b177a4a06a69758f2b2ff443e78716a192 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 28 Jan 2025 16:55:38 +0800 Subject: [PATCH 3/4] [ku] combine other form and linkage section code --- src/wiktextract/extractor/ku/linkage.py | 111 +++++++++++++++++------- src/wiktextract/extractor/ku/models.py | 1 + src/wiktextract/extractor/ku/page.py | 9 +- tests/test_ku_linkage.py | 50 +++++++++++ 4 files changed, 137 insertions(+), 34 deletions(-) diff --git a/src/wiktextract/extractor/ku/linkage.py b/src/wiktextract/extractor/ku/linkage.py index b4a543f3..60074d8a 100644 --- a/src/wiktextract/extractor/ku/linkage.py +++ b/src/wiktextract/extractor/ku/linkage.py @@ -8,27 +8,6 @@ from .models import Form, Linkage, WordEntry -def extract_other_form_section( - wxr: WiktextractContext, - word_entry: WordEntry, - level_node: WikiNode, -) -> None: - for list_node in level_node.find_child(NodeKind.LIST): - for list_item in list_node.find_child(NodeKind.LIST_ITEM): - for node in list_item.find_child(NodeKind.TEMPLATE | NodeKind.LINK): - if isinstance(node, TemplateNode): - if node.template_name.startswith("ku-"): - extract_ku_form_template(wxr, word_entry, node) - elif node.template_name == "g": - extract_g_template(wxr, word_entry, node) - elif node.template_name in ["herwiha", "hw"]: - extract_hw_template(wxr, word_entry, node) - elif node.kind == NodeKind.LINK: - form = clean_node(wxr, None, node) - if form != "": - word_entry.forms.append(Form(form=form)) - - def extract_ku_form_template( wxr: WiktextractContext, word_entry: WordEntry, @@ -162,9 +141,12 @@ def extract_linkage_section( ): extract_kol_template(wxr, word_entry, node, linkage_type) elif isinstance(node, TemplateNode) and node.template_name == "stûn": - pass + extract_stûn_template(wxr, word_entry, node, linkage_type) elif node.kind == NodeKind.LIST: - pass + for list_item in node.find_child(NodeKind.LIST_ITEM): + extract_linkage_list_item( + wxr, word_entry, list_item, linkage_type, "" + ) def extract_kol_template( @@ -179,12 +161,79 @@ def extract_kol_template( if arg not in t_node.template_parameters: break arg_value = t_node.template_parameters[arg] - if isinstance(arg_value, str) and arg_value.strip() != "": - getattr(word_entry, linkage_type).append( - Linkage(word=arg_value.strip(), sense=sense) + if isinstance(arg_value, str): + if arg_value.strip() != "": + if linkage_type != "": + getattr(word_entry, linkage_type).append( + Linkage(word=arg_value.strip(), sense=sense) + ) + else: + word_entry.forms.append(Form(form=arg_value.strip())) + else: + if not isinstance(arg_value, list): + arg_value = [arg_value] + if ( + len(arg_value) > 0 + and isinstance(arg_value[0], str) + and arg_value[0].strip() == "" + ): + arg_value.pop(0) # not preformatted node + arg_value_node = wxr.wtp.parse(wxr.wtp.node_to_wikitext(arg_value)) + extract_linkage_list_item( + wxr, word_entry, arg_value_node, linkage_type, sense + ) + + +def extract_linkage_list_item( + wxr: WiktextractContext, + word_entry: WordEntry, + list_item: WikiNode, + linkage_type: str, + sense: str, +) -> None: + for node in list_item.children: + if isinstance(node, WikiNode) and node.kind == NodeKind.LINK: + word = clean_node(wxr, None, node) + if word != "": + if linkage_type != "": + getattr(word_entry, linkage_type).append( + Linkage(word=word, sense=sense) + ) + else: + word_entry.forms.append(Form(form=word)) + elif isinstance(node, TemplateNode): + if node.template_name == "g": + extract_g_template(wxr, word_entry, node, linkage_type) + elif node.template_name.startswith("ku-"): + extract_ku_form_template( + wxr, + word_entry, + node, + linkage_type=linkage_type, + sense=sense, + ) + elif node.template_name in ["herwiha", "hw"]: + extract_hw_template( + wxr, + word_entry, + node, + linkage_type=linkage_type, + sense=sense, + ) + + +def extract_stûn_template( + wxr: WiktextractContext, + word_entry: WordEntry, + t_node: TemplateNode, + linkage_type: str, +) -> None: + first_arg = t_node.template_parameters.get(1) + if first_arg is None: + return + first_arg = wxr.wtp.parse(wxr.wtp.node_to_wikitext(first_arg)) + for list_node in first_arg.find_child(NodeKind.LIST): + for list_item in list_node.find_child(NodeKind.LIST_ITEM): + extract_linkage_list_item( + wxr, word_entry, list_item, linkage_type, "" ) - elif ( - isinstance(arg_value, TemplateNode) - and arg_value.template_name == "g" - ): - extract_g_template(wxr, word_entry, arg_value, linkage_type) diff --git a/src/wiktextract/extractor/ku/models.py b/src/wiktextract/extractor/ku/models.py index a3a29c31..07f1d0fc 100644 --- a/src/wiktextract/extractor/ku/models.py +++ b/src/wiktextract/extractor/ku/models.py @@ -38,6 +38,7 @@ class Form(KurdishBaseModel): raw_tags: list[str] = [] roman: str = "" translation: str = "" + sense: str = "" class Translation(KurdishBaseModel): diff --git a/src/wiktextract/extractor/ku/page.py b/src/wiktextract/extractor/ku/page.py index 5b72f80a..cb92bee3 100644 --- a/src/wiktextract/extractor/ku/page.py +++ b/src/wiktextract/extractor/ku/page.py @@ -6,7 +6,7 @@ from ...page import clean_node from ...wxr_context import WiktextractContext from .etymology import extract_etymology_section -from .linkage import extract_linkage_section, extract_other_form_section +from .linkage import extract_linkage_section from .models import Sense, WordEntry from .pos import extract_pos_section from .section_titles import LINKAGE_SECTIONS, POS_DATA @@ -33,8 +33,11 @@ def parse_section( wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node ) elif title_text in ["Bi alfabeyên din", "Herwiha", "Bide ber"]: - extract_other_form_section( - wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node + extract_linkage_section( + wxr, + page_data[-1] if len(page_data) > 0 else base_data, + level_node, + "", ) elif title_text in LINKAGE_SECTIONS: extract_linkage_section( diff --git a/tests/test_ku_linkage.py b/tests/test_ku_linkage.py index 193c975c..02dec30b 100644 --- a/tests/test_ku_linkage.py +++ b/tests/test_ku_linkage.py @@ -56,3 +56,53 @@ def test_kol_text(self): }}""", ) self.assertEqual(page_data[0]["derived"], [{"word": "kêmav"}]) + + def test_stûn_link(self): + self.wxr.wtp.add_page("Şablon:ziman", 10, "Kurmancî") + page_data = parse_page( + self.wxr, + "se", + """== {{ziman|ku}} == +=== Navdêr === +# [[ajal|Ajal]]ek +==== Hevmane ==== +{{stûn| +* [[kûçik]] +}}""", + ) + self.assertEqual(page_data[0]["synonyms"], [{"word": "kûçik"}]) + + def test_stûn_g(self): + self.wxr.wtp.add_page("Şablon:ziman", 10, "Kurmancî") + page_data = parse_page( + self.wxr, + "dar", + """== {{ziman|ku}} == +=== Navdêr 1 === +# [[riwek|Riwek]]eke +==== Hevmane ==== +{{stûn| +* [[kûçik]] +}}""", + ) + self.assertEqual(page_data[0]["synonyms"], [{"word": "kûçik"}]) + + def test_kol_hw(self): + self.wxr.wtp.add_page("Şablon:ziman", 10, "Kurmancî") + self.wxr.wtp.add_page( + "Şablon:hw", + 10, + """[[pisik#Kurmancî|pisik]] – ''[[w:Reşoyî (devok)|Reşwî]]''""", + ) + page_data = parse_page( + self.wxr, + "pisîk", + """== {{ziman|ku}} == +=== Navdêr === +# [[heywan|Heywanek]] +==== Hevmane ==== +{{kol2|ku|cure=Herwiha +| {{hw|ku|pisik||Reşwî}} +}}""", + ) + self.assertEqual(page_data[0]["synonyms"], [{"word": "pisik"}]) From d9a474b461afe65ed4daf2186ea1b871d4610e71 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 28 Jan 2025 17:37:05 +0800 Subject: [PATCH 4/4] [ku] translate tags used in "g" template --- src/wiktextract/extractor/ku/linkage.py | 39 +++++++++++++++++++++---- src/wiktextract/extractor/ku/tags.py | 39 +++++++++++++++++++++++++ tests/test_ku_linkage.py | 32 ++++++++++++++++++++ 3 files changed, 105 insertions(+), 5 deletions(-) create mode 100644 src/wiktextract/extractor/ku/tags.py diff --git a/src/wiktextract/extractor/ku/linkage.py b/src/wiktextract/extractor/ku/linkage.py index 60074d8a..d940e733 100644 --- a/src/wiktextract/extractor/ku/linkage.py +++ b/src/wiktextract/extractor/ku/linkage.py @@ -6,6 +6,7 @@ from ...page import clean_node from ...wxr_context import WiktextractContext from .models import Form, Linkage, WordEntry +from .tags import translate_raw_tags def extract_ku_form_template( @@ -43,7 +44,18 @@ def extract_g_template( t_node: TemplateNode, linkage_type: str = "", sense: str = "", + raw_tags: list[str] = [], ) -> None: + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(t_node), expand_all=True + ) + for span_tag in expanded_node.find_html( + "span", attr_name="class", attr_value="gender" + ): + for abbr_tag in span_tag.find_html("abbr"): + raw_tag = clean_node(wxr, None, abbr_tag) + if raw_tag not in ["", "?"]: + raw_tags.append(raw_tag) if linkage_type == "": form = Form( form=clean_node( @@ -59,8 +71,10 @@ def extract_g_template( translation=clean_node( wxr, None, t_node.template_parameters.get("w", "") ), + raw_tags=raw_tags, ) if form.form != "": + translate_raw_tags(form) word_entry.forms.append(form) else: l_data = Linkage( @@ -78,8 +92,10 @@ def extract_g_template( wxr, None, t_node.template_parameters.get("w", "") ), sense=sense, + raw_tags=raw_tags, ) if l_data.word != "": + translate_raw_tags(l_data) getattr(word_entry, linkage_type).append(l_data) @@ -191,19 +207,28 @@ def extract_linkage_list_item( linkage_type: str, sense: str, ) -> None: + raw_tags = [] for node in list_item.children: if isinstance(node, WikiNode) and node.kind == NodeKind.LINK: word = clean_node(wxr, None, node) if word != "": if linkage_type != "": - getattr(word_entry, linkage_type).append( - Linkage(word=word, sense=sense) - ) + l_data = Linkage(word=word, sense=sense, raw_tags=raw_tags) + translate_raw_tags(l_data) + getattr(word_entry, linkage_type).append(l_data) else: - word_entry.forms.append(Form(form=word)) + form = Form(form=word, raw_tags=raw_tags) + translate_raw_tags(form) + word_entry.forms.append(form) elif isinstance(node, TemplateNode): if node.template_name == "g": - extract_g_template(wxr, word_entry, node, linkage_type) + extract_g_template( + wxr, + word_entry, + node, + linkage_type=linkage_type, + raw_tags=raw_tags, + ) elif node.template_name.startswith("ku-"): extract_ku_form_template( wxr, @@ -220,6 +245,10 @@ def extract_linkage_list_item( linkage_type=linkage_type, sense=sense, ) + elif node.template_name == "mj": + raw_tag = clean_node(wxr, None, node).strip("() ") + if raw_tag != "": + raw_tags.append(raw_tag) def extract_stûn_template( diff --git a/src/wiktextract/extractor/ku/tags.py b/src/wiktextract/extractor/ku/tags.py new file mode 100644 index 00000000..8cab2f8e --- /dev/null +++ b/src/wiktextract/extractor/ku/tags.py @@ -0,0 +1,39 @@ +from .models import WordEntry + +GENDER_NUMBER_TAGS = { + # https://ku.wiktionary.org/wiki/Modul:gender_and_number + "m": "feminine", + "n": "masculine", + "nt": "neuter", + "g": "common-gender", + "anim": "animate", + "inan": "inanimate", + "animal": "animal-not-person", + "pers": "personal", + "npers": "impersonal", + "vir": "virile", + "nvir": "nonvirile", + "yj": "singular", + "du": "dual", + "pj": "plural", + "impf": "imperfective", + "pf": "perfective", + "gh": "transitive", + "ngh": "intransitive", +} + +TAGS = {**GENDER_NUMBER_TAGS} + + +def translate_raw_tags(data: WordEntry) -> None: + raw_tags = [] + for raw_tag in data.raw_tags: + if raw_tag in TAGS and hasattr(data, "tags"): + tr_tag = TAGS[raw_tag] + if isinstance(tr_tag, str): + data.tags.append(tr_tag) + elif isinstance(tr_tag, list): + data.tags.extend(tr_tag) + else: + raw_tags.append(raw_tag) + data.raw_tags = raw_tags diff --git a/tests/test_ku_linkage.py b/tests/test_ku_linkage.py index 02dec30b..f6562177 100644 --- a/tests/test_ku_linkage.py +++ b/tests/test_ku_linkage.py @@ -106,3 +106,35 @@ def test_kol_hw(self): }}""", ) self.assertEqual(page_data[0]["synonyms"], [{"word": "pisik"}]) + + def test_mj(self): + self.wxr.wtp.add_page("Şablon:ziman", 10, "Kurmancî") + self.wxr.wtp.add_page( + "Şablon:g", + 10, + """[[pişê#Kurmancî|pişê]] m""", + ) + self.wxr.wtp.add_page( + "Şablon:mj", + 10, + """([[{{{1}}}]])""", + ) + page_data = parse_page( + self.wxr, + "pisîk", + """== {{ziman|ku}} == +=== Navdêr === +# [[heywan|Heywanek]] +==== Hevmane ==== +* {{mj|zimanê zarokan}} {{g|ku|pişê|z=m}}""", + ) + self.assertEqual( + page_data[0]["synonyms"], + [ + { + "word": "pişê", + "tags": ["feminine"], + "raw_tags": ["zimanê zarokan"], + } + ], + )