diff --git a/src/wiktextract/extractor/es/gloss.py b/src/wiktextract/extractor/es/gloss.py
new file mode 100644
index 000000000..51562d419
--- /dev/null
+++ b/src/wiktextract/extractor/es/gloss.py
@@ -0,0 +1,60 @@
+import re
+from typing import List
+from wiktextract.extractor.es.models import Sense, WordEntry
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+from wikitextprocessor import WikiNode, NodeKind
+from wikitextprocessor.parser import WikiNodeChildrenList
+
+
+def extract_gloss(
+    wxr: WiktextractContext,
+    page_data: List[WordEntry],
+    list_node: WikiNode,
+) -> None:
+    for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+        gloss_data = Sense(glosses=[])
+
+        # Collect nested list nodes separately; everything else is gloss text.
+        definition: WikiNodeChildrenList = []
+        other: WikiNodeChildrenList = []
+
+        for node in list_item.definition:
+            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+                other.append(node)
+            else:
+                definition.append(node)
+
+        gloss = clean_node(wxr, gloss_data, definition)
+        gloss_data.glosses.append(gloss)
+
+        # The list item text before the colon holds the sense number and tags.
+        gloss_note = clean_node(wxr, gloss_data, list_item.children)
+
+        match = re.match(r"^(\d+)", gloss_note)
+
+        if match:
+            gloss_data.senseid = int(match.group(1))
+            tag_string = gloss_note[len(match.group(1)) :].strip()
+        else:
+            tag_string = gloss_note.strip()
+
+        # Split the tag string on commas or "y".
+        tags = re.split(r",|y", tag_string)
+        for tag in tags:
+            tag = (
+                tag.strip()
+                .removesuffix(".")
+                .removesuffix("Main")
+                .removeprefix("Main")
+            )
+            if tag:
+                gloss_data.tags.append(tag)
+
+        if other:
+            wxr.wtp.debug(
+                f"Found nodes that are not part of definition: {other}",
+                sortid="extractor/es/gloss/extract_gloss/46",
+            )
+
+        page_data[-1].senses.append(gloss_data)
diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py
index 1250bedb2..4f695b301 100644
--- a/src/wiktextract/extractor/es/models.py
+++ b/src/wiktextract/extractor/es/models.py
@@ -79,6 +79,9 @@ class Sense(LoggingExtraFieldsModel):
     subsenses: list["Sense"] = Field(
         default=[], description="List of subsenses"
     )
+    senseid: Optional[int] = Field(
+        default=None, description="Sense number used in Wiktionary"
+    )
 
 
 class WordEntry(LoggingExtraFieldsModel):
diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py
index 538a94bed..3d7642256 100644
--- a/src/wiktextract/extractor/es/page.py
+++ b/src/wiktextract/extractor/es/page.py
@@ -5,6 +5,7 @@
 from wikitextprocessor import NodeKind, WikiNode
 
 from wiktextract.datautils import append_base_data
+from wiktextract.extractor.es.gloss import extract_gloss
 from wiktextract.extractor.es.pronunciation import extract_pronunciation
 from wiktextract.extractor.es.models import WordEntry, PydanticLogger
 
@@ -76,9 +77,13 @@ def process_pos_block(
         ):
             # XXX: Extract forms
             pass
-        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
-            # XXX: Extract data
-            pass
+        elif (
+            isinstance(child, WikiNode)
+            and child.kind == NodeKind.LIST
+            and child.sarg == ";"
+        ):
+            extract_gloss(wxr, page_data, child)
+
         else:
             # XXX: Extract data
             pass
diff --git a/tests/test_es_gloss.py b/tests/test_es_gloss.py
new file mode 100644
index 000000000..ed3cbe487
--- /dev/null
+++ b/tests/test_es_gloss.py
@@ -0,0 +1,88 @@
+from typing import List
+import unittest
+
+from wikitextprocessor import Wtp
+from wiktextract.extractor.es.gloss import extract_gloss
+from wiktextract.extractor.es.models import WordEntry
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestESGloss(unittest.TestCase):
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="es"),
+            WiktionaryConfig(dump_file_lang_code="es"),
+        )
+
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()
+
+    def get_default_page_data(self) -> List[WordEntry]:
+        return [WordEntry(word="test", lang_code="es", lang_name="Language")]
+
+    def test_es_extract_glosses(self):
+        # https://es.wiktionary.org/wiki/ayudar
+
+        self.wxr.wtp.add_page("Plantilla:plm", 10, "Contribuir")
+        self.wxr.wtp.start_page("")
+
+        root = self.wxr.wtp.parse(
+            """;1: {{plm|contribuir}} [[esfuerzo]] o [[recurso]]s para la [[realización]] de algo.
+;2: Por antonomasia, [[cooperar]] a que alguno [[salir|salga]] de una [[situación]] [[dificultoso|dificultosa]]"""
+        )
+
+        page_data = self.get_default_page_data()
+
+        extract_gloss(self.wxr, page_data, root.children[0])
+
+        self.assertEqual(
+            page_data[0].model_dump(exclude_defaults=True)["senses"],
+            [
+                {
+                    "glosses": [
+                        "Contribuir esfuerzo o recursos para la realización de algo."
+                    ],
+                    "senseid": 1,
+                },
+                {
+                    "glosses": [
+                        "Por antonomasia, cooperar a que alguno salga de una situación dificultosa"
+                    ],
+                    "senseid": 2,
+                },
+            ],
+        )
+
+    def test_es_extract_gloss_categories(self):
+        # https://es.wiktionary.org/wiki/amor
+        self.wxr.wtp.add_page("Plantilla:plm", 10, "Sentimiento")
+        self.wxr.wtp.add_page(
+            "Plantilla:sentimientos",
+            10,
+            "Humanidades. [[Categoría:ES:Sentimientos]]",
+        )
+        self.wxr.wtp.start_page("")
+
+        root = self.wxr.wtp.parse(
+            ";1 {{sentimientos}}: {{plm|sentimiento}} [[afectivo]] de [[atracción]], [[unión]] y [[afinidad]] que se experimenta hacia una persona, animal o cosa"
+        )
+
+        page_data = self.get_default_page_data()
+
+        extract_gloss(self.wxr, page_data, root.children[0])
+
+        self.assertEqual(
+            page_data[0].model_dump(exclude_defaults=True)["senses"],
+            [
+                {
+                    "glosses": [
+                        "Sentimiento afectivo de atracción, unión y afinidad que se experimenta hacia una persona, animal o cosa"
+                    ],
+                    "senseid": 1,
+                    "tags": ["Humanidades."],
+                    "categories": ["ES:Sentimientos"],
+                }
+            ],
+        )