Skip to content

Commit

Permalink
Merge pull request #519 from xxyzz/zh
Browse files (browse the repository at this point in the history)
Add `raw_tags` fields to zh edition pydantic models
  • Loading branch information
xxyzz authored Feb 27, 2024
2 parents: 2c2466d + 33aa497; commit 10eb259
Show file tree
Hide file tree
Showing 16 changed files with 69 additions and 45 deletions.
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/zh/descendant.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def extract_descendant_list_item(
descendant_data.ruby = ruby_data
descendant_data.word = clean_node(wxr, None, child_node)
if "qualifier-content" in class_names:
descendant_data.tags.append(
descendant_data.raw_tags.append(
clean_node(wxr, None, child_node)
)
elif child_node.tag == "i":
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/zh/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,6 @@ def extract_gloss_and_tags(raw_gloss: str) -> Sense:
tags += re.split(split_tag_regex, rear_label)

gloss = raw_gloss[front_tag_end + 1 : rear_tag_start].strip()
return Sense(glosses=[gloss], raw_glosses=[raw_gloss], tags=tags)
return Sense(glosses=[gloss], raw_glosses=[raw_gloss], raw_tags=tags)
else:
return Sense(glosses=[raw_gloss])
24 changes: 18 additions & 6 deletions src/wiktextract/extractor/zh/headword_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,10 @@ def extract_headword_line(
forms_start_index = index + 1
for abbr_tag in child.find_html("abbr"):
gender = abbr_tag.children[0]
page_data[-1].tags.append(GENDERS.get(gender, gender))
if gender in GENDERS:
page_data[-1].tags.append(GENDERS[gender])
else:
page_data[-1].raw_tags.append(gender)
if lang_code == "ja":
for span_child in child.find_html(
"strong", attr_name="class", attr_value="headword"
Expand Down Expand Up @@ -133,9 +136,10 @@ def process_forms_text(
form = clean_node(wxr, None, node_without_ruby)
else:
form = clean_node(wxr, None, node)
form_tags = extract_headword_tags(
raw_form_tags = extract_headword_tags(
clean_node(wxr, None, tag_nodes).strip("() ")
)
form_tags = []
# check if next tag has gender data
if index < len(striped_nodes) - 1:
next_node = striped_nodes[index + 1]
Expand All @@ -146,9 +150,17 @@ def process_forms_text(
and "gender" in next_node.attrs.get("class", "")
):
gender = clean_node(wxr, None, next_node)
form_tags.append(GENDERS.get(gender, gender))

form_data = Form(form=form, tags=form_tags, ruby=ruby_data)
if gender in GENDERS:
form_tags.append(GENDERS[gender])
else:
raw_form_tags.append(gender)

form_data = Form(
form=form,
raw_tags=raw_form_tags,
tags=form_tags,
ruby=ruby_data,
)
page_data[-1].forms.append(form_data)
elif (
node.tag == "span"
Expand All @@ -167,7 +179,7 @@ def process_forms_text(
clean_node(wxr, page_data[-1], tag_nodes).strip("() ")
)
if len(tags_list) > 0:
page_data[-1].tags.extend(tags_list)
page_data[-1].raw_tags.extend(tags_list)
else:
clean_node(wxr, page_data[-1], tag_nodes) # find categories

Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/zh/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,14 @@ def extract_ja_i_template(
table_header = clean_node(wxr, None, child.children)
else:
inflection_data = Form(
tags=[table_header], source="inflection"
raw_tags=[table_header], source="inflection"
)
cell_node_index = 0
keys = ["form", "hiragana", "roman"]
for row_child in child.children:
if isinstance(row_child, WikiNode):
if row_child.kind == NodeKind.TABLE_HEADER_CELL:
inflection_data.tags.append(
inflection_data.raw_tags.append(
clean_node(wxr, None, row_child)
)
elif row_child.kind == NodeKind.TABLE_CELL:
Expand Down
6 changes: 3 additions & 3 deletions src/wiktextract/extractor/zh/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def extract_linkages(
return sense
elif template_name in {"qualifier", "qual"}:
not_term_indexes.add(index)
linkage_data.tags.append(
linkage_data.raw_tags.append(
clean_node(wxr, None, item_child).strip("()")
)
elif template_name.lower() in DESCENDANT_TEMPLATES:
Expand Down Expand Up @@ -165,7 +165,7 @@ def extract_saurus_template(
if thesaurus.roman is not None:
linkage_data.roman = thesaurus.roman
if thesaurus.tags is not None:
linkage_data.tags = thesaurus.tags.split("|")
linkage_data.raw_tags = thesaurus.tags.split("|")
if thesaurus.language_variant is not None:
linkage_data.language_variant = thesaurus.language_variant
if len(sense) > 0:
Expand All @@ -192,7 +192,7 @@ def extract_zh_dial_template(
if len(sense) > 0:
linkage_data.sense = sense
if len(tags) > 0:
linkage_data.tags = tags
linkage_data.raw_tags = tags
pre_data = getattr(page_data[-1], linkage_type)
pre_data.append(linkage_data)

Expand Down
7 changes: 7 additions & 0 deletions src/wiktextract/extractor/zh/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class Sense(ChineseBaseModel):
glosses: list[str] = []
raw_glosses: list[str] = Field([], description="Gloss text without tags")
tags: list[str] = []
raw_tags: list[str] = []
categories: list[str] = []
examples: list[Example] = []
ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
Expand All @@ -41,6 +42,7 @@ class Sense(ChineseBaseModel):
class Form(ChineseBaseModel):
form: str = ""
tags: list[str] = []
raw_tags: list[str] = []
source: str = ""
ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
hiragana: str = ""
Expand All @@ -57,6 +59,7 @@ class Sound(ChineseBaseModel):
mp3_url: str = ""
opus_url: str = ""
tags: list[str] = []
raw_tags: list[str] = []
homophone: str = ""


Expand All @@ -68,6 +71,7 @@ class Translation(ChineseBaseModel):
word: str = Field(description="Translation term")
sense: str = Field("", description="Translation gloss")
tags: list[str] = []
raw_tags: list[str] = []
roman: str = Field("", description="Roman script")
alt: str = Field("", description="Alternative form")
lit: str = Field("", description="Literal translation for the term")
Expand All @@ -76,6 +80,7 @@ class Translation(ChineseBaseModel):
class Linkage(ChineseBaseModel):
word: str = ""
tags: list[str] = []
raw_tags: list[str] = []
roman: str = ""
sense: str = ""
language_variant: Literal["", "zh-Hant", "zh-Hans"] = Field(
Expand All @@ -90,6 +95,7 @@ class Descendant(ChineseBaseModel):
word: str = ""
roman: str = ""
tags: list[str] = []
raw_tags: list[str] = []
descendants: list["Descendant"] = []
ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")

Expand Down Expand Up @@ -126,6 +132,7 @@ class WordEntry(ChineseBaseModel):
categories: list[str] = []
notes: list[str] = []
tags: list[str] = []
raw_tags: list[str] = []
descendants: list[Descendant] = []
redirects: list[str] = Field(
[],
Expand Down
10 changes: 5 additions & 5 deletions src/wiktextract/extractor/zh/pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def extract_pronunciation_recursively(
last_sounds_list = page_data[-1].sounds
for index in range(len(last_sounds_list)):
if last_sounds_list[index].audio == "" and (
tags == last_sounds_list[index].tags[:-1]
tags == last_sounds_list[index].raw_tags[:-1]
or lang_code != "zh"
):
for key, value in create_audio_url_dict(data).items():
Expand All @@ -69,7 +69,7 @@ def extract_pronunciation_recursively(
base_data,
lang_code,
rest_children,
data.tags[:-1],
data.raw_tags[:-1],
)
elif isinstance(data, list):
# list item is a tag
Expand Down Expand Up @@ -146,7 +146,7 @@ def extract_pronunciation_item(
tags, split_pronunciation_tags(sound_tags_text)
)
if len(ipa) > 0:
data = Sound(tags=new_tags)
data = Sound(raw_tags=new_tags)
ipa_key = "zh_pron" if lang_code == "zh" else "ipa"
setattr(data, ipa_key, ipa[0].strip())
return data
Expand Down Expand Up @@ -176,7 +176,7 @@ def process_homophone_data(
"span", attr_name="lang"
):
sound_data = Sound(
homophone=clean_node(wxr, None, span_node), tags=tags
homophone=clean_node(wxr, None, span_node), raw_tags=tags
)
page_data[-1].sounds.append(sound_data)
elif (
Expand All @@ -190,6 +190,6 @@ def process_homophone_data(
"span", attr_name="lang"
):
sound_data = Sound(
homophone=clean_node(wxr, None, span_node), tags=tags
homophone=clean_node(wxr, None, span_node), raw_tags=tags
)
page_data[-1].sounds.append(sound_data)
1 change: 0 additions & 1 deletion src/wiktextract/extractor/zh/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,6 @@
"上位語": "hypernyms",
"上位词": "hypernyms",
"上義詞": "hypernyms",
"上义词": "hypernyms",
"下义词": "hyponyms",
"下位詞": "hyponyms",
"下位語": "hyponyms",
Expand Down
8 changes: 4 additions & 4 deletions src/wiktextract/extractor/zh/thesaurus.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ def parse_ja_thesaurus_term(
linkage: Optional[str],
term_str: str,
) -> list[ThesaurusTerm]:
tags = None
# tags = None
roman = None
if term_str.startswith("("): # has qualifier
qual_bracket_idx = term_str.find(")")
tags = "|".join(term_str[1:qual_bracket_idx].split(", "))
# tags = "|".join(term_str[1:qual_bracket_idx].split(", "))
term_str = term_str[qual_bracket_idx + 2 :]

thesaurus = []
Expand All @@ -54,7 +54,7 @@ def parse_ja_thesaurus_term(
pos=pos,
linkage=linkage,
term=term,
tags=tags,
# tags=tags,
roman=roman,
sense=sense,
)
Expand Down Expand Up @@ -93,7 +93,7 @@ def parse_zh_thesaurus_term(
pos=pos,
linkage=linkage,
term=variant_term,
tags="|".join(tags) if len(tags) > 0 else None,
# tags="|".join(tags) if len(tags) > 0 else None,
roman=roman,
sense=sense,
language_variant=variant_type,
Expand Down
6 changes: 3 additions & 3 deletions src/wiktextract/extractor/zh/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def process_translation_list_item(
if "gender" in class_str:
for abbr_tag in span_node.find_html("abbr"):
if len(abbr_tag.attrs.get("title")) > 0:
tr_data.tags.append(
tr_data.raw_tags.append(
clean_node(
wxr, None, abbr_tag.attrs.get("title")
)
Expand All @@ -129,12 +129,12 @@ def process_translation_list_item(
for span_node in expanded_template.find_html("span"):
tag = span_node.attrs.get("title", "")
if len(tag) > 0:
tr_data.tags.append(tag.strip())
tr_data.raw_tags.append(tag.strip())
find_title = True
if not find_title:
tag = clean_node(wxr, None, child)
if len(tag) > 0:
tr_data.tags.append(tag.strip("()"))
tr_data.raw_tags.append(tag.strip("()"))
elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
if len(tr_data.word) > 0:
page_data[-1].translations.append(tr_data.model_copy(deep=True))
Expand Down
2 changes: 1 addition & 1 deletion tests/test_zh_descendant.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def test_roman_only_list(self):
{
"lang_code": "za",
"lang": "壯語",
"tags": ["仿譯"],
"raw_tags": ["仿譯"],
"word": "mwngz ndei",
},
)
Expand Down
8 changes: 4 additions & 4 deletions tests/test_zh_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,22 +57,22 @@ def test_example_list(self) -> None:
{
"glosses": ["有趣的:", "有趣的"],
"raw_glosses": ["(棄用) 有趣的:"],
"tags": ["棄用"],
"raw_tags": ["棄用"],
},
{
"glosses": ["有趣的:", "美味的"],
"raw_glosses": ["(棄用) 有趣的:"],
"tags": ["棄用"],
"raw_tags": ["棄用"],
},
{
"glosses": ["有趣的:", "漂亮的"],
"raw_glosses": ["(棄用) 有趣的:"],
"tags": ["棄用"],
"raw_tags": ["棄用"],
},
{
"glosses": ["有趣的:", "很好的,卓越的"],
"raw_glosses": ["(棄用) 有趣的:"],
"tags": ["棄用"],
"raw_tags": ["棄用"],
},
],
)
Expand Down
16 changes: 11 additions & 5 deletions tests/test_zh_headword.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@ def test_english_headword(self) -> None:
"lang_code": "en",
"lang": "英語",
"forms": [
{"form": "manga", "tags": ["複數"]},
{"form": "mangas", "tags": ["複數"]},
{"form": "manga", "raw_tags": ["複數"]},
{"form": "mangas", "raw_tags": ["複數"]},
],
"tags": ["可數", "不可數"],
"raw_tags": ["可數", "不可數"],
}
],
)
Expand All @@ -70,8 +70,14 @@ def test_headword_gender(self) -> None:
"lang_code": "en",
"lang": "英語",
"forms": [
{"form": "manga's", "tags": ["複數"]},
{"form": "mangaatje", "tags": ["指小詞", "neuter"]},
{"form": "manga's", "raw_tags": ["複數"]},
{
"form": "mangaatje",
"raw_tags": [
"指小詞",
],
"tags": ["neuter"],
},
],
"tags": ["masculine"],
}
Expand Down
2 changes: 1 addition & 1 deletion tests/test_zh_inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_ja_i_template(self, mock_get_page) -> None:
"hiragana": "おかしかろ",
"roman": "okashikaro",
"source": "inflection",
"tags": ["基本形", "未然形"],
"raw_tags": ["基本形", "未然形"],
},
],
)
10 changes: 5 additions & 5 deletions tests/test_zh_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ def test_homophone_table(self):
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[0].sounds],
[
{"homophone": "大姑", "tags": ["同音詞"]},
{"homophone": "小姑", "tags": ["同音詞"]},
{"homophone": "大姑", "raw_tags": ["同音詞"]},
{"homophone": "小姑", "raw_tags": ["同音詞"]},
],
)

Expand All @@ -54,8 +54,8 @@ def test_homophone_template(self):
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[0].sounds],
[
{"homophone": "大矢", "tags": ["同音詞"]},
{"homophone": "大宅", "tags": ["同音詞"]},
{"homophone": "大谷", "tags": ["同音詞"]},
{"homophone": "大矢", "raw_tags": ["同音詞"]},
{"homophone": "大宅", "raw_tags": ["同音詞"]},
{"homophone": "大谷", "raw_tags": ["同音詞"]},
],
)
Loading

0 comments on commit 10eb259

Please sign in to comment.