Skip to content

Commit

Permalink
Inline images should print alt= text
Browse files Browse the repository at this point in the history
If an image is inline (it doesn't have any of a set
of arguments, like 'right' or 'thumb' etc., that would
take it out of the text flow), then its `alt=` text
should be printed when returned from clean_value.

To make it clearer that this is an `alt` text, and to
make post-processing easier, we add `[Alt: ` + `]` around
the text.
  • Loading branch information
kristian-clausal committed Mar 12, 2024
1 parent e548f3a commit effcbd6
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 31 deletions.
11 changes: 10 additions & 1 deletion src/wiktextract/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -1326,6 +1326,9 @@ def remove_italic_and_bold(text: str) -> str:
new_text_parts = new_text_parts[:-1] # remove last \n
return "".join(new_text_parts)

# Regex that recognises File/Image link parameters which mean an image
# is *not* inline (it is floated, centered, a thumbnail or framed).
#
# The pattern is applied with `.match()` to the complete link text, and
# `.match()` only succeeds at the very start of the string. A link never
# begins with `|`, so the pattern first lazily skips whatever precedes
# the parameter (`(?s)` lets that skip cross newlines). The keyword must
# be a whole parameter: delimited by `|` on the left and by either `|`
# or the closing `]]` on the right, so that it is also found when it is
# the last parameter. Group 1 is the keyword that was found.
inline_re = re.compile(
    r"(?s).*?\|\s*(right|left|center|thumb|frame)\s*(?:\||\]\])"
)

def clean_value(
wxr: WiktextractContext, title: str, no_strip=False, no_html_strip=False
Expand Down Expand Up @@ -1357,7 +1360,13 @@ def repl_link(m: re.Match) -> str:

def repl_link_bars(m: re.Match) -> str:
lnk = m.group(1)
if re.match(r"(?si)(File|Image)\s*:", lnk):
if wxr.wtp.file_aliases_re.match(lnk):
# Handle File / Image / Fichier 'links' here.
if not inline_re.match(m.group(0)) and "alt" in m.group(0):
# This image should be inline, so let's print its alt text
alt_m = re.search(r"\|\s*alt\s*=([^]|]+)(\||\]\])", m.group(0))
if alt_m is not None:
return "[Alt: " + alt_m.group(1) + "]"
return ""
# m.group(5) is always the last matching group because you can
# only access the last matched group; the indexes don't 'grow'
Expand Down
30 changes: 0 additions & 30 deletions src/wiktextract/form_descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1029,32 +1029,6 @@ def decode_tags(

return tagsets, topics

# Kludge to a wide-spread problem with Latin, where a lot of
# "indicative/imperative" style combinations aren't in
# xlat_tags_map. Instead of adding every possible combination
# manually, we look if there are any slashes in the string,
# then check for valid stuff in xlat_tags_map (like
# "first/third-person"), and if not, split on "/"
# and append on the string; will definitely give errors,
# but less of them.
# new_parts = []
# for part in parts:
# new_seg = ""
# if "/" in part:
# for w in part.split():
# if w in xlat_tags_map:
# new_seg += w + " "
# elif "/" in w:
# for ww in w.split("/"):
# new_seg += ww + " "
# else:
# new_seg += w + " "
# else:
# new_parts.append(part)
# continue
# new_parts.append(new_seg.strip())
# parts = new_parts


def decode_tags1(
src: str,
Expand Down Expand Up @@ -2437,9 +2411,6 @@ def strokes_repl(m):
related = alt_related
tagsets = alt_tagsets




# print("FORM END: tagsets={} related={}".format(tagsets, related))
if not tagsets:
continue
Expand Down Expand Up @@ -2540,7 +2511,6 @@ def strokes_repl(m):
prev_tags = tagsets
following_tags = None


# Finally, if we collected hiragana/katakana, add them now
if hiragana:
add_related(
Expand Down
9 changes: 9 additions & 0 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,15 @@ def test_cv_link11(self):
v = clean_value(self.wxr, v)
self.assertEqual(v, "Alt Text")

def test_cv_link12(self):
    # A File, Image or Wtp.file_alias link (an image) that carries none
    # of the parameters that would take it out of the text flow (left,
    # right, thumb etc.) is an inline image, and its alt= text should
    # be printed wrapped as [Alt: ...]
    wikitext = "[[File:bar.JPG|conf bar|baz|baz2|baz3|baz4|alt=Bar]]"
    self.assertEqual(clean_value(self.wxr, wikitext), "[Alt: Bar]")

def test_cv_url1(self):
v = "This is a [http://ylonen.org test]."
v = clean_value(self.wxr, v)
Expand Down

0 comments on commit effcbd6

Please sign in to comment.