Inline images should print alt= text

If an image is inline (it doesn't have any of a set of arguments, like 'right' or 'thumb', that would make it non-inline), then its `alt=` text should be printed when returned from clean_value. To make it clearer that this is an `alt` text, and to make post-processing easier, we add `[Alt: ` + `]` around the text.
tatuylonen · Mar 12, 2024 · effcbd6 · effcbd6
1 parent e548f3a
commit effcbd6
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 31 deletions.
diff --git a/src/wiktextract/clean.py b/src/wiktextract/clean.py
@@ -1326,6 +1326,9 @@ def remove_italic_and_bold(text: str) -> str:
     new_text_parts = new_text_parts[:-1]  # remove last \n
     return "".join(new_text_parts)
 
+# regex to find File/Image link attributes that would mean an image
+# is *not* inline
+inline_re = re.compile(r"\|\s*(right|left|center|thumb|frame)\s*\|")
 
 def clean_value(
     wxr: WiktextractContext, title: str, no_strip=False, no_html_strip=False
@@ -1357,7 +1360,13 @@ def repl_link(m: re.Match) -> str:
 
     def repl_link_bars(m: re.Match) -> str:
         lnk = m.group(1)
-        if re.match(r"(?si)(File|Image)\s*:", lnk):
+        if wxr.wtp.file_aliases_re.match(lnk):
+            # Handle File / Image / Fichier 'links' here.
+            if not inline_re.match(m.group(0)) and "alt" in m.group(0):
+                # This image should be inline, so let's print its alt text
+                alt_m = re.search(r"\|\s*alt\s*=([^]|]+)(\||\]\])", m.group(0))
+                if alt_m is not None:
+                    return "[Alt: " + alt_m.group(1) + "]"
             return ""
         # m.group(5) is always the last matching group because you can
         # only access the last matched group; the indexes don't 'grow'

diff --git a/src/wiktextract/form_descriptions.py b/src/wiktextract/form_descriptions.py
@@ -1029,32 +1029,6 @@ def decode_tags(
 
     return tagsets, topics
 
-    # Kludge to a wide-spread problem with Latin, where a lot of
-    # "indicative/imperative" style combinations aren't in
-    # xlat_tags_map. Instead of adding every possible combination
-    # manually, we look if there are any slashes in the string,
-    # then check for valid stuff in xlat_tags_map (like
-    # "first/third-person"), and if not, split on "/"
-    # and append on the string; will definitely give errors,
-    # but less of them.
-    # new_parts = []
-    # for part in parts:
-    #     new_seg = ""
-    #     if "/" in part:
-    #         for w in part.split():
-    #             if w in xlat_tags_map:
-    #                 new_seg += w + " "
-    #             elif "/" in w:
-    #                 for ww in w.split("/"):
-    #                     new_seg += ww + " "
-    #             else:
-    #                 new_seg += w + " "
-    #     else:
-    #         new_parts.append(part)
-    #         continue
-    #     new_parts.append(new_seg.strip())
-    # parts = new_parts
-
 
 def decode_tags1(
     src: str,
@@ -2437,9 +2411,6 @@ def strokes_repl(m):
                     related = alt_related
                     tagsets = alt_tagsets
 
-
-
-
             # print("FORM END: tagsets={} related={}".format(tagsets, related))
             if not tagsets:
                 continue
@@ -2540,7 +2511,6 @@ def strokes_repl(m):
                     prev_tags = tagsets
                     following_tags = None
 
-
     # Finally, if we collected hirakana/katakana, add them now
     if hiragana:
         add_related(

diff --git a/tests/test_clean.py b/tests/test_clean.py
@@ -118,6 +118,15 @@ def test_cv_link11(self):
         v = clean_value(self.wxr, v)
         self.assertEqual(v, "Alt Text")
 
+    def test_cv_link12(self):
+        # If a File, Image or Wtp.file_alias link (an image) has none
+        # of the parameters (left, right, thumb etc.) that would make
+        # it non-inline, it is an inline image and its alt= text
+        # should be printed as [Alt: ...].
+        v = "[[File:bar.JPG|conf bar|baz|baz2|baz3|baz4|alt=Bar]]"
+        v = clean_value(self.wxr, v)
+        self.assertEqual(v, "[Alt: Bar]")
+
     def test_cv_url1(self):
         v = "This is a [http://ylonen.org test]."
         v = clean_value(self.wxr, v)