]> git.neil.brown.name Git - edlib.git/commitdiff
Use w3m to decode HTML in email messages
authorNeilBrown <neil@brown.name>
Sun, 7 May 2023 01:45:16 +0000 (11:45 +1000)
committerNeilBrown <neil@brown.name>
Sun, 7 May 2023 01:45:16 +0000 (11:45 +1000)
Use "w3m -halfdump" and parse out the remaining markup
to give bold and links etc.

Signed-off-by: NeilBrown <neil@brown.name>
DOC/TODO.md
doc-email.c
python/lib-html-w3m.py [new file with mode: 0644]
python/module-notmuch.py

index 7126ad0928976425097107ae1ca55694322ffc9a..7c1afa4d08dbb50ef16f7abcd58575c8b81c9978 100644 (file)
@@ -6,7 +6,7 @@ Current priorities
 
 - [X] catch infinite loops more, particularly in b64/utf8/multipart movement
       e.g. cause all key_handle to fail if it has been 30 seconds since waiting for input.
-- [ ] w3m -halfdump -no-cookie -I UTF-8 -O UTF-8 -o ext_halfdump=1 
+- [X] w3m -halfdump -no-cookie -I UTF-8 -O UTF-8 -o ext_halfdump=1 -o display_image=off
          -o display_ins_del=2 -o pre_conv=1 -cols $COLS -T text/html 
        Then parse the output to display the text with highlights and
       links.
index af7975c871e6b7a18653a202f39589f193175725..962b78d3f264824f09dc5f6a49c7399b2635f0d3 100644 (file)
@@ -583,8 +583,11 @@ static bool handle_text(struct pane *p safe, char *type, char *xfer, char *disp,
                asprintf(&ctype, "%1.*s/%1.*s", majlen, major, minlen, minor);
        else
                asprintf(&ctype, "%1.*s", majlen, major);
-       if (ctype && strcasecmp(ctype, "text/html") == 0)
-               transformed = call_ret(pane, "html-to-text", h);
+       if (ctype && strcasecmp(ctype, "text/html") == 0) {
+               transformed = call_ret(pane, "html-to-text-w3m", h);
+               if (!transformed)
+                       transformed = call_ret(pane, "html-to-text", h);
+       }
        if (ctype && strcasecmp(ctype, "text/calendar") == 0)
                transformed = call_ret(pane, "ical-to-text", h);
        if (ctype && strcasecmp(ctype, "application/pdf") == 0)
@@ -1253,6 +1256,8 @@ void edlib_init(struct pane *ed safe)
                  "attach-email-view");
 
        call("global-load-module", ed, 0, NULL, "lib-html-to-text");
+       call("global-load-module", ed, 0, NULL, "lib-html-w3m")
+
        call("global-load-module", ed, 0, NULL, "lib-pdf-to-text");
        call("global-load-module", ed, 0, NULL, "lib-ical-to-text");
 }
diff --git a/python/lib-html-w3m.py b/python/lib-html-w3m.py
new file mode 100644 (file)
index 0000000..0e629bb
--- /dev/null
@@ -0,0 +1,174 @@
+# -*- coding: utf-8 -*-
+# Copyright Neil Brown ©2023 <neil@brown.name>
+# May be distributed under terms of GPLv2 - see file:COPYING
+#
+# The html-to-text-w3m function extracts text from the given child and
+# transforms it with w3m to a "half-dump" format which is simplified HTML
+# which only marks up links and bold and similar which affect appearance
+# of characters but not their position.
+# The view-default is set to w3m-halfdump which hides the markup text and
+# applies the changes to the text as render attributes.
+#
+
+import subprocess
+
+def get_attr(tagl, tag, attr):
+    # Return the value of attr="..." found in 'tag', or None when the
+    # attribute is absent or its closing quote is missing.
+    # Positions are located in 'tagl' (the lower-cased copy of 'tag');
+    # the value itself is sliced from 'tag' so its case is kept.
+    start = tagl.find(attr + '="')
+    if start <= 0:
+        return None
+    value_at = start + len(attr) + 2
+    end = tagl.find('"', value_at)
+    if end <= start:
+        return None
+    return tag[value_at:end]
+
+def html_to_w3m(key, home, focus, comm2, **a):
+    # Handler for the "html-to-text-w3m" command.
+    # Fetch the HTML held by 'focus', run it through "w3m -halfdump",
+    # load the result into a new "html-document" document, mark that up
+    # with parse_halfdump() and pass it to 'comm2'.
+    # Returns 1 once the document has been given to comm2, or edlib.Efail
+    # when there is no content, w3m cannot be run, or no document was
+    # created.
+    htmlb = focus.call("doc:get-str", ret='bytes')
+    if not htmlb:
+        return edlib.Efail
+    # Decoding with "ignore" and re-encoding below drops any bytes that
+    # are not valid UTF-8 before w3m sees them.
+    html = htmlb.decode("utf-8", "ignore")
+    try:
+        p = subprocess.Popen(["/usr/bin/w3m", "-halfdump", "-o", "ext_halfdump=1",
+                              "-I", "UTF-8", "-O", "UTF-8",
+                              "-o", "display_image=off",
+                              "-o", "pre_conv=1",
+                              "-cols", "72",
+                              "-T", "text/html"],
+                             close_fds = True,
+                             stdout=subprocess.PIPE,
+                             stderr=subprocess.PIPE,
+                             stdin=subprocess.PIPE)
+        out,err = p.communicate(html.encode())
+    except OSError as e:
+        # w3m is not installed or could not be executed.  Report failure
+        # instead of raising, so that a caller can try another converter.
+        edlib.LOG("w3m:", str(e))
+        return edlib.Efail
+    if err:
+        edlib.LOG("w3m:", err.decode("utf-8","ignore"))
+    # Show what w3m wrote to stdout; if it wrote nothing, show its
+    # error output instead.
+    text = out if out else err
+    doc = focus.call("doc:from-text", "html-document",
+                     text.decode("utf-8","ignore"),
+                     ret='pane')
+    if doc is None:
+        return edlib.Efail
+
+    parse_halfdump(doc)
+    comm2("cb", doc)
+    return 1
+
+# Entity names (the text between '&' and ';') that have a fixed replacement.
+_ENTITY_CHARS = {
+    "amp": "&",
+    "lt": "<",
+    "gt": ">",
+    "zwnj": "",
+    "emsp": " ",
+    "rsquo": chr(8217),
+}
+
+def _decode_entity(name):
+    # Map an entity name (without the '&' and ';') to its replacement text.
+    # Numeric references ("#x..." hex, "#..." decimal) become the character
+    # they name.  Anything unknown or malformed is returned as "!" + name
+    # so that it can be recognised as not decoded.
+    if name in _ENTITY_CHARS:
+        return _ENTITY_CHARS[name]
+    try:
+        if name[:2] in ("#x", "#X"):
+            return chr(int(name[2:], 16))
+        if name[:1] == "#":
+            return chr(int(name[1:], 10))
+    except (ValueError, OverflowError):
+        # e.g. "&#;", "&#xzz;" or a number that is not a valid code point
+        pass
+    return "!" + name
+
+def _mark_before(doc, m, length):
+    # Return a copy of mark 'm' moved back by 'length' characters.
+    st = m.dup()
+    for _ in range(length):
+        doc.prev(st)
+    return st
+
+def _markup_tags(doc):
+    # Hide every <...> tag in 'doc' and set render attributes for the
+    # tags which affect appearance: b, internal, img_alt and a.
+    m = edlib.Mark(doc)
+    bold = False; internal = False; imgalt = False
+    # url and urltag describe the <a> that is currently open;
+    # both are None outside of an anchor.
+    url = None; urltag = None
+    while True:
+        try:
+            if bold or internal or url or imgalt:
+                # Something is active, so also stop on the first char
+                # of each line where it gets set again.
+                length = doc.call("text-search", "(^.|<[^>]*>)", m)
+            else:
+                length = doc.call("text-search", "<[^>]*>", m)
+            # the search result is one more than the length of the match
+            length -= 1
+        except Exception:
+            # A failed search ends the scan.
+            break
+        if length == 1:
+            # Found start of line - re-assert things
+            if bold:
+                doc.call("doc:set-attr", 1, m, "render:bold", "1")
+            if internal:
+                doc.call("doc:set-attr", 1, m, "render:internal", "1")
+            if imgalt:
+                doc.call("doc:set-attr", 1, m, "render:imgalt", "1")
+            if url:
+                doc.call("doc:set-attr", 1, m, "render:url", urltag)
+            continue
+
+        # 'm' is just after the tag; 'st' is placed at its start.
+        st = _mark_before(doc, m, length)
+        doc.call('doc:set-attr', 1, st, "render:hide", "%d" % length)
+
+        tag = doc.call("doc:get-str", st, m, ret='str')
+        tagl = tag.lower()
+        if tagl == "<b>":
+            doc.call("doc:set-attr", 1, m, "render:bold", "1")
+            bold = True
+        elif tagl == "</b>" and bold:
+            doc.call("doc:set-attr", 1, m, "render:bold", "0")
+            bold = False
+        elif tagl == "<internal>":
+            doc.call("doc:set-attr", 1, m, "render:internal", "1")
+            internal = True
+        elif tagl == "</internal>":
+            doc.call("doc:set-attr", 1, m, "render:internal", "0")
+            internal = False
+        elif tagl[:9] == "<img_alt ":
+            doc.call("doc:set-attr", 1, m, "render:imgalt", "1")
+            imgalt = True
+        elif tagl == "</img_alt>":
+            doc.call("doc:set-attr", 1, m, "render:imgalt", "0")
+            imgalt = False
+        elif tagl[:3] == "<a ":
+            url = get_attr(tagl, tag, "href")
+            urltag = get_attr(tagl, tag, "hseq")
+            if not urltag:
+                # No hseq given: allocate a number from a counter kept
+                # on the document, prefixed with "i".
+                urltag = doc['next-url-tag']
+                if not urltag:
+                    urltag = "1"
+                doc['next-url-tag'] = "%d" % (int(urltag)+1)
+                urltag = "i" + urltag
+            urltag = "w3m-" + urltag
+            if url:
+                doc.call("doc:set-attr", 1, m, "render:url", urltag)
+                doc["url:" + urltag] = url
+        elif tagl == "</a>":
+            # Ignore a </a> which has no matching <a ...>
+            if urltag:
+                doc.call("doc:set-attr", 1, m, "render:url-end", urltag)
+            url = None; urltag = None
+
+def _markup_entities(doc):
+    # Attach "render:char" = "<length>:<replacement>" to the start of
+    # every &name; entity in 'doc'.
+    m = edlib.Mark(doc)
+    while True:
+        try:
+            length = doc.call("text-search", "&[#A-Za-z0-9]*;", m)
+            length -= 1
+        except Exception:
+            # A failed search ends the scan.
+            break
+        st = _mark_before(doc, m, length)
+        name = doc.call("doc:get-str", st, m, ret='str')
+        char = _decode_entity(name[1:-1])
+        doc.call('doc:set-attr', 1, st, "render:char", "%d:%s" % (length,char))
+
+def parse_halfdump(doc):
+    # Add render attributes to the "w3m -halfdump" text in 'doc'.
+    #
+    # Every <...> tag is hidden.  These are also recognised and marked up:
+    # <[Bb]> .. </b>  bold
+    # <a href=....>...</a> anchor
+    # <internal>...</internal> hide
+    # <img_alt ...>...</img_alt> image alt text
+    # <anything-else> - ignore
+    #
+    # &foo; - replace with one char:
+    #   amp - &
+    #   rsquo - right single quote
+    #   emsp - space
+    #   zwnj - nothing
+    #   lt  - <
+    #   gt  - >
+    #   #x.... hex code point
+    #   #..... decimal code point
+    # any other name is shown as "!name".
+    _markup_tags(doc)
+    _markup_entities(doc)
+
+# Register html_to_w3m as the "html-to-text-w3m" command, but only when
+# an 'editor' object is present in this module's globals.
+if "editor" in globals():
+    editor.call("global-set-command", "html-to-text-w3m", html_to_w3m)
index 65d1a2997ca41bf764ff339b9bc9779e768f0abc..4fc634d2e5e243da76b7b90b11b5169b97416e47 100644 (file)
@@ -3471,10 +3471,40 @@ class notmuch_message_view(edlib.Pane):
         if str == "render:rfc822header-to":
             comm2("attr:callback", focus, int(str2), mark, "fg:blue,bold", 120)
             return 1
-        if str == "render:url":
-            c=str2.index(':')
-            comm2("attr:callback", focus, int(str2[:c]), mark,
-                  "fg:cyan-60,underline,active-tag:url,url-tag="+str2[c+1:], 120)
+        if str == "render:hide":
+            comm2("attr:callback", focus, int(str2), mark, "hide", 100000)
+        if str == "render:bold":
+            comm2("attr:callback", focus, 100000 if str2 == "1" else -1,
+                  mark, "bold", 120)
+        if str == "render:internal":
+            comm2("attr:callback", focus, 100000 if str2 == "1" else -1,
+                  mark, "hide", 120)
+        if str == "render:imgalt":
+            comm2("attr:callback", focus, 100000 if str2 == "1" else -1,
+                  mark, "fg:green-60", 120)
+        if str[:10] == "render:url":
+            w = str2.split(':')
+            if len(w) == 2:
+                tg = w[1]
+                leng = int(w[0])
+            else:
+                tg = str2
+                leng = 100000
+            if str == "render:url-end":
+                leng = -1
+            comm2("attr:callback", focus, leng, mark,
+                  "fg:cyan-60,underline,active-tag:url,url-tag="+tg, 120)
+        if str == "render:char":
+            w = str2.split(':')
+            attr = None
+            if w[1][0] == '!' and w[1] != '!':
+                # not recognised, so highlight the name
+                attr = "fg:magenta-60,bold"
+            comm2("attr:callback", focus, int(w[0]), mark,
+                  attr, 120, w[1])
+            # Don't show the HTML entity description, just the rendering.
+            comm2("attr:callback", focus, int(w[0]), mark,
+                  "hide", 60000)
         if str == 'start-of-line':
             m = self.vmark_at_or_before(self.qview, mark)
             bg = None