asprintf(&ctype, "%1.*s/%1.*s", majlen, major, minlen, minor);
else
asprintf(&ctype, "%1.*s", majlen, major);
- if (ctype && strcasecmp(ctype, "text/html") == 0)
- transformed = call_ret(pane, "html-to-text", h);
+ if (ctype && strcasecmp(ctype, "text/html") == 0) {
+ transformed = call_ret(pane, "html-to-text-w3m", h);
+ if (!transformed)
+ transformed = call_ret(pane, "html-to-text", h);
+ }
if (ctype && strcasecmp(ctype, "text/calendar") == 0)
transformed = call_ret(pane, "ical-to-text", h);
if (ctype && strcasecmp(ctype, "application/pdf") == 0)
"attach-email-view");
call("global-load-module", ed, 0, NULL, "lib-html-to-text");
+ call("global-load-module", ed, 0, NULL, "lib-html-w3m");
+
call("global-load-module", ed, 0, NULL, "lib-pdf-to-text");
call("global-load-module", ed, 0, NULL, "lib-ical-to-text");
}
--- /dev/null
+# -*- coding: utf-8 -*-
+# Copyright Neil Brown ©2023 <neil@brown.name>
+# May be distributed under terms of GPLv2 - see file:COPYING
+#
+# The html-to-text-w3m function extracts text from the given child and
+# transforms it with w3m to a "half-dump" format which is simplified HTML
+# which only marks up links and bold and similar which affect appearance
+# of characters but not their position.
+# The view-default is set to w3m-halfdump which hides the markup text and
+# applies the changes to the text as render attributes.
+#
+
+import subprocess
+
+def get_attr(tagl, tag, attr):
+ # Find attr="stuff" in tag, but search for tag in tagl
+ # which is a lower-cased version.
+ k = tagl.find(attr+'="')
+ e = -1
+ if k > 0:
+ e = tagl.find('"', k+len(attr)+2)
+ if e > k:
+ return tag[k+len(attr)+2:e]
+ return None
+
+def html_to_w3m(key, home, focus, comm2, **a):
+ htmlb = focus.call("doc:get-str", ret='bytes')
+ if not htmlb:
+ return edlib.Efail
+ html = htmlb.decode("utf-8", "ignore")
+ p = subprocess.Popen(["/usr/bin/w3m", "-halfdump", "-o", "ext_halfdump=1",
+ "-I", "UTF-8", "-O", "UTF-8",
+ "-o", "display_image=off",
+ "-o", "pre_conv=1",
+ "-cols", "72",
+ "-T", "text/html"],
+ close_fds = True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ stdin=subprocess.PIPE)
+ out,err = p.communicate(html.encode())
+ if err:
+ edlib.LOG("w3m:", err.decode("utf-8","ignore"))
+ if out:
+ doc = focus.call("doc:from-text", "html-document",
+ out.decode("utf-8","ignore"),
+ ret='pane')
+ else:
+ doc = focus.call("doc:from-text", "html-document",
+ err.decode("utf-8","ignore"),
+ ret='pane')
+
+ parse_halfdump(doc)
+ comm2("cb", doc)
+ return 1
+
+def parse_halfdump(doc):
+ # recognise and markup
+ # <[Bb]> .. </b> bold
+ # <a href=....>...</a> anchor
+ # <internal>...</internal> hide
+ # <anything-else> - ignore
+ #
+ # &foo; - replace with one char:
+ # amp - &
+ # rsquo - '
+ # emsp
+ # lt - <
+ # gt - >
+ # #x.... utf-8 hex
+
+ m = edlib.Mark(doc)
+ bold = False; internal = False; imgalt = False; url = None
+ while True:
+ try:
+ if bold or internal or url or imgalt:
+ len = doc.call("text-search", "(^.|<[^>]*>)", m)
+ else:
+ len = doc.call("text-search", "<[^>]*>", m)
+ len -= 1
+ except:
+ break
+ if len == 1:
+ # Found start of line - re-assert things
+ if bold:
+ doc.call("doc:set-attr", 1, m, "render:bold", "1")
+ if internal:
+ doc.call("doc:set-attr", 1, m, "render:internal", "1")
+ if imgalt:
+ doc.call("doc:set-attr", 1, m, "render:imgalt", "1")
+ if urltag:
+ doc.call("doc:set-attr", 1, m, "render:url", urltag)
+ continue
+
+ st = m.dup()
+ i = 0
+ while i < len:
+ doc.prev(st)
+ i += 1
+ doc.call('doc:set-attr', 1, st, "render:hide", "%d" % len)
+
+ tag = doc.call("doc:get-str", st, m, ret='str')
+ tagl = tag.lower()
+ if tagl == "<b>":
+ doc.call("doc:set-attr", 1, m, "render:bold", "1")
+ bold=True
+ elif tagl == "</b>" and bold:
+ doc.call("doc:set-attr", 1, m, "render:bold", "0")
+ bold = False
+ elif tagl == "<internal>":
+ doc.call("doc:set-attr", 1, m, "render:internal", "1")
+ internal = True
+ elif tagl == "</internal>":
+ doc.call("doc:set-attr", 1, m, "render:internal", "0")
+ internal = False
+ elif tagl[:9] == "<img_alt ":
+ doc.call("doc:set-attr", 1, m, "render:imgalt", "1")
+ imgalt = True
+ elif tagl == "</img_alt>":
+ doc.call("doc:set-attr", 1, m, "render:imgalt", "0")
+ imgalt = False
+ elif tagl[:3] == "<a ":
+ url = get_attr(tagl, tag, "href")
+ urltag = get_attr(tagl, tag, "hseq")
+ if not urltag:
+ urltag = doc['next-url-tag']
+ if not urltag:
+ urltag = "1"
+ doc['next-url-tag'] = "%d" % (int(urltag)+1)
+ urltag = "i" + urltag
+ urltag = "w3m-" + urltag
+ if url:
+ doc.call("doc:set-attr", 1, m, "render:url", urltag)
+ doc["url:" + urltag] = url
+ elif tagl == "</a>":
+ doc.call("doc:set-attr", 1, m, "render:url-end", urltag)
+ url = None; urltag = None
+
+ m = edlib.Mark(doc)
+ while True:
+ try:
+ len = doc.call("text-search", "&[#A-Za-z0-9]*;", m)
+ len -= 1
+ except:
+ break
+ st = m.dup()
+ i = 0
+ while i < len:
+ doc.prev(st)
+ i += 1
+ name = doc.call("doc:get-str", st, m, ret='str')
+ char = name[1:-1]
+ if char == "amp":
+ char = "&"
+ elif char == "lt":
+ char = "<<"
+ elif char == "gt":
+ char = ">"
+ elif char[:2] == "#x":
+ char = chr(int(char[2:], 16))
+ elif char[:2] == "#":
+ char = chr(int(char[1:], 10))
+ elif char == "zwnj":
+ char = ""
+ elif char == "emsp":
+ char = " "
+ elif char == "rsquo":
+ char = chr(8217)
+ else:
+ char = "!" + char
+ doc.call('doc:set-attr', 1, st, "render:char", "%d:%s" % (len,char))
+
+# Register html_to_w3m as the "html-to-text-w3m" command, but only when
+# an "editor" global is present.
+if "editor" in globals():
+ editor.call("global-set-command", "html-to-text-w3m", html_to_w3m)
if str == "render:rfc822header-to":
comm2("attr:callback", focus, int(str2), mark, "fg:blue,bold", 120)
return 1
- if str == "render:url":
- c=str2.index(':')
- comm2("attr:callback", focus, int(str2[:c]), mark,
- "fg:cyan-60,underline,active-tag:url,url-tag="+str2[c+1:], 120)
+ if str == "render:hide":
+ comm2("attr:callback", focus, int(str2), mark, "hide", 100000)
+ if str == "render:bold":
+ comm2("attr:callback", focus, 100000 if str2 == "1" else -1,
+ mark, "bold", 120)
+ if str == "render:internal":
+ comm2("attr:callback", focus, 100000 if str2 == "1" else -1,
+ mark, "hide", 120)
+ if str == "render:imgalt":
+ comm2("attr:callback", focus, 100000 if str2 == "1" else -1,
+ mark, "fg:green-60", 120)
+ if str[:10] == "render:url":
+ w = str2.split(':')
+ if len(w) == 2:
+ tg = w[1]
+ leng = int(w[0])
+ else:
+ tg = str2
+ leng = 100000
+ if str == "render:url-end":
+ leng = -1
+ comm2("attr:callback", focus, leng, mark,
+ "fg:cyan-60,underline,active-tag:url,url-tag="+tg, 120)
+ if str == "render:char":
+ w = str2.split(':')
+ attr = None
+ if w[1][0] == '!' and w[1] != '!':
+ # not recognised, so highlight the name
+ attr = "fg:magenta-60,bold"
+ comm2("attr:callback", focus, int(w[0]), mark,
+ attr, 120, w[1])
+ # Don't show the HTML entity description, just the rendering.
+ comm2("attr:callback", focus, int(w[0]), mark,
+ "hide", 60000)
if str == 'start-of-line':
m = self.vmark_at_or_before(self.qview, mark)
bg = None