w3m: only mark up html entities between tags.

author NeilBrown <neil@brown.name>

Thu, 11 May 2023 01:02:08 +0000 (11:02 +1000)

committer NeilBrown <neil@brown.name>

Sat, 20 May 2023 00:10:49 +0000 (10:10 +1000)
author NeilBrown <neil@brown.name>
Thu, 11 May 2023 01:02:08 +0000 (11:02 +1000)
committer NeilBrown <neil@brown.name>
Sat, 20 May 2023 00:10:49 +0000 (10:10 +1000)
diff --git a/DOC/TODO.md b/DOC/TODO.md

index abc9a2167cd071d625e9b6faaec69f99123bef18..f44305ca5d9233f47a04c0962b720651aa9a0452 100644 (file)
--- a/DOC/TODO.md
+++ b/DOC/TODO.md
@@ -28,6 +28,11 @@ Current priorities
  Bugs to be fixed
  ----------------
  
+- [X] w3m: &amp; in URLs confuse rendering
+- [ ] line break in <a hseq="2"\nhref=.....   isn't handled.
+- [ ] utf-8 output from w3m gets displayed as bytes!!
+- [ ] email: when alternative/1 is related and text/html, it isn't
+       displayed by default.
  - [ ] email: when a convertion pane is created on a 'text' component
        it should be given utf-8 and not have to use "bytes" like html does.
  - [ ] renderline *knows* about scaling and when it places the cursor
diff --git a/python/lib-html-w3m.py b/python/lib-html-w3m.py

index 8a0fe82165d7339c423ef7b7957bfafe8b2a0a8a..87e255be61a9479dee87377b5e1ba79c397fd59b 100644 (file)
--- a/python/lib-html-w3m.py
+++ b/python/lib-html-w3m.py
@@ -322,6 +322,7 @@ def parse_halfdump(doc):
      m = edlib.Mark(doc)
      bold = False; internal = False; imgalt = False; url = None
      while True:
+        prev_end = m.dup()
          try:
              if bold or internal or url or imgalt:
                  len = doc.call("text-search", "(^.|<[^>]*>)", m)
@@ -330,6 +331,7 @@ def parse_halfdump(doc):
              len -= 1
          except:
              break
+
          if len == 1:
              # Found start of line - re-assert things
              if bold:
@@ -349,6 +351,9 @@ def parse_halfdump(doc):
              i += 1
          doc.call('doc:set-attr', 1, st, "render:hide", "%d" % len)
  
+        # We only parse entities between tags, not within them
+        parse_entities(doc, prev_end, st)
+
          tag = doc.call("doc:get-str", st, m, ret='str')
          tagl = tag.lower()
          if tagl == "<b>":
@@ -386,10 +391,11 @@ def parse_halfdump(doc):
              doc.call("doc:set-attr", 1, m, "render:url-end", urltag)
              url = None; urltag = None
  
-    m = edlib.Mark(doc)
+def parse_entities(doc, m, end):
      while True:
+        edlib.LOG("e", m, end)
          try:
-            len = doc.call("text-search", "&[#A-Za-z0-9]*;", m)
+            len = doc.call("text-search", "&[#A-Za-z0-9]*;", m, end)
              len -= 1
          except:
              break
author	NeilBrown <neil@brown.name>
	Thu, 11 May 2023 01:02:08 +0000 (11:02 +1000)
committer	NeilBrown <neil@brown.name>
	Sat, 20 May 2023 00:10:49 +0000 (10:10 +1000)
DOC/TODO.md		patch \| blob \| history
python/lib-html-w3m.py		patch \| blob \| history