pdf,doc: mark urls.

author NeilBrown <neil@brown.name>

Sun, 28 May 2023 07:58:03 +0000 (17:58 +1000)

committer NeilBrown <neil@brown.name>

Thu, 8 Jun 2023 10:38:57 +0000 (20:38 +1000)
author NeilBrown <neil@brown.name>
Sun, 28 May 2023 07:58:03 +0000 (17:58 +1000)
committer NeilBrown <neil@brown.name>
Thu, 8 Jun 2023 10:38:57 +0000 (20:38 +1000)
diff --git a/DOC/TODO.md b/DOC/TODO.md

index 0344a61451278c913d336e874d2b9e1b8b2db5ee..520598c2f011e586b92ea59440264c2f97725f0e 100644 (file)
--- a/DOC/TODO.md
+++ b/DOC/TODO.md
@@ -88,7 +88,8 @@ Bugs to be fixed
        already marked if needed.  This was fixed by making conversion
        async so:
  - [ ] async email part converts need to do their own URL marking.
-      PDF particularly, and html2md.  Maybe others.
+      PDF particularly, and html2md.  Maybe others. - LATER pdf and doc
+      do it now by copying the code.
  - [X] notmuch: "reply" should clear unread/new flags.
  - [X] transparent images appear in email with horiz lines
  - [X] Replying to w3m/html mail results in unsightly markup in reply
diff --git a/python/lib-doc-to-text.py b/python/lib-doc-to-text.py

index ef93cc1a4989d2a57abb123d23612189efd18166..c58ceaf5c65e8c4bd1c465dbc44124caeeefc1fd 100644 (file)
--- a/python/lib-doc-to-text.py
+++ b/python/lib-doc-to-text.py
@@ -84,9 +84,38 @@ class doc_pane(edlib.Pane):
              self.doc.call("doc:set-ref", m2)
              self.doc.call("doc:replace", 1, out.decode("utf-8", 'ignore'),
                            m, m2)
+            self.mark_urls(self.doc)
          self.close()
          return edlib.Efalse
  
+    def mark_urls(self, doc):  # Find each URL in doc, tag its start with a "render:url" attribute, and record its text on doc.
+        ms = edlib.Mark(doc)  # search cursor; presumably created at the start of doc -- TODO confirm
+        me = ms.dup()
+        doc.call("doc:set-ref", me)  # presumably moves 'me' to the end of doc, bounding the search -- TODO confirm
+        while ms < me:
+            try:
+                len = doc.call("text-search",  # NOTE(review): 'len' shadows the builtin within this method
+                                "(http|https|ftp|mail):[^][\\s\";<>]+", ms, me)
+                len -= 1  # result appears to be match length + 1, with ms left just after the match -- TODO confirm
+            except:  # NOTE(review): bare except; any error (presumably including "no more matches") ends the scan
+                return
+            # People sometimes put a period or ')' at the end of a URL; back ms up over those and shrink len to exclude them.
+            while doc.prior(ms) in '.)':
+                doc.prev(ms)
+                len -= 1
+            m1 = ms.dup()  # m1 will become the start-of-URL mark
+            i = 0
+            while i < len:  # step m1 back 'len' positions from the end of the URL
+                doc.prev(m1)
+                i += 1
+            url = doc.call("doc:get-str", m1, ms, ret='str')  # text between m1 and ms, i.e. the trimmed URL
+            tag = doc['next-url-tag']  # per-document counter, stored as a string attribute on doc
+            if not tag:
+                tag = "1"  # no counter stored yet: numbering starts at 1
+            doc.call("doc:set-attr", 1, m1, "render:url", "%d:%s"%(len,tag))  # attribute value is "<length>:<tag>", set at m1
+            doc['next-url-tag'] = "%d" % (int(tag) + 1)  # advance the counter for the next URL
+            doc["url:" + tag] = url  # store the URL text on doc under "url:<tag>"
+
  def doc_to_text(key, home, focus, num, str1, comm2, **a):
  
      if not str1 or '.' not in str1:
diff --git a/python/lib-pdf-to-text.py b/python/lib-pdf-to-text.py

index 9531d319df7bf7f73bed46f73de7f0eb07005e92..c3040ab5030a0110f862c39f2553754a358f3042 100644 (file)
--- a/python/lib-pdf-to-text.py
+++ b/python/lib-pdf-to-text.py
@@ -86,10 +86,40 @@ class pdf_pane(edlib.Pane):
          self.pipe = None
          if err:
              edlib.LOG("pdf-to-text", err.decode('utf-8','ignore'))
+        else:
+            self.mark_urls(self.doc)
  
          self.close()
          return edlib.Efalse
  
+    def mark_urls(self, doc):  # Find each URL in doc, tag its start with a "render:url" attribute, and record its text on doc.
+        ms = edlib.Mark(doc)  # search cursor; presumably created at the start of doc -- TODO confirm
+        me = ms.dup()
+        doc.call("doc:set-ref", me)  # presumably moves 'me' to the end of doc, bounding the search -- TODO confirm
+        while ms < me:
+            try:
+                len = doc.call("text-search",  # NOTE(review): 'len' shadows the builtin within this method
+                                "(http|https|ftp|mail):[^][\\s\";<>]+", ms, me)
+                len -= 1  # result appears to be match length + 1, with ms left just after the match -- TODO confirm
+            except:  # NOTE(review): bare except; any error (presumably including "no more matches") ends the scan
+                return
+            # People sometimes put a period or ')' at the end of a URL; back ms up over those and shrink len to exclude them.
+            while doc.prior(ms) in '.)':
+                doc.prev(ms)
+                len -= 1
+            m1 = ms.dup()  # m1 will become the start-of-URL mark
+            i = 0
+            while i < len:  # step m1 back 'len' positions from the end of the URL
+                doc.prev(m1)
+                i += 1
+            url = doc.call("doc:get-str", m1, ms, ret='str')  # text between m1 and ms, i.e. the trimmed URL
+            tag = doc['next-url-tag']  # per-document counter, stored as a string attribute on doc
+            if not tag:
+                tag = "1"  # no counter stored yet: numbering starts at 1
+            doc.call("doc:set-attr", 1, m1, "render:url", "%d:%s"%(len,tag))  # attribute value is "<length>:<tag>", set at m1
+            doc['next-url-tag'] = "%d" % (int(tag) + 1)  # advance the counter for the next URL
+            doc["url:" + tag] = url  # store the URL text on doc under "url:<tag>"
+
  def pdf_to_text(key, home, focus, num, comm2, **a):
      pdf = focus.call("doc:get-bytes", ret='bytes')
author	NeilBrown <neil@brown.name>
	Sun, 28 May 2023 07:58:03 +0000 (17:58 +1000)
committer	NeilBrown <neil@brown.name>
	Thu, 8 Jun 2023 10:38:57 +0000 (20:38 +1000)
DOC/TODO.md		patch \| blob \| history
python/lib-doc-to-text.py		patch \| blob \| history
python/lib-pdf-to-text.py		patch \| blob \| history