From: NeilBrown <neil@brown.name>
Date: Sun, 28 May 2023 07:58:03 +0000 (+1000)
Subject: pdf,doc: mark urls.
X-Git-Url: http://git.neil.brown.name/?a=commitdiff_plain;h=d4e9ed0c6998b6071692637cfac7daae291ced6f;p=edlib.git

pdf,doc: mark urls.

Copy the url-marking code from notmuch and use it to mark urls in
converted pdf and doc.
I don't know that this is a good idea. Let's see if I find it useful.

Signed-off-by: NeilBrown <neil@brown.name>
---

diff --git a/DOC/TODO.md b/DOC/TODO.md
index 0344a614..520598c2 100644
--- a/DOC/TODO.md
+++ b/DOC/TODO.md
@@ -88,7 +88,8 @@ Bugs to be fixed
       already marked if needed.  This was fixed by making conversion
       async so:
 - [ ] async email part converts need to do their own URL marking.
-      PDF particularly, and html2md.  Maybe others.
+      PDF particularly, and html2md.  Maybe others. - LATER pdf and doc
+      do it now by copying the code.
 - [X] notmuch: "reply" should clear unread/new flags.
 - [X] transparent images appear in email with horiz lines
 - [X] Replying to w3m/html mail results in unsightly markup in reply
diff --git a/python/lib-doc-to-text.py b/python/lib-doc-to-text.py
index ef93cc1a..c58ceaf5 100644
--- a/python/lib-doc-to-text.py
+++ b/python/lib-doc-to-text.py
@@ -84,9 +84,38 @@ class doc_pane(edlib.Pane):
             self.doc.call("doc:set-ref", m2)
             self.doc.call("doc:replace", 1, out.decode("utf-8", 'ignore'),
                           m, m2)
+            self.mark_urls(self.doc)
         self.close()
         return edlib.Efalse
 
+    def mark_urls(self, doc):
+        ms = edlib.Mark(doc)
+        me = ms.dup()
+        doc.call("doc:set-ref", me)
+        while ms < me:
+            try:
+                len = doc.call("text-search",
+                                "(http|https|ftp|mail):[^][\\s\";<>]+", ms, me)
+                len -= 1
+            except:
+                return
+            # People sometimes put a period or ')' at the end of a URL.
+            while doc.prior(ms) in '.)':
+                doc.prev(ms)
+                len -= 1
+            m1 = ms.dup()
+            i = 0
+            while i < len:
+                doc.prev(m1)
+                i += 1
+            url = doc.call("doc:get-str", m1, ms, ret='str')
+            tag = doc['next-url-tag']
+            if not tag:
+                tag = "1"
+            doc.call("doc:set-attr", 1, m1, "render:url", "%d:%s"%(len,tag))
+            doc['next-url-tag'] = "%d" % (int(tag) + 1)
+            doc["url:" + tag] = url
+
 def doc_to_text(key, home, focus, num, str1, comm2, **a):
 
     if not str1 or '.' not in str1:
diff --git a/python/lib-pdf-to-text.py b/python/lib-pdf-to-text.py
index 9531d319..c3040ab5 100644
--- a/python/lib-pdf-to-text.py
+++ b/python/lib-pdf-to-text.py
@@ -86,10 +86,40 @@ class pdf_pane(edlib.Pane):
         self.pipe = None
         if err:
             edlib.LOG("pdf-to-text", err.decode('utf-8','ignore'))
+        else:
+            self.mark_urls(self.doc)
 
         self.close()
         return edlib.Efalse
 
+    def mark_urls(self, doc):
+        ms = edlib.Mark(doc)
+        me = ms.dup()
+        doc.call("doc:set-ref", me)
+        while ms < me:
+            try:
+                len = doc.call("text-search",
+                                "(http|https|ftp|mail):[^][\\s\";<>]+", ms, me)
+                len -= 1
+            except:
+                return
+            # People sometimes put a period or ')' at the end of a URL.
+            while doc.prior(ms) in '.)':
+                doc.prev(ms)
+                len -= 1
+            m1 = ms.dup()
+            i = 0
+            while i < len:
+                doc.prev(m1)
+                i += 1
+            url = doc.call("doc:get-str", m1, ms, ret='str')
+            tag = doc['next-url-tag']
+            if not tag:
+                tag = "1"
+            doc.call("doc:set-attr", 1, m1, "render:url", "%d:%s"%(len,tag))
+            doc['next-url-tag'] = "%d" % (int(tag) + 1)
+            doc["url:" + tag] = url
+
 def pdf_to_text(key, home, focus, num, comm2, **a):
     pdf = focus.call("doc:get-bytes", ret='bytes')