From: NeilBrown Date: Sun, 28 May 2023 07:58:03 +0000 (+1000) Subject: pdf,doc: mark urls. X-Git-Url: http://git.neil.brown.name/?a=commitdiff_plain;h=d4e9ed0c6998b6071692637cfac7daae291ced6f;p=edlib.git pdf,doc: mark urls. Copy the url-marking code from notmuch and use it to mark urls in converted pdf and doc. I don't know that this is a good idea.. Let's see if I find it useful. Signed-off-by: NeilBrown --- diff --git a/DOC/TODO.md b/DOC/TODO.md index 0344a614..520598c2 100644 --- a/DOC/TODO.md +++ b/DOC/TODO.md @@ -88,7 +88,8 @@ Bugs to be fixed already marked if needed. This was fixed by making conversion async so: - [ ] async email part converts need to do their own URL marking. - PDF particularly, and html2md. Maybe others. + PDF particularly, and html2md. Maybe others. - LATER pdf and doc + do it now by copying the code. - [X] notmuch: "reply" should clear unread/new flags. - [X] transparent images appear in email with horiz lines - [X] Replying to w3m/html mail results in unsightly markup in reply diff --git a/python/lib-doc-to-text.py b/python/lib-doc-to-text.py index ef93cc1a..c58ceaf5 100644 --- a/python/lib-doc-to-text.py +++ b/python/lib-doc-to-text.py @@ -84,9 +84,38 @@ class doc_pane(edlib.Pane): self.doc.call("doc:set-ref", m2) self.doc.call("doc:replace", 1, out.decode("utf-8", 'ignore'), m, m2) + self.mark_urls(self.doc) self.close() return edlib.Efalse + def mark_urls(self, doc): + ms = edlib.Mark(doc) + me = ms.dup() + doc.call("doc:set-ref", me) + while ms < me: + try: + len = doc.call("text-search", + "(http|https|ftp|mail):[^][\\s\";<>]+", ms, me) + len -= 1 + except: + return + # People sometimes put a period or ')' at the end of a URL. 
+ while doc.prior(ms) in '.)': + doc.prev(ms) + len -= 1 + m1 = ms.dup() + i = 0 + while i < len: + doc.prev(m1) + i += 1 + url = doc.call("doc:get-str", m1, ms, ret='str') + tag = doc['next-url-tag'] + if not tag: + tag = "1" + doc.call("doc:set-attr", 1, m1, "render:url", "%d:%s"%(len,tag)) + doc['next-url-tag'] = "%d" % (int(tag) + 1) + doc["url:" + tag] = url + def doc_to_text(key, home, focus, num, str1, comm2, **a): if not str1 or '.' not in str1: diff --git a/python/lib-pdf-to-text.py b/python/lib-pdf-to-text.py index 9531d319..c3040ab5 100644 --- a/python/lib-pdf-to-text.py +++ b/python/lib-pdf-to-text.py @@ -86,10 +86,40 @@ class pdf_pane(edlib.Pane): self.pipe = None if err: edlib.LOG("pdf-to-text", err.decode('utf-8','ignore')) + else: + self.mark_urls(self.doc) self.close() return edlib.Efalse + def mark_urls(self, doc): + ms = edlib.Mark(doc) + me = ms.dup() + doc.call("doc:set-ref", me) + while ms < me: + try: + len = doc.call("text-search", + "(http|https|ftp|mail):[^][\\s\";<>]+", ms, me) + len -= 1 + except: + return + # People sometimes put a period or ')' at the end of a URL. + while doc.prior(ms) in '.)': + doc.prev(ms) + len -= 1 + m1 = ms.dup() + i = 0 + while i < len: + doc.prev(m1) + i += 1 + url = doc.call("doc:get-str", m1, ms, ret='str') + tag = doc['next-url-tag'] + if not tag: + tag = "1" + doc.call("doc:set-attr", 1, m1, "render:url", "%d:%s"%(len,tag)) + doc['next-url-tag'] = "%d" % (int(tag) + 1) + doc["url:" + tag] = url + def pdf_to_text(key, home, focus, num, comm2, **a): pdf = focus.call("doc:get-bytes", ret='bytes')