python: handle unterminated strings from doc-text

author NeilBrown <neil@brown.name>

Sat, 27 May 2023 00:37:51 +0000 (10:37 +1000)

committer NeilBrown <neil@brown.name>

Sat, 27 May 2023 00:37:51 +0000 (10:37 +1000)
author NeilBrown <neil@brown.name>
Sat, 27 May 2023 00:37:51 +0000 (10:37 +1000)
committer NeilBrown <neil@brown.name>
Sat, 27 May 2023 00:37:51 +0000 (10:37 +1000)
diff --git a/DOC/TODO.md b/DOC/TODO.md

index 3db286bfd38d5dba5a4de5c244db3219bea6b9f4..4ead65e486064c976f24be6059ec99fb462a04ca 100644 (file)
--- a/DOC/TODO.md
+++ b/DOC/TODO.md
@@ -59,6 +59,16 @@ Bugs to be fixed
  - [ ] I cannot dup a mark in a mark:moving handler.  That is too
        restrictive.  I need a different way to decide that incoming marks
        get notified.
+- [X] doc:text passes an unterminated string to "content" for
+      doc:content.  num2 is the length.  python doesn't know this
+      and so tries to convert it all to utf8.  This is wasteful as
+      we might not want that much.  How can I tell an interpreter
+      that num2 is the length of str1?
+      - extra flag in cmd_info
+      - special key
+      - annotation on called function?
+      Probably a special key ending in " unterminated"
+
  - [X] when map-attr returns text to be inserted, check for '<' and
        double them
  - [ ] when w3m text is copied we get the markup.  I find this useful,
diff --git a/doc-text.c b/doc-text.c

index 1bf8ef4117ffe8372496e5abe3b5e6171ac1e06e..c18a427f3dc645043f8f4314dcc9ee419253b627 100644 (file)
--- a/doc-text.c
+++ b/doc-text.c
@@ -1938,7 +1938,11 @@ DEF_CMD(text_content)
                         text_normalize(t, &m->ref);
  
                         ln -= s - ss;
-                       rv = comm_call(ci->comm2, "consume", ci->focus,
+                       /* Interpreter can see " unterminated" and know
+                        * that ->num2 is the length of ->str
+                        */
+                       rv = comm_call(ci->comm2, "consume unterminated",
+                                      ci->focus,
                                        wc, m, s, ln, NULL, NULL, size, 0);
                         size = 0;
                         if (rv <= 0 || rv > ln + 1) {
diff --git a/lang-python.c b/lang-python.c

index 48991f1a2edb71dd8d3f23d056fc6240c7f4c68a..070a3a8cbc75146f62156289bb4d586370c1cae2 100644 (file)
--- a/lang-python.c
+++ b/lang-python.c
@@ -391,16 +391,21 @@ DEF_CMD(python_load_module)
         return 1;
  }
  
-static PyObject *safe python_string(const char *s safe)
+static PyObject *safe python_string(const char *s safe, int len)
  {
         const char *c = s;
-       while (*c && !(*c & 0x80))
+       const char *e = NULL;
+       wint_t wch;
+
+       if (len >= 0)
+               e = s + len;
+       while ((!e || c < e) && *c && !(*c & 0x80))
                 c++;
-       if (*c && utf8_valid(c))
-               /* must be Unicode */
-               return safe_cast PyUnicode_DecodeUTF8(s, strlen(s), NULL);
-       else
-               return safe_cast Py_BuildValue("s", s);
+       while ((wch = get_utf8(&c, e)) != WEOF)
+               if (wch == WERR || wch > 0x10FFFF)
+                       break;
+
+       return safe_cast PyUnicode_DecodeUTF8(s, c - s, NULL);
  }
  
  static char *python_as_string(PyObject *s, PyObject **tofree safe)
@@ -443,6 +448,12 @@ REDEF_CB(python_call)
         struct python_command *pc = container_of(ci->comm, struct python_command, c);
         PyObject *ret = NULL, *args, *kwds, *str;
         int rv = 1;
+       bool unterminated = False;
+       int klen = strlen(ci->key);
+
+       if (klen > 13 &&
+           strcmp(ci->key + klen - 13, " unterminated") == 0)
+               unterminated = True;
  
         args = safe_cast Py_BuildValue("(s)", ci->key);
         kwds = PyDict_New();
@@ -457,7 +468,7 @@ REDEF_CB(python_call)
                             (Py_INCREF(Py_None), Py_None));
  
         if (ci->str)
-               str = python_string(ci->str);
+               str = python_string(ci->str, unterminated ? ci->num2 : -1);
         else {
                 str = Py_None;
                 Py_INCREF(Py_None);
@@ -471,7 +482,7 @@ REDEF_CB(python_call)
         }
  
         rv = rv && dict_add(kwds, "str2",
-                           ci->str2 ? python_string(ci->str2):
+                           ci->str2 ? python_string(ci->str2, -1):
                             (Py_INCREF(Py_None), safe_cast Py_None));
         rv = rv && dict_add(kwds, "comm", Comm_Fromcomm(ci->comm));
         rv = rv && dict_add(kwds, "comm2",
@@ -1004,7 +1015,7 @@ DEF_CB(take_str)
                 return Einval;
         if (!ci->str)
                 return Efallthrough;
-       pr->ret = python_string(ci->str);
+       pr->ret = python_string(ci->str, -1);
         return 1;
  }
author	NeilBrown <neil@brown.name>
	Sat, 27 May 2023 00:37:51 +0000 (10:37 +1000)
committer	NeilBrown <neil@brown.name>
	Sat, 27 May 2023 00:37:51 +0000 (10:37 +1000)
DOC/TODO.md		patch \| blob \| history
doc-text.c		patch \| blob \| history
lang-python.c		patch \| blob \| history