Replace chompjs with pure Python code

The new implementation is good enough for the current usage (brave)
2023-09-09 10:18:39 +00:00 · 2023-09-09 10:18:39 +00:00 · d4db69b69b
commit d4db69b69b
parent 33065a619e
3 changed files with 75 additions and 3 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -17,4 +17,3 @@ markdown-it-py==3.0.0
 typing_extensions==4.7.1
 fasttext-predict==0.9.2.1
 pytomlpp==1.0.13
-chompjs==1.2.2
--- a/searx/engines/brave.py
+++ b/searx/engines/brave.py
@ -104,7 +104,6 @@ from urllib.parse import (
    parse_qs,
 )

-import chompjs
 from lxml import html

 from searx import locales
@ -112,6 +111,7 @@ from searx.utils import (
    extract_text,
    eval_xpath_list,
    eval_xpath_getindex,
+    js_variable_to_python,
 )
 from searx.enginelib.traits import EngineTraits

@ -215,7 +215,7 @@ def response(resp):
            datastr = line.replace("const data = ", "").strip()[:-1]
            break

-    json_data = chompjs.parse_js_object(datastr)
+    json_data = js_variable_to_python(datastr)
    json_resp = json_data[1]['data']['body']['response']

    if brave_category == 'news':
--- a/searx/utils.py
+++ b/searx/utils.py
@ -7,6 +7,7 @@
 import re
 import importlib
 import importlib.util
+import json
 import types

 from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable
@ -37,6 +38,9 @@ _BLOCKED_TAGS = ('script', 'style')
 _ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
 _ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)

+_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
+_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)')
+
 _STORAGE_UNIT_VALUE: Dict[str, int] = {
    'TB': 1024 * 1024 * 1024 * 1024,
    'GB': 1024 * 1024 * 1024,
@ -645,3 +649,72 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
            return None
        return language
    return None
+
+
+def js_variable_to_python(js_variable):
+    """Convert a javascript variable into JSON and then load the value
+
+    It does not deal with all cases, but it is good enough for now.
+    chompjs has a better implementation.
+    """
+    # when in_string is not None, it holds the quote character that opened the
+    # current string: either a single quote or a double quote
+    in_string = None
+    # cut the string:
+    # r"""{ a:"f\"irst", c:'sec"ond'}"""
+    # becomes
+    # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
+    parts = re.split(r'(["\'])', js_variable)
+    # previous part (used to check for a backslash escaping the current quote)
+    previous_p = ""
+    for i, p in enumerate(parts):
+        # process the characters inside a JavaScript string
+        if in_string:
+            # we are in a JS string: replace the colon by a temporary character
+            # so _JS_QUOTE_KEYS_RE doesn't have to deal with colons inside JS strings
+            parts[i] = parts[i].replace(':', chr(1))
+            if in_string == "'":
+                # the JS string is delimited by single quotes,
+                # which JSON does not support:
+                # single-quoted strings are converted to double-quoted strings,
+                # so here, inside the JS string, we escape the double quotes
+                parts[i] = parts[i].replace('"', r'\"')
+
+        # deal with delimiters and the escape character
+        if not in_string and p in ('"', "'"):
+            # we are not in a string
+            # but p is a double or single quote:
+            # that's the start of a new string
+            # replace a single quote by a double quote
+            # (JSON doesn't support single-quoted strings)
+            parts[i] = '"'
+            in_string = p
+            continue
+        if p == in_string:
+            # we are in a string and the current part MAY close the string
+            if len(previous_p) > 0 and previous_p[-1] == '\\':
+                # there is a backslash just before: the JS string continues
+                continue
+            # the current p closes the string
+            # replace a single quote by a double quote
+            parts[i] = '"'
+            in_string = None
+        #
+        if not in_string:
+            # replace void 0 by null
+            # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
+            # we are sure there is no string in p
+            parts[i] = _JS_VOID_RE.sub("null", p)
+        # update previous_p
+        previous_p = p
+    # join the string
+    s = ''.join(parts)
+    # add quotes around the keys
+    # { a: 12 }
+    # becomes
+    # { "a": 12 }
+    s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
+    # replace the placeholder character by a colon
+    s = s.replace(chr(1), ':')
+    # load the JSON and return the result
+    return json.loads(s)