From d4db69b69ba3bc01b0be8076164cc461425f25ea Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Sat, 9 Sep 2023 10:18:39 +0000 Subject: [PATCH] Replace chompjs with pure Python code The new implementation is good enough for the current usage (brave) --- requirements.txt | 1 - searx/engines/brave.py | 4 +-- searx/utils.py | 73 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6088f6338..a88f7adc0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,4 +17,3 @@ markdown-it-py==3.0.0 typing_extensions==4.7.1 fasttext-predict==0.9.2.1 pytomlpp==1.0.13 -chompjs==1.2.2 \ No newline at end of file diff --git a/searx/engines/brave.py b/searx/engines/brave.py index d5e2d3e9f..e9d4af762 100644 --- a/searx/engines/brave.py +++ b/searx/engines/brave.py @@ -104,7 +104,6 @@ from urllib.parse import ( parse_qs, ) -import chompjs from lxml import html from searx import locales @@ -112,6 +111,7 @@ from searx.utils import ( extract_text, eval_xpath_list, eval_xpath_getindex, + js_variable_to_python, ) from searx.enginelib.traits import EngineTraits @@ -215,7 +215,7 @@ def response(resp): datastr = line.replace("const data = ", "").strip()[:-1] break - json_data = chompjs.parse_js_object(datastr) + json_data = js_variable_to_python(datastr) json_resp = json_data[1]['data']['body']['response'] if brave_category == 'news': diff --git a/searx/utils.py b/searx/utils.py index 7ddd2305a..9457ca87d 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -7,6 +7,7 @@ import re import importlib import importlib.util +import json import types from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable @@ -37,6 +38,9 @@ _BLOCKED_TAGS = ('script', 'style') _ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE) _ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE) +_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)') +_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)') + _STORAGE_UNIT_VALUE: Dict[str, int] = { 'TB': 1024 * 1024 * 1024 * 1024, 'GB': 1024 * 1024 * 1024, @@ -645,3 +649,72 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo return None return language return None + + +def js_variable_to_python(js_variable): + """Convert a javascript variable into JSON and then load the value + + It does not deal with all cases, but it is good enough for now. + chompjs has a better implementation. + """ + # when in_string is not None, it contains the character that has opened the string + # either simple quote or double quote + in_string = None + # cut the string: + # r"""{ a:"f\"irst", c:'sec"ond'}""" + # becomes + # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}'] + parts = re.split(r'(["\'])', js_variable) + # previous part (to check the escape character antislash) + previous_p = "" + for i, p in enumerate(parts): + # parse characters inside a ECMA string + if in_string: + # we are in a JS string: replace the colon by a temporary character + # so quote_keys_regex doesn't have to deal with colon inside the JS strings + parts[i] = parts[i].replace(':', chr(1)) + if in_string == "'": + # the JS string is delimited by simple quote. + # This is not supported by JSON. + # simple quote delimited string are converted to double quote delimited string + # here, inside a JS string, we escape the double quote + parts[i] = parts[i].replace('"', r'\"') + + # deal with delimieters and escape character + if not in_string and p in ('"', "'"): + # we are not in string + # but p is double or simple quote + # that's the start of a new string + # replace simple quote by double quote + # (JSON doesn't support simple quote) + parts[i] = '"' + in_string = p + continue + if p == in_string: + # we are in a string and the current part MAY close the string + if len(previous_p) > 0 and previous_p[-1] == '\\': + # there is an antislash just before: the ECMA string continue + continue + # the current p close the string + # replace simple quote by double quote + parts[i] = '"' + in_string = None + # + if not in_string: + # replace void 0 by null + # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void + # we are sure there is no string in p + parts[i] = _JS_VOID_RE.sub("null", p) + # update previous_p + previous_p = p + # join the string + s = ''.join(parts) + # add quote arround the key + # { a: 12 } + # becomes + # { "a": 12 } + s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s) + # replace the surogate character by colon + s = s.replace(chr(1), ':') + # load the JSON and return the result + return json.loads(s)