1
0
mirror of https://github.com/searxng/searxng.git synced 2024-11-22 12:10:11 +01:00

Replace chompjs with pure Python code

The new implementation is good enough for the current usage (brave)
This commit is contained in:
Alexandre Flament 2023-09-09 10:18:39 +00:00
parent 8e45ac4271
commit d07c006aed
3 changed files with 75 additions and 3 deletions

View File

@ -17,4 +17,3 @@ markdown-it-py==3.0.0
typing_extensions==4.7.1 typing_extensions==4.7.1
fasttext-predict==0.9.2.1 fasttext-predict==0.9.2.1
pytomlpp==1.0.13 pytomlpp==1.0.13
chompjs==1.2.2

View File

@ -104,7 +104,6 @@ from urllib.parse import (
parse_qs, parse_qs,
) )
import chompjs
from lxml import html from lxml import html
from searx import locales from searx import locales
@ -112,6 +111,7 @@ from searx.utils import (
extract_text, extract_text,
eval_xpath_list, eval_xpath_list,
eval_xpath_getindex, eval_xpath_getindex,
js_variable_to_python,
) )
from searx.enginelib.traits import EngineTraits from searx.enginelib.traits import EngineTraits
@ -215,7 +215,7 @@ def response(resp):
datastr = line.replace("const data = ", "").strip()[:-1] datastr = line.replace("const data = ", "").strip()[:-1]
break break
json_data = chompjs.parse_js_object(datastr) json_data = js_variable_to_python(datastr)
json_resp = json_data[1]['data']['body']['response'] json_resp = json_data[1]['data']['body']['response']
if brave_category == 'news': if brave_category == 'news':

View File

@ -7,6 +7,7 @@
import re import re
import importlib import importlib
import importlib.util import importlib.util
import json
import types import types
from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable
@ -37,6 +38,9 @@ _BLOCKED_TAGS = ('script', 'style')
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE) _ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE) _ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
# Matches an unquoted object key: a run of word characters that is preceded
# by "{", whitespace or "," and immediately followed by ":".  The three groups
# (prefix, key, colon) let js_variable_to_python() re-emit the key wrapped in
# double quotes.  Keys containing characters outside \w are not matched.
_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
# Matches the JavaScript "void <digits>" and "void(<digits>)" expressions,
# which js_variable_to_python() replaces by the JSON literal null.
_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)')
_STORAGE_UNIT_VALUE: Dict[str, int] = { _STORAGE_UNIT_VALUE: Dict[str, int] = {
'TB': 1024 * 1024 * 1024 * 1024, 'TB': 1024 * 1024 * 1024 * 1024,
'GB': 1024 * 1024 * 1024, 'GB': 1024 * 1024 * 1024,
@ -645,3 +649,72 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
return None return None
return language return language
return None return None
def js_variable_to_python(js_variable):
    """Convert a JavaScript literal into JSON text and load the value.

    The conversion is purely textual: string delimiters are normalized to
    double quotes, ``void 0`` style expressions become ``null`` and bare
    object keys are wrapped in double quotes.  It does not deal with all
    cases, but it is good enough for now.  chompjs has a better
    implementation.

    :param js_variable: source text of a JavaScript value (``str``), for
        example the right-hand side of ``const data = ...``.
    :return: the Python value produced by :py:func:`json.loads`.
    :raises json.JSONDecodeError: when the converted text is still not valid
        JSON (unsupported JavaScript syntax, empty input, ...).
    """
    # Cut the input on every quote character.  Because the pattern has one
    # capturing group, re.split() returns text and quotes alternately: even
    # indexes hold text (possibly empty), odd indexes hold one quote character.
    #   r"""{ a:"f\"irst", c:'sec"ond'}"""
    # becomes
    #   ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
    parts = re.split(r'(["\'])', js_variable)

    # When in_string is not None, it contains the character that has opened
    # the current JS string: either simple quote or double quote.
    in_string = None

    for i, p in enumerate(parts):
        if i % 2 == 0:
            # p is plain text
            if in_string:
                # inside a JS string: replace the colon by a temporary
                # character so _JS_QUOTE_KEYS_RE doesn't have to deal with
                # colons inside the JS strings
                parts[i] = p.replace(':', chr(1))
            else:
                # outside of any string: replace "void 0" by null
                # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
                parts[i] = _JS_VOID_RE.sub("null", p)
            continue

        # p is a quote character (simple or double)
        if not in_string:
            # start of a new string; JSON only supports the double quote
            parts[i] = '"'
            in_string = p
            continue

        # We are inside a string.  The quote is escaped only when it is
        # preceded by an odd number of backslashes: an even run is a sequence
        # of escaped backslashes and leaves the quote unescaped.
        previous_p = parts[i - 1]
        backslash_count = len(previous_p) - len(previous_p.rstrip('\\'))
        if backslash_count % 2 == 1:
            if p == "'":
                # \' is valid in JS but is not a JSON escape: drop the backslash
                parts[i - 1] = previous_p[:-1]
            # \" is already a valid JSON escape; the string continues
            continue

        if p == in_string:
            # the current quote closes the string
            parts[i] = '"'
            in_string = None
        elif p == '"':
            # bare double quote inside a simple-quoted JS string: it has to be
            # escaped now that the string is delimited by double quotes
            parts[i] = r'\"'
        # a bare simple quote inside a double-quoted string is valid JSON as is

    # join the string
    s = ''.join(parts)
    # add quotes around the keys
    #   { a: 12 }
    # becomes
    #   { "a": 12 }
    s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
    # replace the surrogate character by the colon
    s = s.replace(chr(1), ':')
    # load the JSON and return the result
    return json.loads(s)