forked from zaclys/searxng

[mod] add documentation about searx.utils

This module is a toolbox for the engines. It should be documented. In addition, searx/utils.py is checked by pylint.

parent f240a67bd7
commit 0eacc46ee3

3 changed files with 104 additions and 100 deletions
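Since the commit message calls searx.utils a toolbox for the engines, here is a minimal, illustrative sketch of how an engine-style parser leans on the helpers touched below. The XPath expressions, the result markup and the example.org base URL are placeholders, not code from the repository:

# Illustrative only: a made-up response parser in the style of a searx engine,
# using helpers from searx.utils. XPaths and URLs are placeholders.
from lxml import html

from searx.utils import eval_xpath_list, eval_xpath_getindex, extract_text, extract_url


def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    for result in eval_xpath_list(dom, '//div[@class="result"]'):
        link = eval_xpath_getindex(result, './/a', 0, default=None)
        if link is None:
            continue
        results.append({
            'url': extract_url(link.xpath('./@href'), 'https://example.org'),
            'title': extract_text(link),
            'content': extract_text(result.xpath('.//p[@class="content"]')) or '',
        })
    return results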
docs/src/searx.utils.rst | 8 (new file)
@@ -0,0 +1,8 @@
+.. _searx.utils:
+
+=================================
+Utility functions for the engines
+=================================
+
+.. automodule:: searx.utils
+  :members:
searx/utils.py | 192
@@ -1,7 +1,13 @@
 # -*- coding: utf-8 -*-
 # SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pyright: basic
+"""Utility functions for the engines
+
+"""
 import re
 import importlib
+from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable
 from numbers import Number
 from os.path import splitext, join
 from random import choice
@@ -23,42 +29,56 @@ from searx import logger
 
 logger = logger.getChild('utils')
 
-blocked_tags = ('script', 'style')
-
-ecma_unescape4_re = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
-ecma_unescape2_re = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
-
-xpath_cache = dict()
-lang_to_lc_cache = dict()
+XPathSpecType = Union[str, XPath]
+
+_BLOCKED_TAGS = ('script', 'style')
+
+_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
+_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
+
+_STORAGE_UNIT_VALUE: Dict[str, int] = {
+    'TB': 1024 * 1024 * 1024 * 1024,
+    'GB': 1024 * 1024 * 1024,
+    'MB': 1024 * 1024,
+    'TiB': 1000 * 1000 * 1000 * 1000,
+    'MiB': 1000 * 1000,
+    'KiB': 1000,
+}
+
+_XPATH_CACHE = {}
+_LANG_TO_LC_CACHE = {}
 
 
-class NotSetClass:
-    pass
+class _NotSetClass:  # pylint: disable=too-few-public-methods
+    """Internal class for this module, do not create instance of this class.
+    Replace the None value, allow explicitly pass None as a function argument"""
 
 
-NOTSET = NotSetClass()
+_NOTSET = _NotSetClass()
 
 
-def searx_useragent():
+def searx_useragent() -> str:
     """Return the searx User Agent"""
     return 'searx/{searx_version} {suffix}'.format(
         searx_version=VERSION_TAG, suffix=settings['outgoing']['useragent_suffix']
     ).strip()
 
 
-def gen_useragent(os=None):
+def gen_useragent(os_string: str = None) -> str:
     """Return a random browser User Agent
 
     See searx/data/useragents.json
     """
-    return str(USER_AGENTS['ua'].format(os=os or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions'])))
+    return USER_AGENTS['ua'].format(os=os_string or choice(USER_AGENTS['os']), version=choice(USER_AGENTS['versions']))
 
 
-class HTMLTextExtractorException(Exception):
-    pass
+class _HTMLTextExtractorException(Exception):
+    """Internal exception raised when the HTML is invalid"""
 
 
-class HTMLTextExtractor(HTMLParser):  # pylint: disable=W0223  # (see https://bugs.python.org/issue31844)
+class _HTMLTextExtractor(HTMLParser):  # pylint: disable=W0223  # (see https://bugs.python.org/issue31844)
+    """Internal class to extract text from HTML"""
 
     def __init__(self):
         HTMLParser.__init__(self)
         self.result = []
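The renamed _NOTSET sentinel exists so a caller can pass None explicitly and still be told apart from "no argument given". A hypothetical, self-contained sketch of the pattern (lookup() is an invented function, not part of searx.utils):

# Hypothetical example of the sentinel pattern used by eval_xpath_getindex():
# passing default=None is a real value, while omitting default keeps _NOTSET
# and switches the function to "raise on missing" behaviour.
class _NotSetClass:  # same idea as searx.utils._NotSetClass
    """Sentinel type: never instantiated outside this module."""


_NOTSET = _NotSetClass()


def lookup(values, index, default=_NOTSET):
    if -len(values) <= index < len(values):
        return values[index]
    if default is _NOTSET:          # no default given: fail loudly
        raise IndexError(index)
    return default                  # default given, even if it is None


print(lookup(['a', 'b'], 5, default=None))  # -> None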
@@ -72,12 +92,12 @@ class HTMLTextExtractor(HTMLParser):  # pylint: disable=W0223  # (see https://bugs.python.org/issue31844)
             return
 
         if tag != self.tags[-1]:
-            raise HTMLTextExtractorException()
+            raise _HTMLTextExtractorException()
 
         self.tags.pop()
 
     def is_valid_tag(self):
-        return not self.tags or self.tags[-1] not in blocked_tags
+        return not self.tags or self.tags[-1] not in _BLOCKED_TAGS
 
     def handle_data(self, data):
         if not self.is_valid_tag():
@@ -104,7 +124,7 @@ class HTMLTextExtractor(HTMLParser):  # pylint: disable=W0223  # (see https://bugs.python.org/issue31844)
         return ''.join(self.result).strip()
 
 
-def html_to_text(html_str):
+def html_to_text(html_str: str) -> str:
     """Extract text from a HTML string
 
     Args:
@@ -122,15 +142,15 @@ def html_to_text(html_str):
     """
     html_str = html_str.replace('\n', ' ')
     html_str = ' '.join(html_str.split())
-    s = HTMLTextExtractor()
+    s = _HTMLTextExtractor()
     try:
         s.feed(html_str)
-    except HTMLTextExtractorException:
+    except _HTMLTextExtractorException:
         logger.debug("HTMLTextExtractor: invalid HTML\n%s", html_str)
     return s.get_text()
 
 
-def extract_text(xpath_results, allow_none=False):
+def extract_text(xpath_results, allow_none: bool = False):
     """Extract text from a lxml result
 
     * if xpath_results is list, extract the text from each result and concat the list
@@ -142,24 +162,23 @@ def extract_text(xpath_results, allow_none=False):
         # it's list of result : concat everything using recursive call
         result = ''
         for e in xpath_results:
-            result = result + extract_text(e)
+            result = result + (extract_text(e) or '')
         return result.strip()
-    elif isinstance(xpath_results, ElementBase):
+    if isinstance(xpath_results, ElementBase):
         # it's a element
-        text = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False)
+        text: str = html.tostring(xpath_results, encoding='unicode', method='text', with_tail=False)
         text = text.strip().replace('\n', ' ')
         return ' '.join(text.split())
-    elif isinstance(xpath_results, (_ElementStringResult, _ElementUnicodeResult, str, Number, bool)):
+    if isinstance(xpath_results, (_ElementStringResult, _ElementUnicodeResult, str, Number, bool)):
        return str(xpath_results)
-    elif xpath_results is None and allow_none:
+    if xpath_results is None and allow_none:
         return None
-    elif xpath_results is None and not allow_none:
+    if xpath_results is None and not allow_none:
         raise ValueError('extract_text(None, allow_none=False)')
-    else:
-        raise ValueError('unsupported type')
+    raise ValueError('unsupported type')
 
 
-def normalize_url(url, base_url):
+def normalize_url(url: str, base_url: str) -> str:
     """Normalize URL: add protocol, join URL with base_url, add trailing slash if there is no path
 
     Args:
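A short illustration of the two calling modes of extract_text() after this change; the HTML fragment is made up:

# Illustration only: the HTML fragment below is made up.
from lxml import html

from searx.utils import extract_text

dom = html.fromstring('<div><p>Lorem <b>ipsum</b></p></div>')

# a list of lxml elements: each element is converted to text and concatenated
print(extract_text(dom.xpath('//p')))       # -> 'Lorem ipsum'

# None is only accepted when explicitly allowed
print(extract_text(None, allow_none=True))  # -> None
# extract_text(None) raises ValueError('extract_text(None, allow_none=False)')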
@@ -209,7 +228,7 @@ def normalize_url(url, base_url):
     return url
 
 
-def extract_url(xpath_results, base_url):
+def extract_url(xpath_results, base_url) -> str:
     """Extract and normalize URL from lxml Element
 
     Args:
@@ -248,7 +267,7 @@ def extract_url(xpath_results, base_url):
     return normalize_url(url, base_url)
 
 
-def dict_subset(d, properties):
+def dict_subset(dictionnary: MutableMapping, properties: Set[str]) -> Dict:
     """Extract a subset of a dict
 
     Examples:
@@ -257,10 +276,10 @@ def dict_subset(d, properties):
         >>> >> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
         {'A': 'a'}
     """
-    return {k: d[k] for k in properties if k in d}
+    return {k: dictionnary[k] for k in properties if k in dictionnary}
 
 
-def get_torrent_size(filesize, filesize_multiplier):
+def get_torrent_size(filesize: str, filesize_multiplier: str) -> Optional[int]:
     """
 
     Args:
@@ -277,39 +296,20 @@ def get_torrent_size(filesize, filesize_multiplier):
         3140000
     """
     try:
-        filesize = float(filesize)
-
-        if filesize_multiplier == 'TB':
-            filesize = int(filesize * 1024 * 1024 * 1024 * 1024)
-        elif filesize_multiplier == 'GB':
-            filesize = int(filesize * 1024 * 1024 * 1024)
-        elif filesize_multiplier == 'MB':
-            filesize = int(filesize * 1024 * 1024)
-        elif filesize_multiplier == 'KB':
-            filesize = int(filesize * 1024)
-        elif filesize_multiplier == 'TiB':
-            filesize = int(filesize * 1000 * 1000 * 1000 * 1000)
-        elif filesize_multiplier == 'GiB':
-            filesize = int(filesize * 1000 * 1000 * 1000)
-        elif filesize_multiplier == 'MiB':
-            filesize = int(filesize * 1000 * 1000)
-        elif filesize_multiplier == 'KiB':
-            filesize = int(filesize * 1000)
+        multiplier = _STORAGE_UNIT_VALUE.get(filesize_multiplier, 1)
+        return int(float(filesize) * multiplier)
     except ValueError:
-        filesize = None
-
-    return filesize
+        return None
 
 
-def convert_str_to_int(number_str):
+def convert_str_to_int(number_str: str) -> int:
     """Convert number_str to int or 0 if number_str is not a number."""
     if number_str.isdigit():
         return int(number_str)
-    else:
-        return 0
+    return 0
 
 
-def int_or_zero(num):
+def int_or_zero(num: Union[List[str], str]) -> int:
     """Convert num to int or 0. num can be either a str or a list.
     If num is a list, the first element is converted to int (or return 0 if the list is empty).
     If num is a str, see convert_str_to_int
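The if/elif ladder is replaced by a lookup in the new _STORAGE_UNIT_VALUE table: unknown units fall back to a multiplier of 1 and non-numeric input yields None. A usage sketch, assuming the unit values shown in the table above:

# Usage sketch for the table-driven get_torrent_size().
from searx.utils import get_torrent_size

print(get_torrent_size('5', 'GB'))       # -> 5368709120  (5 * 1024**3)
print(get_torrent_size('3.14', 'MiB'))   # -> 3140000     (3.14 * 1000**2)
print(get_torrent_size('five', 'GB'))    # -> None        (ValueError is swallowed)
print(get_torrent_size('5', 'parsecs'))  # -> 5           (unknown unit -> multiplier 1)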
@@ -321,12 +321,12 @@ def int_or_zero(num):
     return convert_str_to_int(num)
 
 
-def is_valid_lang(lang):
+def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]:
     """Return language code and name if lang describe a language.
 
     Examples:
         >>> is_valid_lang('zz')
-        False
+        None
         >>> is_valid_lang('uk')
         (True, 'uk', 'ukrainian')
         >>> is_valid_lang(b'uk')
@@ -346,28 +346,27 @@ def is_valid_lang(lang):
         for l in language_codes:
             if l[0][:2] == lang:
                 return (True, l[0][:2], l[3].lower())
-        return False
-    else:
-        for l in language_codes:
-            if l[1].lower() == lang or l[3].lower() == lang:
-                return (True, l[0][:2], l[3].lower())
-        return False
+        return None
+    for l in language_codes:
+        if l[1].lower() == lang or l[3].lower() == lang:
+            return (True, l[0][:2], l[3].lower())
+    return None
 
 
-def _get_lang_to_lc_dict(lang_list):
+def _get_lang_to_lc_dict(lang_list: List[str]) -> Dict[str, str]:
     key = str(lang_list)
-    value = lang_to_lc_cache.get(key, None)
+    value = _LANG_TO_LC_CACHE.get(key, None)
     if value is None:
-        value = dict()
-        for lc in lang_list:
-            value.setdefault(lc.split('-')[0], lc)
-        lang_to_lc_cache[key] = value
+        value = {}
+        for lang in lang_list:
+            value.setdefault(lang.split('-')[0], lang)
+        _LANG_TO_LC_CACHE[key] = value
     return value
 
 
 # babel's get_global contains all sorts of miscellaneous locale and territory related data
 # see get_global in: https://github.com/python-babel/babel/blob/master/babel/core.py
-def _get_from_babel(lang_code, key):
+def _get_from_babel(lang_code, key: str):
     match = get_global(key).get(lang_code.replace('-', '_'))
     # for some keys, such as territory_aliases, match may be a list
     if isinstance(match, str):
@@ -375,7 +374,7 @@ def _get_from_babel(lang_code, key):
     return match
 
 
-def _match_language(lang_code, lang_list=[], custom_aliases={}):  # pylint: disable=W0102
+def _match_language(lang_code, lang_list=[], custom_aliases={}) -> Optional[str]:  # pylint: disable=W0102
     """auxiliary function to match lang_code in lang_list"""
     # replace language code with a custom alias if necessary
     if lang_code in custom_aliases:
@@ -400,7 +399,7 @@ def _match_language(lang_code, lang_list=[], custom_aliases={}):  # pylint: disable=W0102
     return _get_lang_to_lc_dict(lang_list).get(lang_code, None)
 
 
-def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):  # pylint: disable=W0102
+def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US') -> str:  # pylint: disable=W0102
     """get the language code from lang_list that best matches locale_code"""
     # try to get language from given locale_code
     language = _match_language(locale_code, lang_list, custom_aliases)
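match_language() now carries a -> str annotation because it always resolves to some concrete code, falling back to the fallback argument when nothing matches. A small usage sketch; the language list below is made up:

# Usage sketch for match_language(); lang_list is a made-up subset.
from searx.utils import match_language

lang_list = ['de-DE', 'en-GB', 'en-US', 'fr-FR']

print(match_language('en-CA', lang_list))                    # picks one of the 'en-…' entries
print(match_language('xx-XX', lang_list, fallback='en-US'))  # nothing matches -> 'en-US'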
@@ -438,7 +437,7 @@ def match_language(locale_code, lang_list=[], custom_aliases={}, fallback='en-US'):  # pylint: disable=W0102
     return language or fallback
 
 
-def load_module(filename, module_dir):
+def load_module(filename: str, module_dir: str):
     modname = splitext(filename)[0]
     filepath = join(module_dir, filename)
     # and https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly
@@ -448,19 +447,16 @@ def load_module(filename, module_dir):
     return module
 
 
-def to_string(obj):
+def to_string(obj: Any) -> str:
+    """Convert obj to its string representation."""
     if isinstance(obj, str):
         return obj
     if isinstance(obj, Number):
         return str(obj)
     if hasattr(obj, '__str__'):
         return obj.__str__()
-    if hasattr(obj, '__repr__'):
-        return obj.__repr__()
+    return repr(obj)
 
 
-def ecma_unescape(s):
+def ecma_unescape(string: str) -> str:
     """Python implementation of the unescape javascript function
 
     https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
@@ -475,23 +471,23 @@ def ecma_unescape(s):
         'ó'
     """
     # "%u5409" becomes "吉"
-    s = ecma_unescape4_re.sub(lambda e: chr(int(e.group(1), 16)), s)
+    string = _ECMA_UNESCAPE4_RE.sub(lambda e: chr(int(e.group(1), 16)), string)
     # "%20" becomes " ", "%F3" becomes "ó"
-    s = ecma_unescape2_re.sub(lambda e: chr(int(e.group(1), 16)), s)
-    return s
+    string = _ECMA_UNESCAPE2_RE.sub(lambda e: chr(int(e.group(1), 16)), string)
+    return string
 
 
-def get_string_replaces_function(replaces):
+def get_string_replaces_function(replaces: Dict[str, str]) -> Callable:
     rep = {re.escape(k): v for k, v in replaces.items()}
     pattern = re.compile("|".join(rep.keys()))
 
-    def f(text):
+    def func(text):
         return pattern.sub(lambda m: rep[re.escape(m.group(0))], text)
 
-    return f
+    return func
 
 
-def get_engine_from_settings(name):
+def get_engine_from_settings(name: str) -> Dict:
     """Return engine configuration from settings.yml of a given engine name"""
 
     if 'engines' not in settings:
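get_string_replaces_function() compiles one regular expression out of a replacement table and returns a closure that applies all replacements in a single pass. A usage sketch with a made-up table:

# Usage sketch for get_string_replaces_function(); the table is made up.
from searx.utils import get_string_replaces_function

clean = get_string_replaces_function({'&amp;': '&', '\u00a0': ' '})

print(clean('Fish &amp; Chips\u00a0Ltd'))  # -> 'Fish & Chips Ltd'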
@@ -506,7 +502,7 @@
     return {}
 
 
-def get_xpath(xpath_spec):
+def get_xpath(xpath_spec: XPathSpecType) -> XPath:
     """Return cached compiled XPath
 
     There is no thread lock.
@@ -523,13 +519,13 @@
         * SearxXPathSyntaxException: Raise when there is a syntax error in the XPath
     """
     if isinstance(xpath_spec, str):
-        result = xpath_cache.get(xpath_spec, None)
+        result = _XPATH_CACHE.get(xpath_spec, None)
         if result is None:
             try:
                 result = XPath(xpath_spec)
             except XPathSyntaxError as e:
                 raise SearxXPathSyntaxException(xpath_spec, str(e.msg)) from e
-            xpath_cache[xpath_spec] = result
+            _XPATH_CACHE[xpath_spec] = result
         return result
 
     if isinstance(xpath_spec, XPath):
@@ -538,7 +534,7 @@
     raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath')
 
 
-def eval_xpath(element, xpath_spec):
+def eval_xpath(element: ElementBase, xpath_spec: XPathSpecType):
     """Equivalent of element.xpath(xpath_str) but compile xpath_str once for all.
     See https://lxml.de/xpathxslt.html#xpath-return-values
 
@@ -562,7 +558,7 @@
         raise SearxEngineXPathException(xpath_spec, arg) from e
 
 
-def eval_xpath_list(element, xpath_spec, min_len=None):
+def eval_xpath_list(element: ElementBase, xpath_spec: XPathSpecType, min_len: int = None):
     """Same as eval_xpath, check if the result is a list
 
     Args:
@@ -586,7 +582,7 @@
     return result
 
 
-def eval_xpath_getindex(elements, xpath_spec, index, default=NOTSET):
+def eval_xpath_getindex(elements: ElementBase, xpath_spec: XPathSpecType, index: int, default=_NOTSET):
     """Call eval_xpath_list then get one element using the index parameter.
     If the index does not exist, either aise an exception is default is not set,
     other return the default value (can be None).
@@ -606,9 +602,9 @@ def eval_xpath_getindex(elements, xpath_spec, index, default=NOTSET):
         * result (bool, float, list, str): Results.
     """
     result = eval_xpath_list(elements, xpath_spec)
-    if index >= -len(result) and index < len(result):
+    if -len(result) <= index < len(result):
         return result[index]
-    if default == NOTSET:
+    if default == _NOTSET:
         # raise an SearxEngineXPathException instead of IndexError
         # to record xpath_spec
         raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
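eval_xpath_getindex() is where the _NOTSET sentinel pays off: with no default it raises SearxEngineXPathException (recording the xpath_spec), with a default it degrades gracefully. A usage sketch; the HTML fragment and XPath are made up:

# Usage sketch for eval_xpath_getindex(); HTML fragment and XPath are made up.
from lxml import html

from searx.utils import eval_xpath_getindex
from searx.exceptions import SearxEngineXPathException

dom = html.fromstring('<ul><li>one</li><li>two</li></ul>')

print(eval_xpath_getindex(dom, '//li/text()', 0))                # -> 'one'
print(eval_xpath_getindex(dom, '//li/text()', 5, default=None))  # -> None

try:
    eval_xpath_getindex(dom, '//li/text()', 5)                   # no default: raises
except SearxEngineXPathException as exc:
    print(exc)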
@@ -128,7 +128,7 @@ class TestUtils(SearxTestCase):
 
 class TestHTMLTextExtractor(SearxTestCase):
     def setUp(self):
-        self.html_text_extractor = utils.HTMLTextExtractor()
+        self.html_text_extractor = utils._HTMLTextExtractor()
 
     def test__init__(self):
         self.assertEqual(self.html_text_extractor.result, [])
@@ -149,7 +149,7 @@ class TestHTMLTextExtractor(SearxTestCase):
 
     def test_invalid_html(self):
         text = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
-        with self.assertRaises(utils.HTMLTextExtractorException):
+        with self.assertRaises(utils._HTMLTextExtractorException):
             self.html_text_extractor.feed(text)
 
 