Merge pull request #2269 from return42/locale-revision

Revision of the locale- and language- handling in SearXNG
This commit is contained in:
Markus Heiser 2023-03-29 09:47:21 +02:00 committed by GitHub
commit f950119ca8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
75 changed files with 7823 additions and 6414 deletions

View File

@ -17,7 +17,7 @@ jobs:
- update_currencies.py
- update_external_bangs.py
- update_firefox_version.py
- update_languages.py
- update_engine_traits.py
- update_wikidata_units.py
- update_engine_descriptions.py
steps:

View File

@ -42,7 +42,7 @@ Explanation of the :ref:`general engine configuration` shown in the table
- Timeout
- Weight
- Paging
- Language
- Language, Region
- Safe search
- Time range

View File

@ -569,10 +569,13 @@ engine is shown. Most of the options have a default value or even are optional.
To disable by default the engine, but not deleting it. It will allow the user
to manually activate it in the settings.
``inactive``: optional
Remove the engine from the settings (*disabled & removed*).
``language`` : optional
If you want to use another language for a specific engine, you can define it
by using the full ISO code of language and country, like ``fr_FR``, ``en_US``,
``de_DE``.
by using the ISO code of language (and region), like ``fr``, ``en-US``,
``de-DE``.
``tokens`` : optional
A list of secret tokens to make this engine *private*, more details see

View File

@ -127,6 +127,10 @@ extensions = [
'notfound.extension', # https://github.com/readthedocs/sphinx-notfound-page
]
autodoc_default_options = {
'member-order': 'groupwise',
}
myst_enable_extensions = [
"replacements", "smartquotes"
]
@ -135,6 +139,7 @@ suppress_warnings = ['myst.domains']
intersphinx_mapping = {
"python": ("https://docs.python.org/3/", None),
"babel" : ("https://babel.readthedocs.io/en/latest/", None),
"flask": ("https://flask.palletsprojects.com/", None),
"flask_babel": ("https://python-babel.github.io/flask-babel/", None),
# "werkzeug": ("https://werkzeug.palletsprojects.com/", None),

View File

@ -54,6 +54,7 @@ Engine File
- ``offline`` :ref:`[ref] <offline engines>`
- ``online_dictionary``
- ``online_currency``
- ``online_url_search``
======================= =========== ========================================================
.. _engine settings:
@ -131,8 +132,10 @@ Passed Arguments (request)
These arguments can be used to construct the search query. Furthermore,
parameters with default value can be redefined for special purposes.
.. _engine request online:
.. table:: If the ``engine_type`` is ``online``
.. table:: If the ``engine_type`` is :py:obj:`online
<searx.search.processors.online.OnlineProcessor.get_params>`
:width: 100%
====================== ============== ========================================================================
@ -149,12 +152,16 @@ parameters with default value can be redefined for special purposes.
safesearch int ``0``, between ``0`` and ``2`` (normal, moderate, strict)
time_range Optional[str] ``None``, can be ``day``, ``week``, ``month``, ``year``
pageno int current pagenumber
language str specific language code like ``'en_US'``, or ``'all'`` if unspecified
searxng_locale str SearXNG's locale selected by user. Specific language code like
``'en'``, ``'en-US'``, or ``'all'`` if unspecified.
====================== ============== ========================================================================
.. table:: If the ``engine_type`` is ``online_dictionary``, in addition to the
``online`` arguments:
.. _engine request online_dictionary:
.. table:: If the ``engine_type`` is :py:obj:`online_dictionary
<searx.search.processors.online_dictionary.OnlineDictionaryProcessor.get_params>`,
in addition to the :ref:`online <engine request online>` arguments:
:width: 100%
====================== ============== ========================================================================
@ -165,8 +172,11 @@ parameters with default value can be redefined for special purposes.
query str the text query without the languages
====================== ============== ========================================================================
.. table:: If the ``engine_type`` is ``online_currency```, in addition to the
``online`` arguments:
.. _engine request online_currency:
.. table:: If the ``engine_type`` is :py:obj:`online_currency
<searx.search.processors.online_currency.OnlineCurrencyProcessor.get_params>`,
in addition to the :ref:`online <engine request online>` arguments:
:width: 100%
====================== ============== ========================================================================
@ -179,6 +189,26 @@ parameters with default value can be redefined for special purposes.
to_name str currency name
====================== ============== ========================================================================
.. _engine request online_url_search:
.. table:: If the ``engine_type`` is :py:obj:`online_url_search
<searx.search.processors.online_url_search.OnlineUrlSearchProcessor.get_params>`,
in addition to the :ref:`online <engine request online>` arguments:
:width: 100%
====================== ============== ========================================================================
argument type default-value, information
====================== ============== ========================================================================
search_url dict URLs from the search query:
.. code:: python
{
'http': str,
'ftp': str,
'data:image': str
}
====================== ============== ========================================================================
Specify Request
---------------

View File

@ -52,12 +52,12 @@ Scripts to update static data in :origin:`searx/data/`
:members:
``update_languages.py``
=======================
``update_engine_traits.py``
===========================
:origin:`[source] <searxng_extra/update/update_languages.py>`
:origin:`[source] <searxng_extra/update/update_engine_traits.py>`
.. automodule:: searxng_extra.update.update_languages
.. automodule:: searxng_extra.update.update_engine_traits
:members:

View File

@ -0,0 +1,9 @@
.. _archlinux engine:
==========
Arch Linux
==========
.. automodule:: searx.engines.archlinux
:members:

View File

@ -0,0 +1,8 @@
.. _dailymotion engine:
===========
Dailymotion
===========
.. automodule:: searx.engines.dailymotion
:members:

View File

@ -0,0 +1,22 @@
.. _duckduckgo engines:
=================
DukcDukGo engines
=================
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
.. automodule:: searx.engines.duckduckgo
:members:
.. automodule:: searx.engines.duckduckgo_images
:members:
.. automodule:: searx.engines.duckduckgo_definitions
:members:
.. automodule:: searx.engines.duckduckgo_weather
:members:

View File

@ -0,0 +1,17 @@
.. _searx.enginelib:
============
Engine model
============
.. automodule:: searx.enginelib
:members:
.. _searx.enginelib.traits:
=============
Engine traits
=============
.. automodule:: searx.enginelib.traits
:members:

View File

@ -0,0 +1,43 @@
.. _bing engines:
============
Bing Engines
============
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
.. _bing web engine:
Bing WEB
========
.. automodule:: searx.engines.bing
:members:
.. _bing images engine:
Bing Images
===========
.. automodule:: searx.engines.bing_images
:members:
.. _bing videos engine:
Bing Videos
===========
.. automodule:: searx.engines.bing_videos
:members:
.. _bing news engine:
Bing News
=========
.. automodule:: searx.engines.bing_news
:members:

View File

@ -12,15 +12,21 @@ Google Engines
.. _google API:
google API
Google API
==========
.. _Query Parameter Definitions:
https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
SearXNG's implementation of the Google API is mainly done in
:py:obj:`get_google_info <searx.engines.google.get_google_info>`.
For detailed description of the *REST-full* API see: `Query Parameter
Definitions`_. Not all parameters can be appied and some engines are *special*
(e.g. :ref:`google news engine`).
Definitions`_. The linked API documentation can sometimes be helpful during
reverse engineering. However, we cannot use it in the freely accessible WEB
services; not all parameters can be applied and some engines are more *special*
than other (e.g. :ref:`google news engine`).
.. _google web engine:
@ -30,6 +36,13 @@ Google WEB
.. automodule:: searx.engines.google
:members:
.. _google autocomplete:
Google Autocomplete
====================
.. autofunction:: searx.autocomplete.google_complete
.. _google images engine:
Google Images
@ -53,3 +66,11 @@ Google News
.. automodule:: searx.engines.google_news
:members:
.. _google scholar engine:
Google Scholar
==============
.. automodule:: searx.engines.google_scholar
:members:

View File

@ -0,0 +1,27 @@
.. _peertube engines:
================
Peertube Engines
================
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
.. _peertube video engine:
Peertube Video
==============
.. automodule:: searx.engines.peertube
:members:
.. _sepiasearch engine:
SepiaSearch
===========
.. automodule:: searx.engines.sepiasearch
:members:

View File

@ -1,8 +1,8 @@
.. _load_engines:
.. _searx.engines:
============
Load Engines
============
=================
SearXNG's engines
=================
.. automodule:: searx.engines
:members:

View File

@ -0,0 +1,13 @@
.. _startpage engines:
=================
Startpage engines
=================
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
.. automodule:: searx.engines.startpage
:members:

View File

@ -0,0 +1,27 @@
.. _wikimedia engines:
=========
Wikimedia
=========
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
.. _wikipedia engine:
Wikipedia
=========
.. automodule:: searx.engines.wikipedia
:members:
.. _wikidata engine:
Wikidata
=========
.. automodule:: searx.engines.wikidata
:members:

View File

@ -4,5 +4,17 @@
Locales
=======
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
.. automodule:: searx.locales
:members:
SearXNG's locale codes
======================
.. automodule:: searx.sxng_locales
:members:

View File

@ -0,0 +1,47 @@
.. _searx.search.processors:
=================
Search processors
=================
.. contents:: Contents
:depth: 2
:local:
:backlinks: entry
Abstract processor class
========================
.. automodule:: searx.search.processors.abstract
:members:
Offline processor
=================
.. automodule:: searx.search.processors.offline
:members:
Online processor
================
.. automodule:: searx.search.processors.online
:members:
Online currency processor
=========================
.. automodule:: searx.search.processors.online_currency
:members:
Online Dictionary processor
===========================
.. automodule:: searx.search.processors.online_dictionary
:members:
Online URL search processor
===========================
.. automodule:: searx.search.processors.online_url_search
:members:

2
manage
View File

@ -63,7 +63,7 @@ PYLINT_SEARXNG_DISABLE_OPTION="\
I,C,R,\
W0105,W0212,W0511,W0603,W0613,W0621,W0702,W0703,W1401,\
E1136"
PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES="supported_languages,language_aliases,logger,categories"
PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES="traits,supported_languages,language_aliases,logger,categories"
PYLINT_OPTIONS="-m pylint -j 0 --rcfile .pylintrc"
help() {

View File

@ -1,5 +1,5 @@
certifi==2022.12.7
babel==2.11.0
babel==2.12.1
flask-babel==3.0.1
flask==2.2.3
jinja2==3.1.2

View File

@ -5,20 +5,20 @@
"""
# pylint: disable=use-dict-literal
from json import loads
import json
from urllib.parse import urlencode
from lxml import etree
import lxml
from httpx import HTTPError
from searx import settings
from searx.data import ENGINES_LANGUAGES
from searx.engines import (
engines,
google,
)
from searx.network import get as http_get
from searx.exceptions import SearxEngineResponseException
# a fetch_supported_languages() for XPath engines isn't available right now
# _brave = ENGINES_LANGUAGES['brave'].keys()
def get(*args, **kwargs):
if 'timeout' not in kwargs:
@ -55,34 +55,58 @@ def dbpedia(query, _lang):
results = []
if response.ok:
dom = etree.fromstring(response.content)
dom = lxml.etree.fromstring(response.content)
results = dom.xpath('//Result/Label//text()')
return results
def duckduckgo(query, _lang):
# duckduckgo autocompleter
url = 'https://ac.duckduckgo.com/ac/?{0}&type=list'
def duckduckgo(query, sxng_locale):
"""Autocomplete from DuckDuckGo. Supports DuckDuckGo's languages"""
resp = loads(get(url.format(urlencode(dict(q=query)))).text)
if len(resp) > 1:
return resp[1]
return []
traits = engines['duckduckgo'].traits
args = {
'q': query,
'kl': traits.get_region(sxng_locale, traits.all_locale),
}
url = 'https://duckduckgo.com/ac/?type=list&' + urlencode(args)
resp = get(url)
ret_val = []
if resp.ok:
j = resp.json()
if len(j) > 1:
ret_val = j[1]
return ret_val
def google(query, lang):
# google autocompleter
autocomplete_url = 'https://suggestqueries.google.com/complete/search?client=toolbar&'
def google_complete(query, sxng_locale):
"""Autocomplete from Google. Supports Google's languages and subdomains
(:py:obj:`searx.engines.google.get_google_info`) by using the async REST
API::
response = get(autocomplete_url + urlencode(dict(hl=lang, q=query)))
https://{subdomain}/complete/search?{args}
"""
google_info = google.get_google_info({'searxng_locale': sxng_locale}, engines['google'].traits)
url = 'https://{subdomain}/complete/search?{args}'
args = urlencode(
{
'q': query,
'client': 'gws-wiz',
'hl': google_info['params']['hl'],
}
)
results = []
if response.ok:
dom = etree.fromstring(response.text)
results = dom.xpath('//suggestion/@data')
resp = get(url.format(subdomain=google_info['subdomain'], args=args))
if resp.ok:
json_txt = resp.text[resp.text.find('[') : resp.text.find(']', -3) + 1]
data = json.loads(json_txt)
for item in data[0]:
results.append(lxml.html.fromstring(item[0]).text_content())
return results
@ -109,9 +133,9 @@ def seznam(query, _lang):
]
def startpage(query, lang):
# startpage autocompleter
lui = ENGINES_LANGUAGES['startpage'].get(lang, 'english')
def startpage(query, sxng_locale):
"""Autocomplete from Startpage. Supports Startpage's languages"""
lui = engines['startpage'].traits.get_language(sxng_locale, 'english')
url = 'https://startpage.com/suggestions?{query}'
resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui})))
data = resp.json()
@ -122,20 +146,20 @@ def swisscows(query, _lang):
# swisscows autocompleter
url = 'https://swisscows.ch/api/suggest?{query}&itemsCount=5'
resp = loads(get(url.format(query=urlencode({'query': query}))).text)
resp = json.loads(get(url.format(query=urlencode({'query': query}))).text)
return resp
def qwant(query, lang):
# qwant autocompleter (additional parameter : lang=en_en&count=xxx )
url = 'https://api.qwant.com/api/suggest?{query}'
resp = get(url.format(query=urlencode({'q': query, 'lang': lang})))
def qwant(query, sxng_locale):
"""Autocomplete from Qwant. Supports Qwant's regions."""
results = []
locale = engines['qwant'].traits.get_region(sxng_locale, 'en_US')
url = 'https://api.qwant.com/v3/suggest?{query}'
resp = get(url.format(query=urlencode({'q': query, 'locale': locale, 'version': '2'})))
if resp.ok:
data = loads(resp.text)
data = resp.json()
if data['status'] == 'success':
for item in data['data']['items']:
results.append(item['value'])
@ -143,21 +167,38 @@ def qwant(query, lang):
return results
def wikipedia(query, lang):
# wikipedia autocompleter
url = 'https://' + lang + '.wikipedia.org/w/api.php?action=opensearch&{0}&limit=10&namespace=0&format=json'
def wikipedia(query, sxng_locale):
"""Autocomplete from Wikipedia. Supports Wikipedia's languages (aka netloc)."""
results = []
eng_traits = engines['wikipedia'].traits
wiki_lang = eng_traits.get_language(sxng_locale, 'en')
wiki_netloc = eng_traits.custom['wiki_netloc'].get(wiki_lang, 'en.wikipedia.org')
resp = loads(get(url.format(urlencode(dict(search=query)))).text)
if len(resp) > 1:
return resp[1]
return []
url = 'https://{wiki_netloc}/w/api.php?{args}'
args = urlencode(
{
'action': 'opensearch',
'format': 'json',
'formatversion': '2',
'search': query,
'namespace': '0',
'limit': '10',
}
)
resp = get(url.format(args=args, wiki_netloc=wiki_netloc))
if resp.ok:
data = resp.json()
if len(data) > 1:
results = data[1]
return results
def yandex(query, _lang):
# yandex autocompleter
url = "https://suggest.yandex.com/suggest-ff.cgi?{0}"
resp = loads(get(url.format(urlencode(dict(part=query)))).text)
resp = json.loads(get(url.format(urlencode(dict(part=query)))).text)
if len(resp) > 1:
return resp[1]
return []
@ -166,7 +207,7 @@ def yandex(query, _lang):
backends = {
'dbpedia': dbpedia,
'duckduckgo': duckduckgo,
'google': google,
'google': google_complete,
'seznam': seznam,
'startpage': startpage,
'swisscows': swisscows,
@ -177,12 +218,11 @@ backends = {
}
def search_autocomplete(backend_name, query, lang):
def search_autocomplete(backend_name, query, sxng_locale):
backend = backends.get(backend_name)
if backend is None:
return []
try:
return backend(query, lang)
return backend(query, sxng_locale)
except (HTTPError, SearxEngineResponseException):
return []

View File

@ -7,7 +7,7 @@
"""
__all__ = [
'ENGINES_LANGUAGES',
'ENGINE_TRAITS',
'CURRENCIES',
'USER_AGENTS',
'EXTERNAL_URLS',
@ -42,7 +42,6 @@ def ahmia_blacklist_loader():
return f.read().split()
ENGINES_LANGUAGES = _load('engines_languages.json')
CURRENCIES = _load('currencies.json')
USER_AGENTS = _load('useragents.json')
EXTERNAL_URLS = _load('external_urls.json')
@ -50,3 +49,4 @@ WIKIDATA_UNITS = _load('wikidata_units.json')
EXTERNAL_BANGS = _load('external_bangs.json')
OSM_KEYS_TAGS = _load('osm_keys_tags.json')
ENGINE_DESCRIPTIONS = _load('engine_descriptions.json')
ENGINE_TRAITS = _load('engine_traits.json')

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

136
searx/enginelib/__init__.py Normal file
View File

@ -0,0 +1,136 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Engine related implementations
.. note::
The long term goal is to modularize all relevant implementations to the
engines here in this Python package. In addition to improved modularization,
this will also be necessary in part because the probability of circular
imports will increase due to the increased typification of implementations in
the future.
ToDo:
- move :py:obj:`searx.engines.load_engine` to a new module `searx.enginelib`.
"""
from __future__ import annotations
from typing import Union, Dict, List, Callable, TYPE_CHECKING
if TYPE_CHECKING:
from searx.enginelib import traits
class Engine: # pylint: disable=too-few-public-methods
"""Class of engine instances build from YAML settings.
Further documentation see :ref:`general engine configuration`.
.. hint::
This class is currently never initialized and only used for type hinting.
"""
# Common options in the engine module
engine_type: str
"""Type of the engine (:origin:`searx/search/processors`)"""
paging: bool
"""Engine supports multiple pages."""
time_range_support: bool
"""Engine supports search time range."""
safesearch: bool
"""Engine supports SafeSearch"""
language_support: bool
"""Engine supports languages (locales) search."""
language: str
"""For an engine, when there is ``language: ...`` in the YAML settings the engine
does support only this one language:
.. code:: yaml
- name: google french
engine: google
language: fr
"""
region: str
"""For an engine, when there is ``region: ...`` in the YAML settings the engine
does support only this one region::
.. code:: yaml
- name: google belgium
engine: google
region: fr-BE
"""
fetch_traits: Callable
"""Function to to fetch engine's traits from origin."""
traits: traits.EngineTraits
"""Traits of the engine."""
# settings.yml
categories: List[str]
"""Tabs, in which the engine is working."""
name: str
"""Name that will be used across SearXNG to define this engine. In settings, on
the result page .."""
engine: str
"""Name of the python file used to handle requests and responses to and from
this search engine (file name from :origin:`searx/engines` without
``.py``)."""
enable_http: bool
"""Enable HTTP (by default only HTTPS is enabled)."""
shortcut: str
"""Code used to execute bang requests (``!foo``)"""
timeout: float
"""Specific timeout for search-engine."""
display_error_messages: bool
"""Display error messages on the web UI."""
proxies: dict
"""Set proxies for a specific engine (YAML):
.. code:: yaml
proxies :
http: socks5://proxy:port
https: socks5://proxy:port
"""
disabled: bool
"""To disable by default the engine, but not deleting it. It will allow the
user to manually activate it in the settings."""
inactive: bool
"""Remove the engine from the settings (*disabled & removed*)."""
about: dict
"""Additional fileds describing the engine.
.. code:: yaml
about:
website: https://example.com
wikidata_id: Q306656
official_api_documentation: https://example.com/api-doc
use_official_api: true
require_api_key: true
results: HTML
"""

250
searx/enginelib/traits.py Normal file
View File

@ -0,0 +1,250 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Engine's traits are fetched from the origin engines and stored in a JSON file
in the *data folder*. Most often traits are languages and region codes and
their mapping from SearXNG's representation to the representation in the origin
search engine. For new traits new properties can be added to the class
:py:class:`EngineTraits`.
To load traits from the persistence :py:obj:`EngineTraitsMap.from_data` can be
used.
"""
from __future__ import annotations
import json
import dataclasses
from typing import Dict, Union, Callable, Optional, TYPE_CHECKING
from typing_extensions import Literal, Self
from searx import locales
from searx.data import data_dir, ENGINE_TRAITS
if TYPE_CHECKING:
from . import Engine
class EngineTraitsEncoder(json.JSONEncoder):
"""Encodes :class:`EngineTraits` to a serializable object, see
:class:`json.JSONEncoder`."""
def default(self, o):
"""Return dictionary of a :class:`EngineTraits` object."""
if isinstance(o, EngineTraits):
return o.__dict__
return super().default(o)
@dataclasses.dataclass
class EngineTraits:
"""The class is intended to be instantiated for each engine."""
regions: Dict[str, str] = dataclasses.field(default_factory=dict)
"""Maps SearXNG's internal representation of a region to the one of the engine.
SearXNG's internal representation can be parsed by babel and the value is
send to the engine:
.. code:: python
regions ={
'fr-BE' : <engine's region name>,
}
for key, egnine_region regions.items():
searxng_region = babel.Locale.parse(key, sep='-')
...
"""
languages: Dict[str, str] = dataclasses.field(default_factory=dict)
"""Maps SearXNG's internal representation of a language to the one of the engine.
SearXNG's internal representation can be parsed by babel and the value is
send to the engine:
.. code:: python
languages = {
'ca' : <engine's language name>,
}
for key, egnine_lang in languages.items():
searxng_lang = babel.Locale.parse(key)
...
"""
all_locale: Optional[str] = None
"""To which locale value SearXNG's ``all`` language is mapped (shown a "Default
language").
"""
data_type: Literal['traits_v1'] = 'traits_v1'
"""Data type, default is 'traits_v1'.
"""
custom: Dict[str, Dict] = dataclasses.field(default_factory=dict)
"""A place to store engine's custom traits, not related to the SearXNG core
"""
def get_language(self, searxng_locale: str, default=None):
"""Return engine's language string that *best fits* to SearXNG's locale.
:param searxng_locale: SearXNG's internal representation of locale
selected by the user.
:param default: engine's default language
The *best fits* rules are implemented in
:py:obj:`locales.get_engine_locale`. Except for the special value ``all``
which is determined from :py:obj`EngineTraits.all_language`.
"""
if searxng_locale == 'all' and self.all_locale is not None:
return self.all_locale
return locales.get_engine_locale(searxng_locale, self.languages, default=default)
def get_region(self, searxng_locale: str, default=None):
"""Return engine's region string that best fits to SearXNG's locale.
:param searxng_locale: SearXNG's internal representation of locale
selected by the user.
:param default: engine's default region
The *best fits* rules are implemented in
:py:obj:`locales.get_engine_locale`. Except for the special value ``all``
which is determined from :py:obj`EngineTraits.all_language`.
"""
if searxng_locale == 'all' and self.all_locale is not None:
return self.all_locale
return locales.get_engine_locale(searxng_locale, self.regions, default=default)
def is_locale_supported(self, searxng_locale: str) -> bool:
"""A *locale* (SearXNG's internal representation) is considered to be supported
by the engine if the *region* or the *language* is supported by the
engine. For verification the functions :py:func:`self.get_region` and
:py:func:`self.get_region` are used.
"""
if self.data_type == 'traits_v1':
return bool(self.get_region(searxng_locale) or self.get_language(searxng_locale))
raise TypeError('engine traits of type %s is unknown' % self.data_type)
def copy(self):
"""Create a copy of the dataclass object."""
return EngineTraits(**dataclasses.asdict(self))
@classmethod
def fetch_traits(cls, engine: Engine) -> Union[Self, None]:
"""Call a function ``fetch_traits(engine_traits)`` from engines namespace to fetch
and set properties from the origin engine in the object ``engine_traits``. If
function does not exists, ``None`` is returned.
"""
fetch_traits = getattr(engine, 'fetch_traits', None)
engine_traits = None
if fetch_traits:
engine_traits = cls()
fetch_traits(engine_traits)
return engine_traits
def set_traits(self, engine: Engine):
"""Set traits from self object in a :py:obj:`.Engine` namespace.
:param engine: engine instance build by :py:func:`searx.engines.load_engine`
"""
if self.data_type == 'traits_v1':
self._set_traits_v1(engine)
else:
raise TypeError('engine traits of type %s is unknown' % self.data_type)
def _set_traits_v1(self, engine: Engine):
# For an engine, when there is `language: ...` in the YAML settings the engine
# does support only this one language (region)::
#
# - name: google italian
# engine: google
# language: it
# region: it-IT
traits = self.copy()
_msg = "settings.yml - engine: '%s' / %s: '%s' not supported"
languages = traits.languages
if hasattr(engine, 'language'):
if engine.language not in languages:
raise ValueError(_msg % (engine.name, 'language', engine.language))
traits.languages = {engine.language: languages[engine.language]}
regions = traits.regions
if hasattr(engine, 'region'):
if engine.region not in regions:
raise ValueError(_msg % (engine.name, 'region', engine.region))
traits.regions = {engine.region: regions[engine.region]}
engine.language_support = bool(traits.languages or traits.regions)
# set the copied & modified traits in engine's namespace
engine.traits = traits
class EngineTraitsMap(Dict[str, EngineTraits]):
"""A python dictionary to map :class:`EngineTraits` by engine name."""
ENGINE_TRAITS_FILE = (data_dir / 'engine_traits.json').resolve()
"""File with persistence of the :py:obj:`EngineTraitsMap`."""
def save_data(self):
"""Store EngineTraitsMap in in file :py:obj:`self.ENGINE_TRAITS_FILE`"""
with open(self.ENGINE_TRAITS_FILE, 'w', encoding='utf-8') as f:
json.dump(self, f, indent=2, sort_keys=True, cls=EngineTraitsEncoder)
@classmethod
def from_data(cls) -> Self:
"""Instantiate :class:`EngineTraitsMap` object from :py:obj:`ENGINE_TRAITS`"""
obj = cls()
for k, v in ENGINE_TRAITS.items():
obj[k] = EngineTraits(**v)
return obj
@classmethod
def fetch_traits(cls, log: Callable) -> Self:
from searx import engines # pylint: disable=cyclic-import, import-outside-toplevel
names = list(engines.engines)
names.sort()
obj = cls()
for engine_name in names:
engine = engines.engines[engine_name]
traits = EngineTraits.fetch_traits(engine)
if traits is not None:
log("%-20s: SearXNG languages --> %s " % (engine_name, len(traits.languages)))
log("%-20s: SearXNG regions --> %s" % (engine_name, len(traits.regions)))
obj[engine_name] = traits
return obj
def set_traits(self, engine: Engine):
"""Set traits in a :py:obj:`Engine` namespace.
:param engine: engine instance build by :py:func:`searx.engines.load_engine`
"""
engine_traits = EngineTraits(data_type='traits_v1')
if engine.name in self.keys():
engine_traits = self[engine.name]
elif engine.engine in self.keys():
# The key of the dictionary traits_map is the *engine name*
# configured in settings.xml. When multiple engines are configured
# in settings.yml to use the same origin engine (python module)
# these additional engines can use the languages from the origin
# engine. For this use the configured ``engine: ...`` from
# settings.yml
engine_traits = self[engine.engine]
engine_traits.set_traits(engine)

View File

@ -11,24 +11,22 @@ usage::
"""
from __future__ import annotations
import sys
import copy
from typing import Dict, List, Optional
from os.path import realpath, dirname
from babel.localedata import locale_identifiers
from searx import logger, settings
from searx.data import ENGINES_LANGUAGES
from searx.network import get
from searx.utils import load_module, match_language, gen_useragent
from typing import TYPE_CHECKING, Dict, Optional
from searx import logger, settings
from searx.utils import load_module
if TYPE_CHECKING:
from searx.enginelib import Engine
logger = logger.getChild('engines')
ENGINE_DIR = dirname(realpath(__file__))
BABEL_LANGS = [
lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0]
for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers())
]
ENGINE_DEFAULT_ARGS = {
"engine_type": "online",
"inactive": False,
@ -36,8 +34,6 @@ ENGINE_DEFAULT_ARGS = {
"timeout": settings["outgoing"]["request_timeout"],
"shortcut": "-",
"categories": ["general"],
"supported_languages": [],
"language_aliases": {},
"paging": False,
"safesearch": False,
"time_range_support": False,
@ -52,24 +48,6 @@ ENGINE_DEFAULT_ARGS = {
OTHER_CATEGORY = 'other'
class Engine: # pylint: disable=too-few-public-methods
"""This class is currently never initialized and only used for type hinting."""
name: str
engine: str
shortcut: str
categories: List[str]
supported_languages: List[str]
about: dict
inactive: bool
disabled: bool
language_support: bool
paging: bool
safesearch: bool
time_range_support: bool
timeout: float
# Defaults for the namespace of an engine module, see :py:func:`load_engine`
categories = {'general': []}
@ -136,9 +114,15 @@ def load_engine(engine_data: dict) -> Optional[Engine]:
return None
update_engine_attributes(engine, engine_data)
set_language_attributes(engine)
update_attributes_for_tor(engine)
# avoid cyclic imports
# pylint: disable=import-outside-toplevel
from searx.enginelib.traits import EngineTraitsMap
trait_map = EngineTraitsMap.from_data()
trait_map.set_traits(engine)
if not is_engine_active(engine):
return None
@ -190,60 +174,6 @@ def update_engine_attributes(engine: Engine, engine_data):
setattr(engine, arg_name, copy.deepcopy(arg_value))
def set_language_attributes(engine: Engine):
# assign supported languages from json file
if engine.name in ENGINES_LANGUAGES:
engine.supported_languages = ENGINES_LANGUAGES[engine.name]
elif engine.engine in ENGINES_LANGUAGES:
# The key of the dictionary ENGINES_LANGUAGES is the *engine name*
# configured in settings.xml. When multiple engines are configured in
# settings.yml to use the same origin engine (python module) these
# additional engines can use the languages from the origin engine.
# For this use the configured ``engine: ...`` from settings.yml
engine.supported_languages = ENGINES_LANGUAGES[engine.engine]
if hasattr(engine, 'language'):
# For an engine, when there is `language: ...` in the YAML settings, the
# engine supports only one language, in this case
# engine.supported_languages should contains this value defined in
# settings.yml
if engine.language not in engine.supported_languages:
raise ValueError(
"settings.yml - engine: '%s' / language: '%s' not supported" % (engine.name, engine.language)
)
if isinstance(engine.supported_languages, dict):
engine.supported_languages = {engine.language: engine.supported_languages[engine.language]}
else:
engine.supported_languages = [engine.language]
# find custom aliases for non standard language codes
for engine_lang in engine.supported_languages:
iso_lang = match_language(engine_lang, BABEL_LANGS, fallback=None)
if (
iso_lang
and iso_lang != engine_lang
and not engine_lang.startswith(iso_lang)
and iso_lang not in engine.supported_languages
):
engine.language_aliases[iso_lang] = engine_lang
# language_support
engine.language_support = len(engine.supported_languages) > 0
# assign language fetching method if auxiliary method exists
if hasattr(engine, '_fetch_supported_languages'):
headers = {
'User-Agent': gen_useragent(),
'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language
}
engine.fetch_supported_languages = (
# pylint: disable=protected-access
lambda: engine._fetch_supported_languages(get(engine.supported_languages_url, headers=headers))
)
def update_attributes_for_tor(engine: Engine) -> bool:
if using_tor_proxy(engine) and hasattr(engine, 'onion_url'):
engine.search_url = engine.onion_url + getattr(engine, 'search_path', '')

View File

@ -1,15 +1,32 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
Arch Linux Wiki
~~~~~~~~~~~~~~~
This implementation does not use a official API: Mediawiki provides API, but
Arch Wiki blocks access to it.
API: Mediawiki provides API, but Arch Wiki blocks access to it
"""
from urllib.parse import urlencode, urljoin
from lxml import html
from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex
from typing import TYPE_CHECKING
from urllib.parse import urlencode, urljoin, urlparse
import lxml
import babel
from searx import network
from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex
from searx.enginelib.traits import EngineTraits
from searx.locales import language_tag
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://wiki.archlinux.org/',
"wikidata_id": 'Q101445877',
@ -22,125 +39,113 @@ about = {
# engine dependent config
categories = ['it', 'software wikis']
paging = True
base_url = 'https://wiki.archlinux.org'
# xpath queries
xpath_results = '//ul[@class="mw-search-results"]/li'
xpath_link = './/div[@class="mw-search-result-heading"]/a'
main_wiki = 'wiki.archlinux.org'
# cut 'en' from 'en-US', 'de' from 'de-CH', and so on
def locale_to_lang_code(locale):
if locale.find('-') >= 0:
locale = locale.split('-')[0]
return locale
# wikis for some languages were moved off from the main site, we need to make
# requests to correct URLs to be able to get results in those languages
lang_urls = {
# fmt: off
'all': {
'base': 'https://wiki.archlinux.org',
'search': '/index.php?title=Special:Search&offset={offset}&{query}'
},
'de': {
'base': 'https://wiki.archlinux.de',
'search': '/index.php?title=Spezial:Suche&offset={offset}&{query}'
},
'fr': {
'base': 'https://wiki.archlinux.fr',
'search': '/index.php?title=Spécial:Recherche&offset={offset}&{query}'
},
'ja': {
'base': 'https://wiki.archlinuxjp.org',
'search': '/index.php?title=特別:検索&offset={offset}&{query}'
},
'ro': {
'base': 'http://wiki.archlinux.ro',
'search': '/index.php?title=Special:Căutare&offset={offset}&{query}'
},
'tr': {
'base': 'http://archtr.org/wiki',
'search': '/index.php?title=Özel:Ara&offset={offset}&{query}'
}
# fmt: on
}
# get base & search URLs for selected language
def get_lang_urls(language):
if language in lang_urls:
return lang_urls[language]
return lang_urls['all']
# Language names to build search requests for
# those languages which are hosted on the main site.
main_langs = {
'ar': 'العربية',
'bg': 'Български',
'cs': 'Česky',
'da': 'Dansk',
'el': 'Ελληνικά',
'es': 'Español',
'he': 'עברית',
'hr': 'Hrvatski',
'hu': 'Magyar',
'it': 'Italiano',
'ko': '한국어',
'lt': 'Lietuviškai',
'nl': 'Nederlands',
'pl': 'Polski',
'pt': 'Português',
'ru': 'Русский',
'sl': 'Slovenský',
'th': 'ไทย',
'uk': 'Українська',
'zh': '简体中文',
}
supported_languages = dict(lang_urls, **main_langs)
# do search-request
def request(query, params):
# translate the locale (e.g. 'en-US') to language code ('en')
language = locale_to_lang_code(params['language'])
# if our language is hosted on the main site, we need to add its name
# to the query in order to narrow the results to that language
if language in main_langs:
query += ' (' + main_langs[language] + ')'
# prepare the request parameters
query = urlencode({'search': query})
sxng_lang = params['searxng_locale'].split('-')[0]
netloc = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki)
title = traits.custom['title'].get(sxng_lang, 'Special:Search')
base_url = 'https://' + netloc + '/index.php?'
offset = (params['pageno'] - 1) * 20
# get request URLs for our language of choice
urls = get_lang_urls(language)
search_url = urls['base'] + urls['search']
if netloc == main_wiki:
eng_lang: str = traits.get_language(sxng_lang, 'English')
query += ' (' + eng_lang + ')'
elif netloc == 'wiki.archlinuxcn.org':
base_url = 'https://' + netloc + '/wzh/index.php?'
params['url'] = search_url.format(query=query, offset=offset)
args = {
'search': query,
'title': title,
'limit': 20,
'offset': offset,
'profile': 'default',
}
params['url'] = base_url + urlencode(args)
return params
# get response from search-request
def response(resp):
# get the base URL for the language in which request was made
language = locale_to_lang_code(resp.search_params['language'])
base_url = get_lang_urls(language)['base']
results = []
dom = lxml.html.fromstring(resp.text)
dom = html.fromstring(resp.text)
# get the base URL for the language in which request was made
sxng_lang = resp.search_params['searxng_locale'].split('-')[0]
netloc = traits.custom['wiki_netloc'].get(sxng_lang, main_wiki)
base_url = 'https://' + netloc + '/index.php?'
# parse results
for result in eval_xpath_list(dom, xpath_results):
link = eval_xpath_getindex(result, xpath_link, 0)
href = urljoin(base_url, link.attrib.get('href'))
title = extract_text(link)
results.append({'url': href, 'title': title})
for result in eval_xpath_list(dom, '//ul[@class="mw-search-results"]/li'):
link = eval_xpath_getindex(result, './/div[@class="mw-search-result-heading"]/a', 0)
content = extract_text(result.xpath('.//div[@class="searchresult"]'))
results.append(
{
'url': urljoin(base_url, link.get('href')),
'title': extract_text(link),
'content': content,
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages from Archlinix-Wiki. The location of the Wiki address of a
language is mapped in a :py:obj:`custom field
<searx.enginelib.traits.EngineTraits.custom>` (``wiki_netloc``). Depending
on the location, the ``title`` argument in the request is translated.
.. code:: python
"custom": {
"wiki_netloc": {
"de": "wiki.archlinux.de",
# ...
"zh": "wiki.archlinuxcn.org"
}
"title": {
"de": "Spezial:Suche",
# ...
"zh": "Special:\u641c\u7d22"
},
},
"""
engine_traits.custom['wiki_netloc'] = {}
engine_traits.custom['title'] = {}
title_map = {
'de': 'Spezial:Suche',
'fa': 'ویژه:جستجو',
'ja': '特別:検索',
'zh': 'Special:搜索',
}
resp = network.get('https://wiki.archlinux.org/')
if not resp.ok:
print("ERROR: response from wiki.archlinix.org is not OK.")
dom = lxml.html.fromstring(resp.text)
for a in eval_xpath_list(dom, "//a[@class='interlanguage-link-target']"):
sxng_tag = language_tag(babel.Locale.parse(a.get('lang'), sep='-'))
# zh_Hans --> zh
sxng_tag = sxng_tag.split('_')[0]
netloc = urlparse(a.get('href')).netloc
if netloc != 'wiki.archlinux.org':
title = title_map.get(sxng_tag)
if not title:
print("ERROR: title tag from %s (%s) is unknown" % (netloc, sxng_tag))
continue
engine_traits.custom['wiki_netloc'][sxng_tag] = netloc
engine_traits.custom['title'][sxng_tag] = title
eng_tag = extract_text(eval_xpath_list(a, ".//span"))
engine_traits.languages[sxng_tag] = eng_tag
engine_traits.languages['en'] = 'English'

View File

@ -1,16 +1,53 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bing (Web)
"""This is the implementation of the Bing-WEB engine. Some of this
implementations are shared by other engines:
- :ref:`bing images engine`
- :ref:`bing news engine`
- :ref:`bing videos engine`
On the `preference page`_ Bing offers a lot of languages an regions (see section
'Search results languages' and 'Country/region'). However, the abundant choice
does not correspond to reality, where Bing has a full-text indexer only for a
limited number of languages. By example: you can select a language like Māori
but you never get a result in this language.
What comes a bit closer to the truth are the `search-APIs`_ but they don`t seem
to be completely correct either (if you take a closer look you will find some
inaccuracies there too):
- :py:obj:`searx.engines.bing.bing_traits_url`
- :py:obj:`searx.engines.bing_videos.bing_traits_url`
- :py:obj:`searx.engines.bing_images.bing_traits_url`
- :py:obj:`searx.engines.bing_news.bing_traits_url`
.. _preference page: https://www.bing.com/account/general
.. _search-APIs: https://learn.microsoft.com/en-us/bing/search-apis/
- https://github.com/searx/searx/issues/2019#issuecomment-648227442
"""
# pylint: disable=too-many-branches
# pylint: disable=too-many-branches, invalid-name
from typing import TYPE_CHECKING
import datetime
import re
from urllib.parse import urlencode, urlparse, parse_qs
import uuid
from urllib.parse import urlencode
from lxml import html
from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language, eval_xpath_getindex
from searx.network import multi_requests, Request
import babel
import babel.languages
from searx.utils import eval_xpath, extract_text, eval_xpath_list, eval_xpath_getindex
from searx import network
from searx.locales import language_tag, region_tag
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://www.bing.com',
@ -21,56 +58,124 @@ about = {
"results": 'HTML',
}
send_accept_language_header = True
"""Bing tries to guess user's language and territory from the HTTP
Accept-Language. Optional the user can select a search-language (can be
different to the UI language) and a region (market code)."""
# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = False
safesearch = False
send_accept_language_header = True
supported_languages_url = 'https://www.bing.com/account/general'
language_aliases = {}
time_range_support = True
safesearch = True
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'} # cookie: ADLT=STRICT
# search-url
base_url = 'https://www.bing.com/'
base_url = 'https://www.bing.com/search'
"""Bing (Web) search URL"""
# initial query: https://www.bing.com/search?q=foo&search=&form=QBLH
inital_query = 'search?{query}&search=&form=QBLH'
# following queries: https://www.bing.com/search?q=foo&search=&first=11&FORM=PERE
page_query = 'search?{query}&search=&first={offset}&FORM=PERE'
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/reference/market-codes'
"""Bing (Web) search API description"""
def _get_offset_from_pageno(pageno):
return (pageno - 1) * 10 + 1
def set_bing_cookies(params, engine_language, engine_region, SID):
# set cookies
# -----------
params['cookies']['_EDGE_V'] = '1'
# _EDGE_S: F=1&SID=3A5253BD6BCA609509B741876AF961CA&mkt=zh-tw
_EDGE_S = [
'F=1',
'SID=%s' % SID,
'mkt=%s' % engine_region.lower(),
'ui=%s' % engine_language.lower(),
]
params['cookies']['_EDGE_S'] = '&'.join(_EDGE_S)
logger.debug("cookie _EDGE_S=%s", params['cookies']['_EDGE_S'])
# "_EDGE_CD": "m=zh-tw",
_EDGE_CD = [ # pylint: disable=invalid-name
'm=%s' % engine_region.lower(), # search region: zh-cn
'u=%s' % engine_language.lower(), # UI: en-us
]
params['cookies']['_EDGE_CD'] = '&'.join(_EDGE_CD) + ';'
logger.debug("cookie _EDGE_CD=%s", params['cookies']['_EDGE_CD'])
SRCHHPGUSR = [ # pylint: disable=invalid-name
'SRCHLANG=%s' % engine_language,
# Trying to set ADLT cookie here seems not to have any effect, I assume
# there is some age verification by a cookie (and/or session ID) needed,
# to disable the SafeSearch.
'ADLT=%s' % safesearch_types.get(params['safesearch'], 'DEMOTE'),
]
params['cookies']['SRCHHPGUSR'] = '&'.join(SRCHHPGUSR)
logger.debug("cookie SRCHHPGUSR=%s", params['cookies']['SRCHHPGUSR'])
def request(query, params):
"""Assemble a Bing-Web request."""
offset = _get_offset_from_pageno(params.get('pageno', 1))
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
# logger.debug("params['pageno'] --> %s", params.get('pageno'))
# logger.debug(" offset --> %s", offset)
SID = uuid.uuid1().hex.upper()
CVID = uuid.uuid1().hex.upper()
search_string = page_query
if offset == 1:
search_string = inital_query
set_bing_cookies(params, engine_language, engine_region, SID)
if params['language'] == 'all':
lang = 'EN'
else:
lang = match_language(params['language'], supported_languages, language_aliases)
# build URL query
# ---------------
query = 'language:{} {}'.format(lang.split('-')[0].upper(), query)
# query term
page = int(params.get('pageno', 1))
query_params = {
# fmt: off
'q': query,
'pq': query,
'cvid': CVID,
'qs': 'n',
'sp': '-1'
# fmt: on
}
search_path = search_string.format(query=urlencode({'q': query}), offset=offset)
if offset > 1:
referer = base_url + inital_query.format(query=urlencode({'q': query}))
# page
if page > 1:
referer = base_url + '?' + urlencode(query_params)
params['headers']['Referer'] = referer
logger.debug("headers.Referer --> %s", referer)
params['url'] = base_url + search_path
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
query_params['first'] = _get_offset_from_pageno(page)
if page == 2:
query_params['FORM'] = 'PERE'
elif page > 2:
query_params['FORM'] = 'PERE%s' % (page - 2)
filters = ''
if params['time_range']:
query_params['filt'] = 'custom'
if params['time_range'] == 'day':
filters = 'ex1:"ez1"'
elif params['time_range'] == 'week':
filters = 'ex1:"ez2"'
elif params['time_range'] == 'month':
filters = 'ex1:"ez3"'
elif params['time_range'] == 'year':
epoch_1970 = datetime.date(1970, 1, 1)
today_no = (datetime.date.today() - epoch_1970).days
filters = 'ex1:"ez5_%s_%s"' % (today_no - 365, today_no)
params['url'] = base_url + '?' + urlencode(query_params)
if filters:
params['url'] = params['url'] + '&filters=' + filters
return params
@ -107,7 +212,8 @@ def response(resp):
url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
# Bing can shorten the URL either at the end or in the middle of the string
if (
url_cite.startswith('https://')
url_cite
and url_cite.startswith('https://')
and '' not in url_cite
and '...' not in url_cite
and '' not in url_cite
@ -127,9 +233,9 @@ def response(resp):
# resolve all Bing redirections in parallel
request_list = [
Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
network.Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
]
response_list = multi_requests(request_list)
response_list = network.multi_requests(request_list)
for i, redirect_response in enumerate(response_list):
if not isinstance(redirect_response, Exception):
results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
@ -157,27 +263,71 @@ def response(resp):
return results
# get supported languages from their site
def _fetch_supported_languages(resp):
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-Web."""
lang_tags = set()
xpath_market_codes = '//table[1]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)
def _fetch_traits(engine_traits: EngineTraits, url: str, xpath_language_codes: str, xpath_market_codes: str):
# insert alias to map from a language (zh) to a language + script (zh_Hans)
engine_traits.languages['zh'] = 'zh-hans'
resp = network.get(url)
if not resp.ok:
print("ERROR: response from peertube is not OK.")
dom = html.fromstring(resp.text)
lang_links = eval_xpath(dom, '//div[@id="language-section"]//li')
for _li in lang_links:
map_lang = {'jp': 'ja'}
for td in eval_xpath(dom, xpath_language_codes):
eng_lang = td.text
href = eval_xpath(_li, './/@href')[0]
(_scheme, _netloc, _path, _params, query, _fragment) = urlparse(href)
query = parse_qs(query, keep_blank_values=True)
if eng_lang in ('en-gb', 'pt-br'):
# language 'en' is already in the list and a language 'en-gb' can't
# be handled in SearXNG, same with pt-br which is covered by pt-pt.
continue
# fmt: off
setlang = query.get('setlang', [None, ])[0]
# example: 'mn-Cyrl-MN' --> '['mn', 'Cyrl-MN']
lang, nation = (setlang.split('-', maxsplit=1) + [None,])[:2] # fmt: skip
# fmt: on
babel_lang = map_lang.get(eng_lang, eng_lang).replace('-', '_')
try:
sxng_tag = language_tag(babel.Locale.parse(babel_lang))
except babel.UnknownLocaleError:
print("ERROR: language (%s) is unknown by babel" % (eng_lang))
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
continue
engine_traits.languages[sxng_tag] = eng_lang
tag = lang + '-' + nation if nation else lang
lang_tags.add(tag)
map_region = {
'en-ID': 'id_ID',
'no-NO': 'nb_NO',
}
return list(lang_tags)
for td in eval_xpath(dom, xpath_market_codes):
eng_region = td.text
babel_region = map_region.get(eng_region, eng_region).replace('-', '_')
if eng_region == 'en-WW':
engine_traits.all_locale = eng_region
continue
try:
sxng_tag = region_tag(babel.Locale.parse(babel_region))
except babel.UnknownLocaleError:
print("ERROR: region (%s) is unknown by babel" % (eng_region))
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_region:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_region))
continue
engine_traits.regions[sxng_tag] = eng_region

View File

@ -1,20 +1,30 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bing (Images)
"""Bing-Images: description see :py:obj:`searx.engines.bing`.
"""
# pylint: disable=invalid-name
from json import loads
from typing import TYPE_CHECKING
import uuid
import json
from urllib.parse import urlencode
from lxml import html
from searx.utils import match_language
from searx.engines.bing import language_aliases
from searx.engines.bing import ( # pylint: disable=unused-import
_fetch_supported_languages,
supported_languages_url,
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import (
set_bing_cookies,
_fetch_traits,
)
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
@ -31,77 +41,92 @@ categories = ['images', 'web']
paging = True
safesearch = True
time_range_support = True
send_accept_language_header = True
supported_languages_url = 'https://www.bing.com/account/general'
number_of_results = 28
# search-url
base_url = 'https://www.bing.com/'
search_string = (
base_url = 'https://www.bing.com/images/async'
"""Bing (Images) search URL"""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-image-search/reference/market-codes'
"""Bing (Images) search API description"""
time_map = {
# fmt: off
'images/search'
'?{query}'
'&count={count}'
'&first={first}'
'&tsc=ImageHoverTitle'
'day': 60 * 24,
'week': 60 * 24 * 7,
'month': 60 * 24 * 31,
'year': 60 * 24 * 365,
# fmt: on
)
time_range_string = '&qft=+filterui:age-lt{interval}'
time_range_dict = {'day': '1440', 'week': '10080', 'month': '43200', 'year': '525600'}
# safesearch definitions
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}
}
# do search-request
def request(query, params):
offset = ((params['pageno'] - 1) * number_of_results) + 1
"""Assemble a Bing-Image request."""
search_path = search_string.format(query=urlencode({'q': query}), count=number_of_results, first=offset)
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
language = match_language(params['language'], supported_languages, language_aliases).lower()
SID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
params['cookies']['SRCHHPGUSR'] = 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
# build URL query
# - example: https://www.bing.com/images/async?q=foo&first=155&count=35
params['cookies']['_EDGE_S'] = 'mkt=' + language + '&ui=' + language + '&F=1'
query_params = {
# fmt: off
'q': query,
'async' : 'content',
# to simplify the page count lets use the default of 35 images per page
'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1,
'count' : 35,
# fmt: on
}
params['url'] = base_url + search_path
if params['time_range'] in time_range_dict:
params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
# time range
# - example: one year (525600 minutes) 'qft=+filterui:age-lt525600'
if params['time_range']:
query_params['qft'] = 'filterui:age-lt%s' % time_map[params['time_range']]
params['url'] = base_url + '?' + urlencode(query_params)
return params
# get response from search-request
def response(resp):
results = []
"""Get response from Bing-Images"""
results = []
dom = html.fromstring(resp.text)
# parse results
for result in dom.xpath('//div[@class="imgpt"]'):
img_format = result.xpath('./div[contains(@class, "img_info")]/span/text()')[0]
# Microsoft seems to experiment with this code so don't make the path too specific,
# just catch the text section for the first anchor in img_info assuming this to be
# the originating site.
source = result.xpath('./div[contains(@class, "img_info")]//a/text()')[0]
for result in dom.xpath('//ul[contains(@class, "dgControl_list")]/li'):
m = loads(result.xpath('./a/@m')[0])
metadata = result.xpath('.//a[@class="iusc"]/@m')
if not metadata:
continue
# strip 'Unicode private use area' highlighting, they render to Tux
# the Linux penguin and a standing diamond on my machine...
title = m.get('t', '').replace('\ue000', '').replace('\ue001', '')
metadata = json.loads(result.xpath('.//a[@class="iusc"]/@m')[0])
title = ' '.join(result.xpath('.//div[@class="infnmpt"]//a/text()')).strip()
img_format = ' '.join(result.xpath('.//div[@class="imgpt"]/div/span/text()')).strip()
source = ' '.join(result.xpath('.//div[@class="imgpt"]//div[@class="lnkw"]//a/text()')).strip()
results.append(
{
'template': 'images.html',
'url': m['purl'],
'thumbnail_src': m['turl'],
'img_src': m['murl'],
'content': '',
'url': metadata['purl'],
'thumbnail_src': metadata['turl'],
'img_src': metadata['murl'],
'content': metadata['desc'],
'title': title,
'source': source,
'img_format': img_format,
}
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-News."""
xpath_market_codes = '//table[1]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)

View File

@ -1,24 +1,30 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bing (News)
"""Bing-News: description see :py:obj:`searx.engines.bing`.
"""
from urllib.parse import (
urlencode,
urlparse,
parse_qsl,
quote,
)
from datetime import datetime
from dateutil import parser
from lxml import etree
from lxml.etree import XPath
from searx.utils import match_language, eval_xpath_getindex
from searx.engines.bing import ( # pylint: disable=unused-import
language_aliases,
_fetch_supported_languages,
supported_languages_url,
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
import uuid
from urllib.parse import urlencode
from lxml import html
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import (
set_bing_cookies,
_fetch_traits,
)
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
@ -34,108 +40,111 @@ about = {
categories = ['news']
paging = True
time_range_support = True
send_accept_language_header = True
time_map = {
'day': '4',
'week': '8',
'month': '9',
}
"""A string '4' means *last hour*. We use *last hour* for ``day`` here since the
difference of *last day* and *last week* in the result list is just marginally.
"""
# search-url
base_url = 'https://www.bing.com/'
search_string = 'news/search?{query}&first={offset}&format=RSS'
search_string_with_time = 'news/search?{query}&first={offset}&qft=interval%3d"{interval}"&format=RSS'
time_range_dict = {'day': '7', 'week': '8', 'month': '9'}
base_url = 'https://www.bing.com/news/infinitescrollajax'
"""Bing (News) search URL"""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-news-search/reference/market-codes'
"""Bing (News) search API description"""
def url_cleanup(url_string):
"""remove click"""
parsed_url = urlparse(url_string)
if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
query = dict(parse_qsl(parsed_url.query))
url_string = query.get('url', None)
return url_string
def image_url_cleanup(url_string):
"""replace the http://*bing.com/th?id=... by https://www.bing.com/th?id=..."""
parsed_url = urlparse(url_string)
if parsed_url.netloc.endswith('bing.com') and parsed_url.path == '/th':
query = dict(parse_qsl(parsed_url.query))
url_string = "https://www.bing.com/th?id=" + quote(query.get('id'))
return url_string
def _get_url(query, language, offset, time_range):
if time_range in time_range_dict:
search_path = search_string_with_time.format(
# fmt: off
query = urlencode({
'q': query,
'setmkt': language
}),
offset = offset,
interval = time_range_dict[time_range]
# fmt: on
)
else:
# e.g. setmkt=de-de&setlang=de
search_path = search_string.format(
# fmt: off
query = urlencode({
'q': query,
'setmkt': language
}),
offset = offset
# fmt: on
)
return base_url + search_path
mkt_alias = {
'zh': 'en-WW',
'zh-CN': 'en-WW',
}
"""Bing News has an official market code 'zh-CN' but we won't get a result with
this market code. For 'zh' and 'zh-CN' we better use the *Worldwide aggregate*
market code (en-WW).
"""
def request(query, params):
"""Assemble a Bing-News request."""
if params['time_range'] and params['time_range'] not in time_range_dict:
return params
sxng_locale = params['searxng_locale']
engine_region = traits.get_region(mkt_alias.get(sxng_locale, sxng_locale), traits.all_locale)
engine_language = traits.get_language(sxng_locale, 'en')
offset = (params['pageno'] - 1) * 10 + 1
if params['language'] == 'all':
language = 'en-US'
else:
language = match_language(params['language'], supported_languages, language_aliases)
params['url'] = _get_url(query, language, offset, params['time_range'])
SID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
# build URL query
#
# example: https://www.bing.com/news/infinitescrollajax?q=london&first=1
query_params = {
# fmt: off
'q': query,
'InfiniteScroll': 1,
# to simplify the page count lets use the default of 10 images per page
'first' : (int(params.get('pageno', 1)) - 1) * 10 + 1,
# fmt: on
}
if params['time_range']:
# qft=interval:"7"
query_params['qft'] = 'qft=interval="%s"' % time_map.get(params['time_range'], '9')
params['url'] = base_url + '?' + urlencode(query_params)
return params
def response(resp):
"""Get response from Bing-Video"""
results = []
rss = etree.fromstring(resp.content)
namespaces = rss.nsmap
for item in rss.xpath('./channel/item'):
# url / title / content
url = url_cleanup(eval_xpath_getindex(item, './link/text()', 0, default=None))
title = eval_xpath_getindex(item, './title/text()', 0, default=url)
content = eval_xpath_getindex(item, './description/text()', 0, default='')
if not resp.ok or not resp.text:
return results
# publishedDate
publishedDate = eval_xpath_getindex(item, './pubDate/text()', 0, default=None)
try:
publishedDate = parser.parse(publishedDate, dayfirst=False)
except TypeError:
publishedDate = datetime.now()
except ValueError:
publishedDate = datetime.now()
dom = html.fromstring(resp.text)
# thumbnail
thumbnail = eval_xpath_getindex(item, XPath('./News:Image/text()', namespaces=namespaces), 0, default=None)
if thumbnail is not None:
thumbnail = image_url_cleanup(thumbnail)
for newsitem in dom.xpath('//div[contains(@class, "newsitem")]'):
url = newsitem.xpath('./@url')[0]
title = ' '.join(newsitem.xpath('.//div[@class="caption"]//a[@class="title"]/text()')).strip()
content = ' '.join(newsitem.xpath('.//div[@class="snippet"]/text()')).strip()
thumbnail = None
author = newsitem.xpath('./@data-author')[0]
metadata = ' '.join(newsitem.xpath('.//div[@class="source"]/span/text()')).strip()
img_src = newsitem.xpath('.//a[@class="imagelink"]//img/@src')
if img_src:
thumbnail = 'https://www.bing.com/' + img_src[0]
# append result
if thumbnail is not None:
results.append(
{'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content, 'img_src': thumbnail}
{
'url': url,
'title': title,
'content': content,
'img_src': thumbnail,
'author': author,
'metadata': metadata,
}
)
else:
results.append({'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content})
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-News.
The :py:obj:`description <searx.engines.bing_news.bing_traits_url>` of the
first table says *"query parameter when calling the Video Search API."*
.. thats why I use the 4. table "News Category API markets" for the
``xpath_market_codes``.
"""
xpath_market_codes = '//table[4]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)

View File

@ -1,21 +1,30 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bing (Videos)
"""Bing-Videos: description see :py:obj:`searx.engines.bing`.
"""
# pylint: disable=invalid-name
from json import loads
from typing import TYPE_CHECKING
import uuid
import json
from urllib.parse import urlencode
from lxml import html
from searx.utils import match_language
from searx.engines.bing import language_aliases
from searx.engines.bing import ( # pylint: disable=unused-import
_fetch_supported_languages,
supported_languages_url,
from searx.enginelib.traits import EngineTraits
from searx.engines.bing import (
set_bing_cookies,
_fetch_traits,
)
from searx.engines.bing import send_accept_language_header # pylint: disable=unused-import
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://www.bing.com/videos',
@ -26,65 +35,76 @@ about = {
"results": 'HTML',
}
# engine dependent config
categories = ['videos', 'web']
paging = True
safesearch = True
time_range_support = True
send_accept_language_header = True
number_of_results = 28
base_url = 'https://www.bing.com/'
search_string = (
base_url = 'https://www.bing.com/videos/asyncv2'
"""Bing (Videos) async search URL."""
bing_traits_url = 'https://learn.microsoft.com/en-us/bing/search-apis/bing-video-search/reference/market-codes'
"""Bing (Video) search API description"""
time_map = {
# fmt: off
'videos/search'
'?{query}'
'&count={count}'
'&first={first}'
'&scope=video'
'&FORM=QBLH'
'day': 60 * 24,
'week': 60 * 24 * 7,
'month': 60 * 24 * 31,
'year': 60 * 24 * 365,
# fmt: on
)
time_range_string = '&qft=+filterui:videoage-lt{interval}'
time_range_dict = {'day': '1440', 'week': '10080', 'month': '43200', 'year': '525600'}
# safesearch definitions
safesearch_types = {2: 'STRICT', 1: 'DEMOTE', 0: 'OFF'}
}
# do search-request
def request(query, params):
offset = ((params['pageno'] - 1) * number_of_results) + 1
"""Assemble a Bing-Video request."""
search_path = search_string.format(query=urlencode({'q': query}), count=number_of_results, first=offset)
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
# safesearch cookie
params['cookies']['SRCHHPGUSR'] = 'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
SID = uuid.uuid1().hex.upper()
set_bing_cookies(params, engine_language, engine_region, SID)
# language cookie
language = match_language(params['language'], supported_languages, language_aliases).lower()
params['cookies']['_EDGE_S'] = 'mkt=' + language + '&F=1'
# build URL query
#
# example: https://www.bing.com/videos/asyncv2?q=foo&async=content&first=1&count=35
# query and paging
params['url'] = base_url + search_path
query_params = {
# fmt: off
'q': query,
'async' : 'content',
# to simplify the page count lets use the default of 35 images per page
'first' : (int(params.get('pageno', 1)) - 1) * 35 + 1,
'count' : 35,
# fmt: on
}
# time range
if params['time_range'] in time_range_dict:
params['url'] += time_range_string.format(interval=time_range_dict[params['time_range']])
#
# example: one week (10080 minutes) '&qft= filterui:videoage-lt10080' '&form=VRFLTR'
if params['time_range']:
query_params['form'] = 'VRFLTR'
query_params['qft'] = ' filterui:videoage-lt%s' % time_map[params['time_range']]
params['url'] = base_url + '?' + urlencode(query_params)
return params
# get response from search-request
def response(resp):
"""Get response from Bing-Video"""
results = []
dom = html.fromstring(resp.text)
for result in dom.xpath('//div[@class="dg_u"]/div[contains(@class, "mc_vtvc")]'):
metadata = loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0])
for result in dom.xpath('//div[@class="dg_u"]//div[contains(@id, "mc_vtvc_video")]'):
metadata = json.loads(result.xpath('.//div[@class="vrhdata"]/@vrhm')[0])
info = ' - '.join(result.xpath('.//div[@class="mc_vtvc_meta_block"]//span/text()')).strip()
content = '{0} - {1}'.format(metadata['du'], info)
thumbnail = '{0}th?id={1}'.format(base_url, metadata['thid'])
thumbnail = result.xpath('.//div[contains(@class, "mc_vtvc_th")]//img/@src')[0]
results.append(
{
'url': metadata['murl'],
@ -96,3 +116,13 @@ def response(resp):
)
return results
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages and regions from Bing-Videos."""
xpath_market_codes = '//table[1]/tbody/tr/td[3]'
# xpath_country_codes = '//table[2]/tbody/tr/td[2]'
xpath_language_codes = '//table[3]/tbody/tr/td[2]'
_fetch_traits(engine_traits, bing_traits_url, xpath_language_codes, xpath_market_codes)

View File

@ -1,17 +1,35 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Dailymotion (Videos)
# lint: pylint
"""
Dailymotion (Videos)
~~~~~~~~~~~~~~~~~~~~
.. _REST GET: https://developers.dailymotion.com/tools/
.. _Global API Parameters: https://developers.dailymotion.com/api/#global-parameters
.. _Video filters API: https://developers.dailymotion.com/api/#video-filters
.. _Fields selection: https://developers.dailymotion.com/api/#fields-selection
"""
from typing import Set
from typing import TYPE_CHECKING
from datetime import datetime, timedelta
from urllib.parse import urlencode
import time
import babel
from searx.exceptions import SearxEngineAPIException
from searx.network import raise_for_httperror
from searx import network
from searx.utils import html_to_text
from searx.locales import region_tag, language_tag
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
@ -37,11 +55,24 @@ time_delta_dict = {
}
safesearch = True
safesearch_params = {2: '&is_created_for_kids=true', 1: '&is_created_for_kids=true', 0: ''}
safesearch_params = {
2: {'is_created_for_kids': 'true'},
1: {'is_created_for_kids': 'true'},
0: {},
}
"""True if this video is "Created for Kids" / intends to target an audience
under the age of 16 (``is_created_for_kids`` in `Video filters API`_ )
"""
# search-url
# - https://developers.dailymotion.com/tools/
# - https://www.dailymotion.com/doc/api/obj-video.html
family_filter_map = {
2: 'true',
1: 'true',
0: 'false',
}
"""By default, the family filter is turned on. Setting this parameter to
``false`` will stop filtering-out explicit content from searches and global
contexts (``family_filter`` in `Global API Parameters`_ ).
"""
result_fields = [
'allow_embed',
@ -53,27 +84,21 @@ result_fields = [
'thumbnail_360_url',
'id',
]
search_url = (
'https://api.dailymotion.com/videos?'
'fields={fields}&password_protected={password_protected}&private={private}&sort={sort}&limit={limit}'
).format(
fields=','.join(result_fields),
password_protected='false',
private='false',
sort='relevance',
limit=number_of_results,
)
"""`Fields selection`_, by default, a few fields are returned. To request more
specific fields, the ``fields`` parameter is used with the list of fields
SearXNG needs in the response to build a video result list.
"""
search_url = 'https://api.dailymotion.com/videos?'
"""URL to retrieve a list of videos.
- `REST GET`_
- `Global API Parameters`_
- `Video filters API`_
"""
iframe_src = "https://www.dailymotion.com/embed/video/{video_id}"
# The request query filters by 'languages' & 'country', therefore instead of
# fetching only languages we need to fetch locales.
supported_languages_url = 'https://api.dailymotion.com/locales'
supported_languages_iso639: Set[str] = set()
def init(_engine_settings):
global supported_languages_iso639
supported_languages_iso639 = set([language.split('_')[0] for language in supported_languages])
"""URL template to embed video in SearXNG's result list."""
def request(query, params):
@ -81,34 +106,42 @@ def request(query, params):
if not query:
return False
language = params['language']
if language == 'all':
language = 'en-US'
locale = babel.Locale.parse(language, sep='-')
eng_region = traits.get_region(params['searxng_locale'], 'en_US')
eng_lang = traits.get_language(params['searxng_locale'], 'en')
language_iso639 = locale.language
if locale.language not in supported_languages_iso639:
language_iso639 = 'en'
query_args = {
args = {
'search': query,
'languages': language_iso639,
'family_filter': family_filter_map.get(params['safesearch'], 'false'),
'thumbnail_ratio': 'original', # original|widescreen|square
# https://developers.dailymotion.com/api/#video-filters
'languages': eng_lang,
'page': params['pageno'],
'password_protected': 'false',
'private': 'false',
'sort': 'relevance',
'limit': number_of_results,
'fields': ','.join(result_fields),
}
if locale.territory:
localization = locale.language + '_' + locale.territory
if localization in supported_languages:
query_args['country'] = locale.territory
args.update(safesearch_params.get(params['safesearch'], {}))
# Don't add localization and country arguments if the user does select a
# language (:de, :en, ..)
if len(params['searxng_locale'].split('-')) > 1:
# https://developers.dailymotion.com/api/#global-parameters
args['localization'] = eng_region
args['country'] = eng_region.split('_')[1]
# Insufficient rights for the `ams_country' parameter of route `GET /videos'
# 'ams_country': eng_region.split('_')[1],
time_delta = time_delta_dict.get(params["time_range"])
if time_delta:
created_after = datetime.now() - time_delta
query_args['created_after'] = datetime.timestamp(created_after)
args['created_after'] = datetime.timestamp(created_after)
query_str = urlencode(query_args)
params['url'] = search_url + '&' + query_str + safesearch_params.get(params['safesearch'], '')
params['raise_for_httperror'] = False
query_str = urlencode(args)
params['url'] = search_url + query_str
return params
@ -123,7 +156,7 @@ def response(resp):
if 'error' in search_res:
raise SearxEngineAPIException(search_res['error'].get('message'))
raise_for_httperror(resp)
network.raise_for_httperror(resp)
# parse results
for res in search_res.get('list', []):
@ -167,7 +200,53 @@ def response(resp):
return results
# get supported languages from their site
def _fetch_supported_languages(resp):
response_json = resp.json()
return [item['locale'] for item in response_json['list']]
def fetch_traits(engine_traits: EngineTraits):
"""Fetch locales & languages from dailymotion.
Locales fetched from `api/locales <https://api.dailymotion.com/locales>`_.
There are duplications in the locale codes returned from Dailymotion which
can be ignored::
en_EN --> en_GB, en_US
ar_AA --> ar_EG, ar_AE, ar_SA
The language list `api/languages <https://api.dailymotion.com/languages>`_
contains over 7000 *languages* codes (see PR1071_). We use only those
language codes that are used in the locales.
.. _PR1071: https://github.com/searxng/searxng/pull/1071
"""
resp = network.get('https://api.dailymotion.com/locales')
if not resp.ok:
print("ERROR: response from dailymotion/locales is not OK.")
for item in resp.json()['list']:
eng_tag = item['locale']
if eng_tag in ('en_EN', 'ar_AA'):
continue
try:
sxng_tag = region_tag(babel.Locale.parse(eng_tag))
except babel.UnknownLocaleError:
print("ERROR: item unknown --> %s" % item)
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag
locale_lang_list = [x.split('_')[0] for x in engine_traits.regions.values()]
resp = network.get('https://api.dailymotion.com/languages')
if not resp.ok:
print("ERROR: response from dailymotion/languages is not OK.")
for item in resp.json()['list']:
eng_tag = item['code']
if eng_tag in locale_lang_list:
sxng_tag = language_tag(babel.Locale.parse(eng_tag))
engine_traits.languages[sxng_tag] = eng_tag

View File

@ -63,7 +63,7 @@ def search(query, request_params):
for row in result_list:
entry = {
'query': query,
'language': request_params['language'],
'language': request_params['searxng_locale'],
'value': row.get("value"),
# choose a result template or comment out to use the *default*
'template': 'key-value.html',

View File

@ -1,71 +1,207 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""DuckDuckGo Lite
"""
DuckDuckGo Lite
~~~~~~~~~~~~~~~
"""
from json import loads
from lxml.html import fromstring
from typing import TYPE_CHECKING
from urllib.parse import urlencode
import json
import babel
import lxml.html
from searx import (
network,
locales,
redislib,
)
from searx import redisdb
from searx.utils import (
dict_subset,
eval_xpath,
eval_xpath_getindex,
extract_text,
match_language,
)
from searx.network import get
from searx.enginelib.traits import EngineTraits
from searx.exceptions import SearxEngineAPIException
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://lite.duckduckgo.com/lite/',
"wikidata_id": 'Q12805',
"official_api_documentation": 'https://duckduckgo.com/api',
"use_official_api": False,
"require_api_key": False,
"results": 'HTML',
}
send_accept_language_header = True
"""DuckDuckGo-Lite tries to guess user's prefered language from the HTTP
``Accept-Language``. Optional the user can select a region filter (but not a
language).
"""
# engine dependent config
categories = ['general', 'web']
paging = True
supported_languages_url = 'https://duckduckgo.com/util/u588.js'
time_range_support = True
send_accept_language_header = True
safesearch = True # user can't select but the results are filtered
language_aliases = {
'ar-SA': 'ar-XA',
'es-419': 'es-XL',
'ja': 'jp-JP',
'ko': 'kr-KR',
'sl-SI': 'sl-SL',
'zh-TW': 'tzh-TW',
'zh-HK': 'tzh-HK',
}
url = 'https://lite.duckduckgo.com/lite/'
# url_ping = 'https://duckduckgo.com/t/sl_l'
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}
# search-url
url = 'https://lite.duckduckgo.com/lite/'
url_ping = 'https://duckduckgo.com/t/sl_l'
# match query's language to a region code that duckduckgo will accept
def get_region_code(lang, lang_list=None):
if lang == 'all':
return None
def cache_vqd(query, value):
"""Caches a ``vqd`` value from a query.
lang_code = match_language(lang, lang_list or [], language_aliases, 'wt-WT')
lang_parts = lang_code.split('-')
The vqd value depends on the query string and is needed for the follow up
pages or the images loaded by a XMLHttpRequest:
# country code goes first
return lang_parts[1].lower() + '-' + lang_parts[0].lower()
- DuckDuckGo Web: `https://links.duckduckgo.com/d.js?q=...&vqd=...`
- DuckDuckGo Images: `https://duckduckgo.com/i.js??q=...&vqd=...`
"""
c = redisdb.client()
if c:
logger.debug("cache vqd value: %s", value)
key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
c.set(key, value, ex=600)
def get_vqd(query, headers):
"""Returns the ``vqd`` that fits to the *query*. If there is no ``vqd`` cached
(:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the
response.
"""
value = None
c = redisdb.client()
if c:
key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
value = c.get(key)
if value:
value = value.decode('utf-8')
logger.debug("re-use cached vqd value: %s", value)
return value
query_url = 'https://duckduckgo.com/?{query}&iar=images'.format(query=urlencode({'q': query}))
res = network.get(query_url, headers=headers)
content = res.text
if content.find('vqd=\'') == -1:
raise SearxEngineAPIException('Request failed')
value = content[content.find('vqd=\'') + 5 :]
value = value[: value.find('\'')]
logger.debug("new vqd value: %s", value)
cache_vqd(query, value)
return value
def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
"""Get DuckDuckGo's language identifier from SearXNG's locale.
DuckDuckGo defines its lanaguages by region codes (see
:py:obj:`fetch_traits`).
To get region and language of a DDG service use:
.. code: python
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
It might confuse, but the ``l`` value of the cookie is what SearXNG calls
the *region*:
.. code:: python
# !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
params['cookies']['ad'] = eng_lang
params['cookies']['ah'] = eng_region
params['cookies']['l'] = eng_region
.. hint::
`DDG-lite <https://lite.duckduckgo.com/lite>`__ does not offer a language
selection to the user, only a region can be selected by the user
(``eng_region`` from the example above). DDG-lite stores the selected
region in a cookie::
params['cookies']['kl'] = eng_region # 'ar-es'
"""
return eng_traits.custom['lang_region'].get(sxng_locale, eng_traits.get_language(sxng_locale, default))
ddg_reg_map = {
'tw-tzh': 'zh_TW',
'hk-tzh': 'zh_HK',
'ct-ca': 'skip', # ct-ca and es-ca both map to ca_ES
'es-ca': 'ca_ES',
'id-en': 'id_ID',
'no-no': 'nb_NO',
'jp-jp': 'ja_JP',
'kr-kr': 'ko_KR',
'xa-ar': 'ar_SA',
'sl-sl': 'sl_SI',
'th-en': 'th_TH',
'vn-en': 'vi_VN',
}
ddg_lang_map = {
# use ar --> ar_EG (Egypt's arabic)
"ar_DZ": 'lang_region',
"ar_JO": 'lang_region',
"ar_SA": 'lang_region',
# use bn --> bn_BD
'bn_IN': 'lang_region',
# use de --> de_DE
'de_CH': 'lang_region',
# use en --> en_US,
'en_AU': 'lang_region',
'en_CA': 'lang_region',
'en_GB': 'lang_region',
# Esperanto
'eo_XX': 'eo',
# use es --> es_ES,
'es_AR': 'lang_region',
'es_CL': 'lang_region',
'es_CO': 'lang_region',
'es_CR': 'lang_region',
'es_EC': 'lang_region',
'es_MX': 'lang_region',
'es_PE': 'lang_region',
'es_UY': 'lang_region',
'es_VE': 'lang_region',
# use fr --> rf_FR
'fr_CA': 'lang_region',
'fr_CH': 'lang_region',
'fr_BE': 'lang_region',
# use nl --> nl_NL
'nl_BE': 'lang_region',
# use pt --> pt_PT
'pt_BR': 'lang_region',
# skip these languages
'od_IN': 'skip',
'io_XX': 'skip',
'tokipona_XX': 'skip',
}
def request(query, params):
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
# eng_lang = get_ddg_lang(traits, params['searxng_locale'])
params['url'] = url
params['method'] = 'POST'
params['data']['q'] = query
# The API is not documented, so we do some reverse engineering and emulate
@ -88,23 +224,19 @@ def request(query, params):
params['data']['s'] = offset
params['data']['dc'] = offset + 1
# request needs a vqd argument
params['data']['vqd'] = get_vqd(query, params["headers"])
# initial page does not have additional data in the input form
if params['pageno'] > 1:
# request the second page (and more pages) needs 'o' and 'api' arguments
params['data']['o'] = 'json'
params['data']['api'] = 'd.js'
# initial page does not have additional data in the input form
if params['pageno'] > 2:
# request the third page (and more pages) some more arguments
params['data']['nextParams'] = ''
params['data']['v'] = ''
params['data']['vqd'] = ''
params['data']['o'] = form_data.get('o', 'json')
params['data']['api'] = form_data.get('api', 'd.js')
params['data']['nextParams'] = form_data.get('nextParams', '')
params['data']['v'] = form_data.get('v', 'l')
region_code = get_region_code(params['language'], supported_languages)
if region_code:
params['data']['kl'] = region_code
params['cookies']['kl'] = region_code
params['data']['kl'] = eng_region
params['cookies']['kl'] = eng_region
params['data']['df'] = ''
if params['time_range'] in time_range_dict:
@ -116,26 +248,40 @@ def request(query, params):
return params
# get response from search-request
def response(resp):
headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie'])
get(url_ping, headers=headers_ping)
if resp.status_code == 303:
return []
results = []
doc = fromstring(resp.text)
doc = lxml.html.fromstring(resp.text)
result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
if not len(result_table) >= 3:
if len(result_table) == 2:
# some locales (at least China) does not have a "next page" button and
# the layout of the HTML tables is different.
result_table = result_table[1]
elif not len(result_table) >= 3:
# no more results
return []
else:
result_table = result_table[2]
# update form data from response
form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..')
if len(form):
form = form[0]
form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0]
form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0]
form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0]
logger.debug('form_data: %s', form_data)
value = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
query = resp.search_params['data']['q']
cache_vqd(query, value)
tr_rows = eval_xpath(result_table, './/tr')
# In the last <tr> is the form of the 'previous/next page' links
tr_rows = tr_rows[:-1]
@ -172,15 +318,105 @@ def response(resp):
return results
# get supported languages from their site
def _fetch_supported_languages(resp):
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages & regions from DuckDuckGo.
# response is a js file with regions as an embedded object
response_page = resp.text
response_page = response_page[response_page.find('regions:{') + 8 :]
response_page = response_page[: response_page.find('}') + 1]
SearXNG's ``all`` locale maps DuckDuckGo's "Alle regions" (``wt-wt``).
DuckDuckGo's language "Browsers prefered language" (``wt_WT``) makes no
sense in a SearXNG request since SearXNG's ``all`` will not add a
``Accept-Language`` HTTP header. The value in ``engine_traits.all_locale``
is ``wt-wt`` (the region).
regions_json = loads(response_page)
supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
Beside regions DuckDuckGo also defines its lanaguages by region codes. By
example these are the english languages in DuckDuckGo:
return list(supported_languages)
- en_US
- en_AU
- en_CA
- en_GB
The function :py:obj:`get_ddg_lang` evaluates DuckDuckGo's language from
SearXNG's locale.
"""
# pylint: disable=too-many-branches, too-many-statements
# fetch regions
engine_traits.all_locale = 'wt-wt'
# updated from u588 to u661 / should be updated automatically?
resp = network.get('https://duckduckgo.com/util/u661.js')
if not resp.ok:
print("ERROR: response from DuckDuckGo is not OK.")
pos = resp.text.find('regions:{') + 8
js_code = resp.text[pos:]
pos = js_code.find('}') + 1
regions = json.loads(js_code[:pos])
for eng_tag, name in regions.items():
if eng_tag == 'wt-wt':
engine_traits.all_locale = 'wt-wt'
continue
region = ddg_reg_map.get(eng_tag)
if region == 'skip':
continue
if not region:
eng_territory, eng_lang = eng_tag.split('-')
region = eng_lang + '_' + eng_territory.upper()
try:
sxng_tag = locales.region_tag(babel.Locale.parse(region))
except babel.UnknownLocaleError:
print("ERROR: %s (%s) -> %s is unknown by babel" % (name, eng_tag, region))
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag
# fetch languages
engine_traits.custom['lang_region'] = {}
pos = resp.text.find('languages:{') + 10
js_code = resp.text[pos:]
pos = js_code.find('}') + 1
js_code = '{"' + js_code[1:pos].replace(':', '":').replace(',', ',"')
languages = json.loads(js_code)
for eng_lang, name in languages.items():
if eng_lang == 'wt_WT':
continue
babel_tag = ddg_lang_map.get(eng_lang, eng_lang)
if babel_tag == 'skip':
continue
try:
if babel_tag == 'lang_region':
sxng_tag = locales.region_tag(babel.Locale.parse(eng_lang))
engine_traits.custom['lang_region'][sxng_tag] = eng_lang
continue
sxng_tag = locales.language_tag(babel.Locale.parse(babel_tag))
except babel.UnknownLocaleError:
print("ERROR: language %s (%s) is unknown by babel" % (name, eng_lang))
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
continue
engine_traits.languages[sxng_tag] = eng_lang

View File

@ -1,22 +1,33 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""DuckDuckGo (Instant Answer API)
"""
DuckDuckGo Instant Answer API
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The `DDG-API <https://duckduckgo.com/api>`__ is no longer documented but from
reverse engineering we can see that some services (e.g. instant answers) still
in use from the DDG search engine.
As far we can say the *instant answers* API does not support languages, or at
least we could not find out how language support should work. It seems that
most of the features are based on English terms.
"""
import json
from typing import TYPE_CHECKING
from urllib.parse import urlencode, urlparse, urljoin
from lxml import html
from searx.data import WIKIDATA_UNITS
from searx.engines.duckduckgo import language_aliases
from searx.engines.duckduckgo import ( # pylint: disable=unused-import
_fetch_supported_languages,
supported_languages_url,
)
from searx.utils import extract_text, html_to_text, match_language, get_string_replaces_function
from searx.utils import extract_text, html_to_text, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
if TYPE_CHECKING:
import logging
logger: logging.Logger
# about
about = {
"website": 'https://duckduckgo.com/',
@ -37,7 +48,7 @@ replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
def is_broken_text(text):
"""duckduckgo may return something like "<a href="xxxx">http://somewhere Related website<a/>"
"""duckduckgo may return something like ``<a href="xxxx">http://somewhere Related website<a/>``
The href URL is broken, the "Related website" may contains some HTML.
@ -62,8 +73,6 @@ def result_to_text(text, htmlResult):
def request(query, params):
params['url'] = URL.format(query=urlencode({'q': query}))
language = match_language(params['language'], supported_languages, language_aliases)
language = language.split('-')[0]
return params
@ -71,7 +80,7 @@ def response(resp):
# pylint: disable=too-many-locals, too-many-branches, too-many-statements
results = []
search_res = json.loads(resp.text)
search_res = resp.json()
# search_res.get('Entity') possible values (not exhaustive) :
# * continent / country / department / location / waterfall
@ -235,7 +244,7 @@ def unit_to_str(unit):
def area_to_str(area):
"""parse {'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}"""
"""parse ``{'unit': 'https://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}``"""
unit = unit_to_str(area.get('unit'))
if unit is not None:
try:

View File

@ -1,26 +1,30 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
DuckDuckGo (Images)
DuckDuckGo Images
~~~~~~~~~~~~~~~~~
"""
from json import loads
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
from searx.engines.duckduckgo import get_region_code
from searx.engines.duckduckgo import ( # pylint: disable=unused-import
_fetch_supported_languages,
supported_languages_url,
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import (
get_ddg_lang,
get_vqd,
)
from searx.network import get
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
"official_api_documentation": {
'url': 'https://duckduckgo.com/api',
'comment': 'but images are not supported',
},
"use_official_api": False,
"require_api_key": False,
"results": 'JSON (site requires js to get images)',
@ -32,70 +36,64 @@ paging = True
safesearch = True
send_accept_language_header = True
# search-url
images_url = 'https://duckduckgo.com/i.js?{query}&s={offset}&p={safesearch}&o=json&vqd={vqd}'
site_url = 'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images'
safesearch_cookies = {0: '-2', 1: None, 2: '1'}
safesearch_args = {0: '1', 1: None, 2: '1'}
# run query in site to get vqd number needed for requesting images
# TODO: find a way to get this number without an extra request (is it a hash of the query?)
def get_vqd(query, headers):
query_url = site_url.format(query=urlencode({'q': query}))
res = get(query_url, headers=headers)
content = res.text
if content.find('vqd=\'') == -1:
raise SearxEngineAPIException('Request failed')
vqd = content[content.find('vqd=\'') + 5 :]
vqd = vqd[: vqd.find('\'')]
return vqd
# do search-request
def request(query, params):
# to avoid running actual external requests when testing
if 'is_test' not in params:
vqd = get_vqd(query, params['headers'])
else:
vqd = '12345'
offset = (params['pageno'] - 1) * 50
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
safesearch = params['safesearch'] - 1
args = {
'q': query,
'o': 'json',
# 'u': 'bing',
'l': eng_region,
'vqd': get_vqd(query, params["headers"]),
}
region_code = get_region_code(params['language'], lang_list=supported_languages)
if region_code:
params['url'] = images_url.format(
query=urlencode({'q': query, 'l': region_code}), offset=offset, safesearch=safesearch, vqd=vqd
)
else:
params['url'] = images_url.format(query=urlencode({'q': query}), offset=offset, safesearch=safesearch, vqd=vqd)
if params['pageno'] > 1:
args['s'] = (params['pageno'] - 1) * 100
params['cookies']['ad'] = eng_lang # zh_CN
params['cookies']['ah'] = eng_region # "us-en,de-de"
params['cookies']['l'] = eng_region # "hk-tzh"
logger.debug("cookies: %s", params['cookies'])
safe_search = safesearch_cookies.get(params['safesearch'])
if safe_search is not None:
params['cookies']['p'] = safe_search # "-2", "1"
safe_search = safesearch_args.get(params['safesearch'])
if safe_search is not None:
args['p'] = safe_search # "-1", "1"
args = urlencode(args)
params['url'] = 'https://duckduckgo.com/i.js?{args}&f={f}'.format(args=args, f=',,,,,')
params['headers']['Accept'] = 'application/json, text/javascript, */*; q=0.01'
params['headers']['Referer'] = 'https://duckduckgo.com/'
params['headers']['X-Requested-With'] = 'XMLHttpRequest'
logger.debug("headers: %s", params['headers'])
return params
# get response from search-request
def response(resp):
results = []
res_json = resp.json()
content = resp.text
res_json = loads(content)
# parse results
for result in res_json['results']:
title = result['title']
url = result['url']
thumbnail = result['thumbnail']
image = result['image']
# append result
results.append(
{
'template': 'images.html',
'title': title,
'title': result['title'],
'content': '',
'thumbnail_src': thumbnail,
'img_src': image,
'url': url,
'thumbnail_src': result['thumbnail'],
'img_src': result['image'],
'url': result['url'],
'img_format': '%s x %s' % (result['width'], result['height']),
'source': result['source'],
}
)

View File

@ -1,13 +1,29 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""DuckDuckGo Weather"""
"""
DuckDuckGo Weather
~~~~~~~~~~~~~~~~~~
"""
from typing import TYPE_CHECKING
from json import loads
from urllib.parse import quote
from datetime import datetime
from flask_babel import gettext
from searx.engines.duckduckgo import fetch_traits # pylint: disable=unused-import
from searx.engines.duckduckgo import get_ddg_lang
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
"website": 'https://duckduckgo.com/',
"wikidata_id": 'Q12805',
@ -17,9 +33,11 @@ about = {
"results": "JSON",
}
categories = ["others"]
send_accept_language_header = True
url = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
# engine dependent config
categories = ["others"]
URL = "https://duckduckgo.com/js/spice/forecast/{query}/{lang}"
def generate_condition_table(condition):
@ -72,8 +90,17 @@ def generate_day_table(day):
def request(query, params):
params["url"] = url.format(query=quote(query), lang=params['language'].split('-')[0])
eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
eng_lang = get_ddg_lang(traits, params['searxng_locale'])
# !ddw paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
params['cookies']['ad'] = eng_lang
params['cookies']['ah'] = eng_region
params['cookies']['l'] = eng_region
logger.debug("cookies: %s", params['cookies'])
params["url"] = URL.format(query=quote(query), lang=eng_lang.split('_')[0])
return params

View File

@ -25,6 +25,7 @@ base_url = 'https://wiki.gentoo.org'
# xpath queries
xpath_results = '//ul[@class="mw-search-results"]/li'
xpath_link = './/div[@class="mw-search-result-heading"]/a'
xpath_content = './/div[@class="searchresult"]'
# cut 'en' from 'en-US', 'de' from 'de-CH', and so on
@ -77,8 +78,6 @@ main_langs = {
'uk': 'Українська',
'zh': '简体中文',
}
supported_languages = dict(lang_urls, **main_langs)
# do search-request
def request(query, params):
@ -118,7 +117,8 @@ def response(resp):
link = result.xpath(xpath_link)[0]
href = urljoin(base_url, link.attrib.get('href'))
title = extract_text(link)
content = extract_text(result.xpath(xpath_content))
results.append({'url': href, 'title': title})
results.append({'url': href, 'title': title, 'content': content})
return results

View File

@ -1,34 +1,39 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the google WEB engine. Some of this
implementations are shared by other engines:
"""This is the implementation of the Google WEB engine. Some of this
implementations (manly the :py:obj:`get_google_info`) are shared by other
engines:
- :ref:`google images engine`
- :ref:`google news engine`
- :ref:`google videos engine`
The google WEB engine itself has a special setup option:
.. code:: yaml
- name: google
...
use_mobile_ui: false
``use_mobile_ui``: (default: ``false``)
Enables to use *mobile endpoint* to bypass the google blocking (see
:issue:`159`). On the mobile UI of Google Search, the button :guilabel:`More
results` is not affected by Google rate limiting and we can still do requests
while actively blocked by the original Google search. By activate
``use_mobile_ui`` this behavior is simulated by adding the parameter
``async=use_ac:true,_fmt:pc`` to the :py:func:`request`.
- :ref:`google scholar engine`
- :ref:`google autocomplete`
"""
from typing import TYPE_CHECKING
import re
from urllib.parse import urlencode
from lxml import html
from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
import babel
import babel.core
import babel.languages
from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.locales import language_tag, region_tag, get_offical_locales
from searx import network
from searx.exceptions import SearxEngineCaptchaException
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
@ -45,64 +50,6 @@ categories = ['general', 'web']
paging = True
time_range_support = True
safesearch = True
send_accept_language_header = True
use_mobile_ui = False
supported_languages_url = 'https://www.google.com/preferences?#languages'
# based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
google_domains = {
'BG': 'google.bg', # Bulgaria
'CZ': 'google.cz', # Czech Republic
'DE': 'google.de', # Germany
'DK': 'google.dk', # Denmark
'AT': 'google.at', # Austria
'CH': 'google.ch', # Switzerland
'GR': 'google.gr', # Greece
'AU': 'google.com.au', # Australia
'CA': 'google.ca', # Canada
'GB': 'google.co.uk', # United Kingdom
'ID': 'google.co.id', # Indonesia
'IE': 'google.ie', # Ireland
'IN': 'google.co.in', # India
'MY': 'google.com.my', # Malaysia
'NZ': 'google.co.nz', # New Zealand
'PH': 'google.com.ph', # Philippines
'SG': 'google.com.sg', # Singapore
'US': 'google.com', # United States (google.us) redirects to .com
'ZA': 'google.co.za', # South Africa
'AR': 'google.com.ar', # Argentina
'CL': 'google.cl', # Chile
'ES': 'google.es', # Spain
'MX': 'google.com.mx', # Mexico
'EE': 'google.ee', # Estonia
'FI': 'google.fi', # Finland
'BE': 'google.be', # Belgium
'FR': 'google.fr', # France
'IL': 'google.co.il', # Israel
'HR': 'google.hr', # Croatia
'HU': 'google.hu', # Hungary
'IT': 'google.it', # Italy
'JP': 'google.co.jp', # Japan
'KR': 'google.co.kr', # South Korea
'LT': 'google.lt', # Lithuania
'LV': 'google.lv', # Latvia
'NO': 'google.no', # Norway
'NL': 'google.nl', # Netherlands
'PL': 'google.pl', # Poland
'BR': 'google.com.br', # Brazil
'PT': 'google.pt', # Portugal
'RO': 'google.ro', # Romania
'RU': 'google.ru', # Russia
'SK': 'google.sk', # Slovakia
'SI': 'google.si', # Slovenia
'SE': 'google.se', # Sweden
'TH': 'google.co.th', # Thailand
'TR': 'google.com.tr', # Turkey
'UA': 'google.com.ua', # Ukraine
'CN': 'google.com.hk', # There is no google.cn, we use .com.hk for zh-CN
'HK': 'google.com.hk', # Hong Kong
'TW': 'google.com.tw', # Taiwan
}
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
@ -112,50 +59,50 @@ filter_mapping = {0: 'off', 1: 'medium', 2: 'high'}
# specific xpath variables
# ------------------------
results_xpath = './/div[@data-sokoban-container]'
results_xpath = './/div[contains(@jscontroller, "SC7lYd")]'
title_xpath = './/a/h3[1]'
href_xpath = './/a[h3]/@href'
content_xpath = './/div[@data-content-feature=1]'
# google *sections* are no usual *results*, we ignore them
g_section_with_header = './g-section-with-header'
content_xpath = './/div[@data-sncf]'
# Suggestions are links placed in a *card-section*, we extract only the text
# from the links not the links itself.
suggestion_xpath = '//div[contains(@class, "EIaa9b")]//a'
# UI_ASYNC = 'use_ac:true,_fmt:html' # returns a HTTP 500 when user search for
# # celebrities like '!google natasha allegri'
# # or '!google chris evans'
UI_ASYNC = 'use_ac:true,_fmt:prog'
"""Format of the response from UI's async request."""
def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
"""Composing various language properties for the google engines.
def get_google_info(params, eng_traits):
"""Composing various (language) properties for the google engines (:ref:`google
API`).
This function is called by the various google engines (:ref:`google web
engine`, :ref:`google images engine`, :ref:`google news engine` and
:ref:`google videos engine`).
:param dict param: request parameters of the engine
:param dict param: Request parameters of the engine. At least
a ``searxng_locale`` key should be in the dictionary.
:param list lang_list: list of supported languages of the engine
:py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>`
:param dict lang_list: custom aliases for non standard language codes
(used when calling :py:func:`searx.utils.match_language`)
:param bool supported_any_language: When a language is not specified, the
language interpretation is left up to Google to decide how the search
results should be delivered. This argument is ``True`` for the google
engine and ``False`` for the other engines (google-images, -news,
-scholar, -videos).
:param eng_traits: Engine's traits fetched from google preferences
(:py:obj:`searx.enginelib.traits.EngineTraits`)
:rtype: dict
:returns:
Py-Dictionary with the key/value pairs:
language:
Return value from :py:func:`searx.utils.match_language`
The language code that is used by google (e.g. ``lang_en`` or
``lang_zh-TW``)
country:
The country code (e.g. US, AT, CA, FR, DE ..)
The country code that is used by google (e.g. ``US`` or ``TW``)
locale:
A instance of :py:obj:`babel.core.Locale` build from the
``searxng_locale`` value.
subdomain:
Google subdomain :py:obj:`google_domains` that fits to the country
@ -165,52 +112,67 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
Py-Dictionary with additional request arguments (can be passed to
:py:func:`urllib.parse.urlencode`).
- ``hl`` parameter: specifies the interface language of user interface.
- ``lr`` parameter: restricts search results to documents written in
a particular language.
- ``cr`` parameter: restricts search results to documents
originating in a particular country.
- ``ie`` parameter: sets the character encoding scheme that should
be used to interpret the query string ('utf8').
- ``oe`` parameter: sets the character encoding scheme that should
be used to decode the XML result ('utf8').
headers:
Py-Dictionary with additional HTTP headers (can be passed to
request's headers)
- ``Accept: '*/*``
"""
ret_val = {
'language': None,
'country': None,
'subdomain': None,
'params': {},
'headers': {},
'cookies': {},
'locale': None,
}
# language ...
sxng_locale = params.get('searxng_locale', 'all')
try:
locale = babel.Locale.parse(sxng_locale, sep='-')
except babel.core.UnknownLocaleError:
locale = None
_lang = params['language']
_any_language = _lang.lower() == 'all'
if _any_language:
_lang = 'en-US'
language = match_language(_lang, lang_list, custom_aliases)
ret_val['language'] = language
eng_lang = eng_traits.get_language(sxng_locale, 'lang_en')
lang_code = eng_lang.split('_')[-1] # lang_zh-TW --> zh-TW / lang_en --> en
country = eng_traits.get_region(sxng_locale, eng_traits.all_locale)
# country ...
# Test zh_hans & zh_hant --> in the topmost links in the result list of list
# TW and HK you should a find wiktionary.org zh_hant link. In the result
# list of zh-CN should not be no hant link instead you should find
# zh.m.wikipedia.org/zh somewhere in the top.
_l = _lang.split('-')
if len(_l) == 2:
country = _l[1]
else:
country = _l[0].upper()
if country == 'EN':
country = 'US'
# '!go 日 :zh-TW' --> https://zh.m.wiktionary.org/zh-hant/%E6%97%A5
# '!go 日 :zh-CN' --> https://zh.m.wikipedia.org/zh/%E6%97%A5
ret_val['language'] = eng_lang
ret_val['country'] = country
# subdomain ...
ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com')
# params & headers
lang_country = '%s-%s' % (language, country) # (en-US, en-EN, de-DE, de-AU, fr-FR ..)
ret_val['locale'] = locale
ret_val['subdomain'] = eng_traits.custom['supported_domains'].get(country.upper(), 'www.google.com')
# hl parameter:
# https://developers.google.com/custom-search/docs/xml_results#hlsp The
# Interface Language:
# The hl parameter specifies the interface language (host language) of
# your user interface. To improve the performance and the quality of your
# search results, you are strongly encouraged to set this parameter
# explicitly.
# https://developers.google.com/custom-search/docs/xml_results#hlsp
# The Interface Language:
# https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages
ret_val['params']['hl'] = lang_list.get(lang_country, language)
ret_val['params']['hl'] = lang_code
# lr parameter:
# The lr (language restrict) parameter restricts search results to
@ -218,22 +180,72 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
# https://developers.google.com/custom-search/docs/xml_results#lrsp
# Language Collection Values:
# https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections
if _any_language and supported_any_language:
# interpretation is left up to Google (based on whoogle)
#
# - add parameter ``source=lnt``
# - don't use parameter ``lr``
# - don't add a ``Accept-Language`` HTTP header.
# To select 'all' languages an empty 'lr' value is used.
#
# Different to other google services, Google Schloar supports to select more
# than one language. The languages are seperated by a pipe '|' (logical OR).
# By example: &lr=lang_zh-TW%7Clang_de selects articles written in
# traditional chinese OR german language.
ret_val['params']['source'] = 'lnt'
ret_val['params']['lr'] = eng_lang
if sxng_locale == 'all':
ret_val['params']['lr'] = ''
else:
# cr parameter:
# The cr parameter restricts search results to documents originating in a
# particular country.
# https://developers.google.com/custom-search/docs/xml_results#crsp
# restricts search results to documents written in a particular
# language.
ret_val['params']['lr'] = "lang_" + lang_list.get(lang_country, language)
ret_val['params']['cr'] = 'country' + country
if sxng_locale == 'all':
ret_val['params']['cr'] = ''
# gl parameter: (mandatory by Geeogle News)
# The gl parameter value is a two-letter country code. For WebSearch
# results, the gl parameter boosts search results whose country of origin
# matches the parameter value. See the Country Codes section for a list of
# valid values.
# Specifying a gl parameter value in WebSearch requests should improve the
# relevance of results. This is particularly true for international
# customers and, even more specifically, for customers in English-speaking
# countries other than the United States.
# https://developers.google.com/custom-search/docs/xml_results#glsp
ret_val['params']['gl'] = country
# ie parameter:
# The ie parameter sets the character encoding scheme that should be used
# to interpret the query string. The default ie value is latin1.
# https://developers.google.com/custom-search/docs/xml_results#iesp
ret_val['params']['ie'] = 'utf8'
# oe parameter:
# The oe parameter sets the character encoding scheme that should be used
# to decode the XML result. The default oe value is latin1.
# https://developers.google.com/custom-search/docs/xml_results#oesp
ret_val['params']['oe'] = 'utf8'
# num parameter:
# The num parameter identifies the number of search results to return.
# The default num value is 10, and the maximum value is 20. If you request
# more than 20 results, only 20 results will be returned.
# https://developers.google.com/custom-search/docs/xml_results#numsp
# HINT: seems to have no effect (tested in google WEB & Images)
# ret_val['params']['num'] = 20
# HTTP headers
ret_val['headers']['Accept'] = '*/*'
# Cookies
# - https://github.com/searxng/searxng/pull/1679#issuecomment-1235432746
# - https://github.com/searxng/searxng/issues/1555
ret_val['cookies']['CONSENT'] = "YES+"
return ret_val
@ -245,33 +257,34 @@ def detect_google_sorry(resp):
def request(query, params):
"""Google search request"""
# pylint: disable=line-too-long
offset = (params['pageno'] - 1) * 10
lang_info = get_lang_info(params, supported_languages, language_aliases, True)
additional_parameters = {}
if use_mobile_ui:
additional_parameters = {
'asearch': 'arc',
'async': 'use_ac:true,_fmt:prog',
}
google_info = get_google_info(params, traits)
# https://www.google.de/search?q=corona&hl=de&lr=lang_de&start=0&tbs=qdr%3Ad&safe=medium
query_url = (
'https://'
+ lang_info['subdomain']
+ google_info['subdomain']
+ '/search'
+ "?"
+ urlencode(
{
'q': query,
**lang_info['params'],
'ie': "utf8",
'oe': "utf8",
'start': offset,
**google_info['params'],
'filter': '0',
**additional_parameters,
'start': offset,
# 'vet': '12ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0QxK8CegQIARAC..i',
# 'ved': '2ahUKEwik3ZbIzfn7AhXMX_EDHbUDBh0Q_skCegQIARAG',
# 'cs' : 1,
# 'sa': 'N',
# 'yv': 3,
# 'prmd': 'vin',
# 'ei': 'GASaY6TxOcy_xc8PtYeY6AE',
# 'sa': 'N',
# 'sstk': 'AcOHfVkD7sWCSAheZi-0tx_09XDO55gTWY0JNq3_V26cNN-c8lfD45aZYPI8s_Bqp8s57AHz5pxchDtAGCA_cikAWSjy9kw3kgg'
# formally known as use_mobile_ui
'asearch': 'arc',
'async': UI_ASYNC,
}
)
)
@ -282,25 +295,38 @@ def request(query, params):
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
params['cookies']['CONSENT'] = "YES+"
params['headers'].update(lang_info['headers'])
if use_mobile_ui:
params['headers']['Accept'] = '*/*'
else:
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
# =26;[3,"dimg_ZNMiZPCqE4apxc8P3a2tuAQ_137"]a87;data:image/jpeg;base64,/9j/4AAQSkZJRgABA
# ...6T+9Nl4cnD+gr9OK8I56/tX3l86nWYw//2Q==26;
RE_DATA_IMAGE = re.compile(r'"(dimg_[^"]*)"[^;]*;(data:image[^;]*;[^;]*);')
def _parse_data_images(dom):
data_image_map = {}
for img_id, data_image in RE_DATA_IMAGE.findall(dom.text_content()):
end_pos = data_image.rfind('=')
if end_pos > 0:
data_image = data_image[: end_pos + 1]
data_image_map[img_id] = data_image
logger.debug('data:image objects --> %s', list(data_image_map.keys()))
return data_image_map
def response(resp):
"""Get response from google's search request"""
# pylint: disable=too-many-branches, too-many-statements
detect_google_sorry(resp)
results = []
# convert the text to dom
dom = html.fromstring(resp.text)
data_image_map = _parse_data_images(dom)
# results --> answer
answer_list = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]')
if answer_list:
@ -309,25 +335,9 @@ def response(resp):
else:
logger.debug("did not find 'answer'")
# results --> number_of_results
if not use_mobile_ui:
try:
_txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0)
_digit = ''.join([n for n in _txt if n.isdigit()])
number_of_results = int(_digit)
results.append({'number_of_results': number_of_results})
except Exception as e: # pylint: disable=broad-except
logger.debug("did not 'number_of_results'")
logger.error(e, exc_info=True)
# parse results
for result in eval_xpath_list(dom, results_xpath):
# google *sections*
if extract_text(eval_xpath(result, g_section_with_header)):
logger.debug("ignoring <g-section-with-header>")
continue
for result in eval_xpath_list(dom, results_xpath): # pylint: disable=too-many-nested-blocks
try:
title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None)
@ -336,16 +346,30 @@ def response(resp):
logger.debug('ignoring item from the result_xpath list: missing title')
continue
title = extract_text(title_tag)
url = eval_xpath_getindex(result, href_xpath, 0, None)
if url is None:
logger.debug('ignoring item from the result_xpath list: missing url of title "%s"', title)
continue
content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True)
if content is None:
content_nodes = eval_xpath(result, content_xpath)
content = extract_text(content_nodes)
if not content:
logger.debug('ignoring item from the result_xpath list: missing content of title "%s"', title)
continue
logger.debug('add link to results: %s', title)
results.append({'url': url, 'title': title, 'content': content})
img_src = content_nodes[0].xpath('.//img/@src')
if img_src:
img_src = img_src[0]
if img_src.startswith('data:image'):
img_id = content_nodes[0].xpath('.//img/@id')
if img_id:
img_src = data_image_map.get(img_id[0])
else:
img_src = None
results.append({'url': url, 'title': title, 'content': content, 'img_src': img_src})
except Exception as e: # pylint: disable=broad-except
logger.error(e, exc_info=True)
@ -361,15 +385,107 @@ def response(resp):
# get supported languages from their site
def _fetch_supported_languages(resp):
ret_val = {}
skip_countries = [
# official language of google-country not in google-languages
'AL', # Albanien (sq)
'AZ', # Aserbaidschan (az)
'BD', # Bangladesch (bn)
'BN', # Brunei Darussalam (ms)
'BT', # Bhutan (dz)
'ET', # Äthiopien (am)
'GE', # Georgien (ka, os)
'GL', # Grönland (kl)
'KH', # Kambodscha (km)
'LA', # Laos (lo)
'LK', # Sri Lanka (si, ta)
'ME', # Montenegro (sr)
'MK', # Nordmazedonien (mk, sq)
'MM', # Myanmar (my)
'MN', # Mongolei (mn)
'MV', # Malediven (dv) // dv_MV is unknown by babel
'MY', # Malaysia (ms)
'NP', # Nepal (ne)
'TJ', # Tadschikistan (tg)
'TM', # Turkmenistan (tk)
'UZ', # Usbekistan (uz)
]
def fetch_traits(engine_traits: EngineTraits, add_domains: bool = True):
"""Fetch languages from Google."""
# pylint: disable=import-outside-toplevel, too-many-branches
engine_traits.custom['supported_domains'] = {}
resp = network.get('https://www.google.com/preferences')
if not resp.ok:
raise RuntimeError("Response from Google's preferences is not OK.")
dom = html.fromstring(resp.text)
radio_buttons = eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]')
# supported language codes
for x in radio_buttons:
name = x.get("data-name")
code = x.get("value").split('_')[-1]
ret_val[code] = {"name": name}
lang_map = {'no': 'nb'}
for x in eval_xpath_list(dom, '//*[@id="langSec"]//input[@name="lr"]'):
return ret_val
eng_lang = x.get("value").split('_')[-1]
try:
locale = babel.Locale.parse(lang_map.get(eng_lang, eng_lang), sep='-')
except babel.UnknownLocaleError:
print("ERROR: %s -> %s is unknown by babel" % (x.get("data-name"), eng_lang))
continue
sxng_lang = language_tag(locale)
conflict = engine_traits.languages.get(sxng_lang)
if conflict:
if conflict != eng_lang:
print("CONFLICT: babel %s --> %s, %s" % (sxng_lang, conflict, eng_lang))
continue
engine_traits.languages[sxng_lang] = 'lang_' + eng_lang
# alias languages
engine_traits.languages['zh'] = 'lang_zh-CN'
# supported region codes
for x in eval_xpath_list(dom, '//*[@name="region"]/..//input[@name="region"]'):
eng_country = x.get("value")
if eng_country in skip_countries:
continue
if eng_country == 'ZZ':
engine_traits.all_locale = 'ZZ'
continue
sxng_locales = get_offical_locales(eng_country, engine_traits.languages.keys(), regional=True)
if not sxng_locales:
print("ERROR: can't map from google country %s (%s) to a babel region." % (x.get('data-name'), eng_country))
continue
for sxng_locale in sxng_locales:
engine_traits.regions[region_tag(sxng_locale)] = eng_country
# alias regions
engine_traits.regions['zh-CN'] = 'HK'
# supported domains
if add_domains:
resp = network.get('https://www.google.com/supported_domains')
if not resp.ok:
raise RuntimeError("Response from https://www.google.com/supported_domains is not OK.")
for domain in resp.text.split():
domain = domain.strip()
if not domain or domain in [
'.google.com',
]:
continue
region = domain.split('.')[-1].upper()
engine_traits.custom['supported_domains'][region] = 'www' + domain
if region == 'HK':
# There is no google.cn, we use .com.hk for zh-CN
engine_traits.custom['supported_domains']['CN'] = 'www' + domain

View File

@ -1,31 +1,38 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the google images engine using the google
internal API used the Google Go Android app.
"""This is the implementation of the Google Images engine using the internal
Google API used by the Google Go Android app.
This internal API offer results in
- JSON (_fmt:json)
- Protobuf (_fmt:pb)
- Protobuf compressed? (_fmt:pc)
- HTML (_fmt:html)
- Protobuf encoded in JSON (_fmt:jspb).
- JSON (``_fmt:json``)
- Protobuf_ (``_fmt:pb``)
- Protobuf_ compressed? (``_fmt:pc``)
- HTML (``_fmt:html``)
- Protobuf_ encoded in JSON (``_fmt:jspb``).
.. _Protobuf: https://en.wikipedia.org/wiki/Protocol_Buffers
"""
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from json import loads
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_lang_info,
get_google_info,
time_range_dict,
detect_google_sorry,
)
# pylint: disable=unused-import
from searx.engines.google import supported_languages_url, _fetch_supported_languages
if TYPE_CHECKING:
import logging
from searx.enginelib.traits import EngineTraits
logger: logging.Logger
traits: EngineTraits
# pylint: enable=unused-import
# about
about = {
@ -40,7 +47,6 @@ about = {
# engine dependent config
categories = ['images', 'web']
paging = True
use_locale_domain = True
time_range_support = True
safesearch = True
send_accept_language_header = True
@ -51,20 +57,18 @@ filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
def request(query, params):
"""Google-Image search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False)
google_info = get_google_info(params, traits)
query_url = (
'https://'
+ lang_info['subdomain']
+ google_info['subdomain']
+ '/search'
+ "?"
+ urlencode(
{
'q': query,
'tbm': "isch",
**lang_info['params'],
'ie': "utf8",
'oe': "utf8",
**google_info['params'],
'asearch': 'isch',
'async': '_fmt:json,p:1,ijn:' + str(params['pageno']),
}
@ -77,9 +81,8 @@ def request(query, params):
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
params['headers'].update(lang_info['headers'])
params['headers']['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip'
params['headers']['Accept'] = '*/*'
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
@ -111,7 +114,11 @@ def response(resp):
copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
if copyright_notice:
result_item['source'] += ' / ' + copyright_notice
result_item['source'] += ' | ' + copyright_notice
freshness_date = item["result"].get("freshness_date")
if freshness_date:
result_item['source'] += ' | ' + freshness_date
file_size = item.get('gsa', {}).get('file_size')
if file_size:

View File

@ -1,24 +1,40 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the google news engine. The google news API
ignores some parameters from the common :ref:`google API`:
"""This is the implementation of the Google News engine.
- num_ : the number of search results is ignored
Google News has a different region handling compared to Google WEB.
- the ``ceid`` argument has to be set (:py:obj:`ceid_list`)
- the hl_ argument has to be set correctly (and different to Google WEB)
- the gl_ argument is mandatory
If one of this argument is not set correctly, the request is redirected to
CONSENT dialog::
https://consent.google.com/m?continue=
The google news API ignores some parameters from the common :ref:`google API`:
- num_ : the number of search results is ignored / there is no paging all
results for a query term are in the first response.
- save_ : is ignored / Google-News results are always *SafeSearch*
.. _hl: https://developers.google.com/custom-search/docs/xml_results#hlsp
.. _gl: https://developers.google.com/custom-search/docs/xml_results#glsp
.. _num: https://developers.google.com/custom-search/docs/xml_results#numsp
.. _save: https://developers.google.com/custom-search/docs/xml_results#safesp
"""
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
import binascii
import re
from urllib.parse import urlencode
from base64 import b64decode
from lxml import html
import babel
from searx import locales
from searx.utils import (
eval_xpath,
eval_xpath_list,
@ -26,18 +42,19 @@ from searx.utils import (
extract_text,
)
# pylint: disable=unused-import
from searx.engines.google import fetch_traits as _fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
supported_languages_url,
_fetch_supported_languages,
)
# pylint: enable=unused-import
from searx.engines.google import (
get_lang_info,
get_google_info,
detect_google_sorry,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
@ -49,70 +66,77 @@ about = {
"results": 'HTML',
}
# compared to other google engines google-news has a different time range
# support. The time range is included in the search term.
time_range_dict = {
'day': 'when:1d',
'week': 'when:7d',
'month': 'when:1m',
'year': 'when:1y',
}
# engine dependent config
categories = ['news']
paging = False
use_locale_domain = True
time_range_support = True
time_range_support = False
# Google-News results are always *SafeSearch*. Option 'safesearch' is set to
# False here, otherwise checker will report safesearch-errors::
#
# safesearch : results are identitical for safesearch=0 and safesearch=2
safesearch = False
send_accept_language_header = True
safesearch = True
# send_accept_language_header = True
def request(query, params):
"""Google-News search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False)
sxng_locale = params.get('searxng_locale', 'en-US')
ceid = locales.get_engine_locale(sxng_locale, traits.custom['ceid'], default='US:en')
google_info = get_google_info(params, traits)
google_info['subdomain'] = 'news.google.com' # google news has only one domain
# google news has only one domain
lang_info['subdomain'] = 'news.google.com'
ceid_region, ceid_lang = ceid.split(':')
ceid_lang, ceid_suffix = (
ceid_lang.split('-')
+ [
None,
]
)[:2]
ceid = "%s:%s" % (lang_info['country'], lang_info['language'])
google_info['params']['hl'] = ceid_lang
# google news redirects en to en-US
if lang_info['params']['hl'] == 'en':
lang_info['params']['hl'] = 'en-US'
if ceid_suffix and ceid_suffix not in ['Hans', 'Hant']:
# Very special to google-news compared to other google engines, the time
# range is included in the search term.
if params['time_range']:
query += ' ' + time_range_dict[params['time_range']]
if ceid_region.lower() == ceid_lang:
google_info['params']['hl'] = ceid_lang + '-' + ceid_region
else:
google_info['params']['hl'] = ceid_lang + '-' + ceid_suffix
elif ceid_region.lower() != ceid_lang:
if ceid_region in ['AT', 'BE', 'CH', 'IL', 'SA', 'IN', 'BD', 'PT']:
google_info['params']['hl'] = ceid_lang
else:
google_info['params']['hl'] = ceid_lang + '-' + ceid_region
google_info['params']['lr'] = 'lang_' + ceid_lang.split('-')[0]
google_info['params']['gl'] = ceid_region
query_url = (
'https://'
+ lang_info['subdomain']
+ '/search'
+ "?"
+ urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'gl': lang_info['country']})
+ google_info['subdomain']
+ "/search?"
+ urlencode(
{
'q': query,
**google_info['params'],
}
)
# ceid includes a ':' character which must not be urlencoded
+ ('&ceid=%s' % ceid)
) # ceid includes a ':' character which must not be urlencoded
)
params['url'] = query_url
params['cookies']['CONSENT'] = "YES+"
params['headers'].update(lang_info['headers'])
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
def response(resp):
"""Get response from google's search request"""
results = []
detect_google_sorry(resp)
# convert the text to dom
@ -152,8 +176,8 @@ def response(resp):
# The pub_date is mostly a string like 'yesertday', not a real
# timezone date or time. Therefore we can't use publishedDate.
pub_date = extract_text(eval_xpath(result, './article/div[1]/div[1]/time'))
pub_origin = extract_text(eval_xpath(result, './article/div[1]/div[1]/a'))
pub_date = extract_text(eval_xpath(result, './article//time'))
pub_origin = extract_text(eval_xpath(result, './article//a[@data-n-tid]'))
content = ' / '.join([x for x in [pub_origin, pub_date] if x])
@ -174,3 +198,127 @@ def response(resp):
# return results
return results
ceid_list = [
'AE:ar',
'AR:es-419',
'AT:de',
'AU:en',
'BD:bn',
'BE:fr',
'BE:nl',
'BG:bg',
'BR:pt-419',
'BW:en',
'CA:en',
'CA:fr',
'CH:de',
'CH:fr',
'CL:es-419',
'CN:zh-Hans',
'CO:es-419',
'CU:es-419',
'CZ:cs',
'DE:de',
'EG:ar',
'ES:es',
'ET:en',
'FR:fr',
'GB:en',
'GH:en',
'GR:el',
'HK:zh-Hant',
'HU:hu',
'ID:en',
'ID:id',
'IE:en',
'IL:en',
'IL:he',
'IN:bn',
'IN:en',
'IN:hi',
'IN:ml',
'IN:mr',
'IN:ta',
'IN:te',
'IT:it',
'JP:ja',
'KE:en',
'KR:ko',
'LB:ar',
'LT:lt',
'LV:en',
'LV:lv',
'MA:fr',
'MX:es-419',
'MY:en',
'NA:en',
'NG:en',
'NL:nl',
'NO:no',
'NZ:en',
'PE:es-419',
'PH:en',
'PK:en',
'PL:pl',
'PT:pt-150',
'RO:ro',
'RS:sr',
'RU:ru',
'SA:ar',
'SE:sv',
'SG:en',
'SI:sl',
'SK:sk',
'SN:fr',
'TH:th',
'TR:tr',
'TW:zh-Hant',
'TZ:en',
'UA:ru',
'UA:uk',
'UG:en',
'US:en',
'US:es-419',
'VE:es-419',
'VN:vi',
'ZA:en',
'ZW:en',
]
"""List of region/language combinations supported by Google News. Values of the
``ceid`` argument of the Google News REST API."""
_skip_values = [
'ET:en', # english (ethiopia)
'ID:en', # english (indonesia)
'LV:en', # english (latvia)
]
_ceid_locale_map = {'NO:no': 'nb-NO'}
def fetch_traits(engine_traits: EngineTraits):
_fetch_traits(engine_traits, add_domains=False)
engine_traits.custom['ceid'] = {}
for ceid in ceid_list:
if ceid in _skip_values:
continue
region, lang = ceid.split(':')
x = lang.split('-')
if len(x) > 1:
if x[1] not in ['Hant', 'Hans']:
lang = x[0]
sxng_locale = _ceid_locale_map.get(ceid, lang + '-' + region)
try:
locale = babel.Locale.parse(sxng_locale, sep='-')
except babel.UnknownLocaleError:
print("ERROR: %s -> %s is unknown by babel" % (ceid, sxng_locale))
continue
engine_traits.custom['ceid'][locales.region_tag(locale)] = ceid

View File

@ -1,19 +1,18 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Google (Scholar)
"""This is the implementation of the Google Scholar engine.
For detailed description of the *REST-full* API see: `Query Parameter
Definitions`_.
.. _Query Parameter Definitions:
https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
Compared to other Google services the Scholar engine has a simple GET REST-API
and there does not exists `async` API. Even though the API slightly vintage we
can make use of the :ref:`google API` to assemble the arguments of the GET
request.
"""
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
from typing import Optional
from urllib.parse import urlencode
from datetime import datetime
from typing import Optional
from lxml import html
from searx.utils import (
@ -23,19 +22,21 @@ from searx.utils import (
extract_text,
)
from searx.exceptions import SearxEngineCaptchaException
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_lang_info,
get_google_info,
time_range_dict,
detect_google_sorry,
)
from searx.enginelib.traits import EngineTraits
# pylint: disable=unused-import
from searx.engines.google import (
supported_languages_url,
_fetch_supported_languages,
)
if TYPE_CHECKING:
import logging
# pylint: enable=unused-import
logger: logging.Logger
traits: EngineTraits
# about
about = {
@ -51,53 +52,62 @@ about = {
categories = ['science', 'scientific publications']
paging = True
language_support = True
use_locale_domain = True
time_range_support = True
safesearch = False
send_accept_language_header = True
def time_range_url(params):
"""Returns a URL query component for a google-Scholar time range based on
``params['time_range']``. Google-Scholar does only support ranges in years.
To have any effect, all the Searx ranges (*day*, *week*, *month*, *year*)
are mapped to *year*. If no range is set, an empty string is returned.
Example::
def time_range_args(params):
"""Returns a dictionary with a time range arguments based on
``params['time_range']``.
Google Scholar supports a detailed search by year. Searching by *last
month* or *last week* (as offered by SearXNG) is uncommon for scientific
publications and is not supported by Google Scholar.
To limit the result list when the users selects a range, all the SearXNG
ranges (*day*, *week*, *month*, *year*) are mapped to *year*. If no range
is set an empty dictionary of arguments is returned. Example; when
user selects a time range (current year minus one in 2022):
.. code:: python
{ 'as_ylo' : 2021 }
&as_ylo=2019
"""
# as_ylo=2016&as_yhi=2019
ret_val = ''
ret_val = {}
if params['time_range'] in time_range_dict:
ret_val = urlencode({'as_ylo': datetime.now().year - 1})
return '&' + ret_val
ret_val['as_ylo'] = datetime.now().year - 1
return ret_val
def detect_google_captcha(dom):
"""In case of CAPTCHA Google Scholar open its own *not a Robot* dialog and is
not redirected to ``sorry.google.com``.
"""
if eval_xpath(dom, "//form[@id='gs_captcha_f']"):
raise SearxEngineCaptchaException()
def request(query, params):
"""Google-Scholar search request"""
offset = (params['pageno'] - 1) * 10
lang_info = get_lang_info(params, supported_languages, language_aliases, False)
google_info = get_google_info(params, traits)
# subdomain is: scholar.google.xy
lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")
google_info['subdomain'] = google_info['subdomain'].replace("www.", "scholar.")
query_url = (
'https://'
+ lang_info['subdomain']
+ '/scholar'
+ "?"
+ urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'start': offset})
)
args = {
'q': query,
**google_info['params'],
'start': (params['pageno'] - 1) * 10,
'as_sdt': '2007', # include patents / to disable set '0,5'
'as_vis': '0', # include citations / to disable set '1'
}
args.update(time_range_args(params))
query_url += time_range_url(params)
params['url'] = query_url
params['cookies']['CONSENT'] = "YES+"
params['headers'].update(lang_info['headers'])
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
# params['google_subdomain'] = subdomain
params['url'] = 'https://' + google_info['subdomain'] + '/scholar?' + urlencode(args)
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
@ -138,19 +148,15 @@ def parse_gs_a(text: Optional[str]):
def response(resp): # pylint: disable=too-many-locals
"""Get response from google's search request"""
"""Parse response from Google Scholar"""
results = []
detect_google_sorry(resp)
# which subdomain ?
# subdomain = resp.search_params.get('google_subdomain')
# convert the text to dom
dom = html.fromstring(resp.text)
detect_google_captcha(dom)
# parse results
for result in eval_xpath_list(dom, '//div[@data-cid]'):
for result in eval_xpath_list(dom, '//div[@data-rp]'):
title = extract_text(eval_xpath(result, './/h3[1]//a'))
@ -158,7 +164,7 @@ def response(resp): # pylint: disable=too-many-locals
# this is a [ZITATION] block
continue
pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]'))
pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
if pub_type:
pub_type = pub_type[1:-1].lower()

View File

@ -1,6 +1,6 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""This is the implementation of the google videos engine.
"""This is the implementation of the Google Videos engine.
.. admonition:: Content-Security-Policy (CSP)
@ -14,9 +14,8 @@
"""
# pylint: disable=invalid-name
from typing import TYPE_CHECKING
import re
from urllib.parse import urlencode
from lxml import html
@ -27,20 +26,22 @@ from searx.utils import (
extract_text,
)
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
get_lang_info,
get_google_info,
time_range_dict,
filter_mapping,
g_section_with_header,
title_xpath,
suggestion_xpath,
detect_google_sorry,
)
from searx.enginelib.traits import EngineTraits
# pylint: disable=unused-import
from searx.engines.google import supported_languages_url, _fetch_supported_languages
if TYPE_CHECKING:
import logging
# pylint: enable=unused-import
logger: logging.Logger
traits: EngineTraits
# about
about = {
@ -55,70 +56,32 @@ about = {
# engine dependent config
categories = ['videos', 'web']
paging = False
paging = True
language_support = True
use_locale_domain = True
time_range_support = True
safesearch = True
send_accept_language_header = True
RE_CACHE = {}
def _re(regexpr):
"""returns compiled regular expression"""
RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr))
return RE_CACHE[regexpr]
def scrap_out_thumbs_src(dom):
ret_val = {}
thumb_name = 'dimg_'
for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'):
_script = script.text
# "dimg_35":"https://i.ytimg.c....",
_dimurl = _re("s='([^']*)").findall(_script)
for k, v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)').findall(_script):
v = v.replace(r'\u003d', '=')
v = v.replace(r'\u0026', '&')
ret_val[k] = v
logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
return ret_val
def scrap_out_thumbs(dom):
"""Scrap out thumbnail data from <script> tags."""
ret_val = {}
thumb_name = 'dimg_'
for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'):
_script = script.text
# var s='data:image/jpeg;base64, ...'
_imgdata = _re("s='([^']*)").findall(_script)
if not _imgdata:
continue
# var ii=['dimg_17']
for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script):
# At least the equal sign in the URL needs to be decoded
ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=")
logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys())
return ret_val
def request(query, params):
"""Google-Video search request"""
lang_info = get_lang_info(params, supported_languages, language_aliases, False)
google_info = get_google_info(params, traits)
query_url = (
'https://'
+ lang_info['subdomain']
+ google_info['subdomain']
+ '/search'
+ "?"
+ urlencode({'q': query, 'tbm': "vid", **lang_info['params'], 'ie': "utf8", 'oe': "utf8"})
+ urlencode(
{
'q': query,
'tbm': "vid",
'start': 10 * params['pageno'],
**google_info['params'],
'asearch': 'arc',
'async': 'use_ac:true,_fmt:html',
}
)
)
if params['time_range'] in time_range_dict:
@ -127,9 +90,8 @@ def request(query, params):
query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
params['url'] = query_url
params['cookies']['CONSENT'] = "YES+"
params['headers'].update(lang_info['headers'])
params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
params['cookies'] = google_info['cookies']
params['headers'].update(google_info['headers'])
return params
@ -141,43 +103,30 @@ def response(resp):
# convert the text to dom
dom = html.fromstring(resp.text)
vidthumb_imgdata = scrap_out_thumbs(dom)
thumbs_src = scrap_out_thumbs_src(dom)
logger.debug(str(thumbs_src))
# parse results
for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'):
# ignore google *sections*
if extract_text(eval_xpath(result, g_section_with_header)):
logger.debug("ignoring <g-section-with-header>")
img_src = eval_xpath_getindex(result, './/img/@src', 0, None)
if img_src is None:
continue
# ingnore articles without an image id / e.g. news articles
img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None)
if img_id is None:
logger.error("no img_id found in item %s (news article?)", len(results) + 1)
continue
title = extract_text(eval_xpath_getindex(result, './/a/h3[1]', 0))
url = eval_xpath_getindex(result, './/a/h3[1]/../@href', 0)
img_src = vidthumb_imgdata.get(img_id, None)
if not img_src:
img_src = thumbs_src.get(img_id, "")
title = extract_text(eval_xpath_getindex(result, title_xpath, 0))
url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0)
length = extract_text(eval_xpath(result, './/div[contains(@class, "P7xzyf")]/span/span'))
c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0)
content = extract_text(c_node)
pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]'))
pub_info = extract_text(eval_xpath(result, './/div[@class="P7xzyf"]'))
length = extract_text(eval_xpath(result, './/div[@class="J1mWY"]'))
results.append(
{
'url': url,
'title': title,
'content': content,
'length': length,
'author': pub_info,
'thumbnail': img_src,
'length': length,
'template': 'videos.html',
}
)

View File

@ -1,18 +1,30 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
peertube (Videos)
# lint: pylint
"""Peertube and :py:obj:`SepiaSearch <searx.engines.sepiasearch>` do share
(more or less) the same REST API and the schema of the JSON result is identical.
"""
from json import loads
from datetime import datetime
import re
from urllib.parse import urlencode
from searx.utils import html_to_text
from datetime import datetime
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta
import babel
from searx import network
from searx.locales import language_tag
from searx.utils import html_to_text
from searx.enginelib.traits import EngineTraits
traits: EngineTraits
# about
about = {
# pylint: disable=line-too-long
"website": 'https://joinpeertube.org',
"wikidata_id": 'Q50938515',
"official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html',
"official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
@ -22,66 +34,155 @@ about = {
categories = ["videos"]
paging = True
base_url = "https://peer.tube"
supported_languages_url = 'https://peer.tube/api/v1/videos/languages'
"""Base URL of the Peertube instance. A list of instances is available at:
- https://instances.joinpeertube.org/instances
"""
time_range_support = True
time_range_table = {
'day': relativedelta(),
'week': relativedelta(weeks=-1),
'month': relativedelta(months=-1),
'year': relativedelta(years=-1),
}
safesearch = True
safesearch_table = {0: 'both', 1: 'false', 2: 'false'}
def minute_to_hm(minute):
if isinstance(minute, int):
return "%d:%02d" % (divmod(minute, 60))
return None
# do search-request
def request(query, params):
sanitized_url = base_url.rstrip("/")
pageno = (params["pageno"] - 1) * 15
search_url = sanitized_url + "/api/v1/search/videos/?pageno={pageno}&{query}"
query_dict = {"search": query}
language = params["language"].split("-")[0]
if "all" != language and language in supported_languages:
query_dict["languageOneOf"] = language
params["url"] = search_url.format(query=urlencode(query_dict), pageno=pageno)
"""Assemble request for the Peertube API"""
if not query:
return False
# eng_region = traits.get_region(params['searxng_locale'], 'en_US')
eng_lang = traits.get_language(params['searxng_locale'], None)
params['url'] = (
base_url.rstrip("/")
+ "/api/v1/search/videos?"
+ urlencode(
{
'search': query,
'searchTarget': 'search-index', # Vidiversum
'resultType': 'videos',
'start': (params['pageno'] - 1) * 10,
'count': 10,
# -createdAt: sort by date ascending / createdAt: date descending
'sort': '-match', # sort by *match descending*
'nsfw': safesearch_table[params['safesearch']],
}
)
)
if eng_lang is not None:
params['url'] += '&languageOneOf[]=' + eng_lang
params['url'] += '&boostLanguages[]=' + eng_lang
if params['time_range'] in time_range_table:
time = datetime.now().date() + time_range_table[params['time_range']]
params['url'] += '&startDate=' + time.isoformat()
return params
def _get_offset_from_pageno(pageno):
return (pageno - 1) * 15 + 1
# get response from search-request
def response(resp):
sanitized_url = base_url.rstrip("/")
return video_response(resp)
def video_response(resp):
"""Parse video response from SepiaSearch and Peertube instances."""
results = []
search_res = loads(resp.text)
json_data = resp.json()
# return empty array if there are no results
if "data" not in search_res:
if 'data' not in json_data:
return []
# parse results
for res in search_res["data"]:
title = res["name"]
url = sanitized_url + "/videos/watch/" + res["uuid"]
description = res["description"]
if description:
content = html_to_text(res["description"])
else:
content = ""
thumbnail = sanitized_url + res["thumbnailPath"]
publishedDate = datetime.strptime(res["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
for result in json_data['data']:
metadata = [
x
for x in [
result.get('channel', {}).get('displayName'),
result.get('channel', {}).get('name') + '@' + result.get('channel', {}).get('host'),
', '.join(result.get('tags', [])),
]
if x
]
results.append(
{
"template": "videos.html",
"url": url,
"title": title,
"content": content,
"publishedDate": publishedDate,
"iframe_src": sanitized_url + res["embedPath"],
"thumbnail": thumbnail,
'url': result['url'],
'title': result['name'],
'content': html_to_text(result.get('description') or ''),
'author': result.get('account', {}).get('displayName'),
'length': minute_to_hm(result.get('duration')),
'template': 'videos.html',
'publishedDate': parse(result['publishedAt']),
'iframe_src': result.get('embedUrl'),
'thumbnail': result.get('thumbnailUrl') or result.get('previewUrl'),
'metadata': ' | '.join(metadata),
}
)
# return results
return results
def _fetch_supported_languages(resp):
videolanguages = resp.json()
peertube_languages = list(videolanguages.keys())
return peertube_languages
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages from peertube's search-index source code.
See videoLanguages_ in commit `8ed5c729 - Refactor and redesign client`_
.. _8ed5c729 - Refactor and redesign client:
https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729
.. _videoLanguages:
https://framagit.org/framasoft/peertube/search-index/-/commit/8ed5c729#3d8747f9a60695c367c70bb64efba8f403721fad_0_291
"""
resp = network.get(
'https://framagit.org/framasoft/peertube/search-index/-/raw/master/client/src/components/Filters.vue',
# the response from search-index repository is very slow
timeout=60,
)
if not resp.ok:
print("ERROR: response from peertube is not OK.")
return
js_lang = re.search(r"videoLanguages \(\)[^\n]+(.*?)\]", resp.text, re.DOTALL)
if not js_lang:
print("ERROR: can't determine languages from peertube")
return
for lang in re.finditer(r"\{ id: '([a-z]+)', label:", js_lang.group(1)):
try:
eng_tag = lang.group(1)
if eng_tag == 'oc':
# Occitanis not known by babel, its closest relative is Catalan
# but 'ca' is already in the list of engine_traits.languages -->
# 'oc' will be ignored.
continue
sxng_tag = language_tag(babel.Locale.parse(eng_tag))
except babel.UnknownLocaleError:
print("ERROR: %s is unknown by babel" % eng_tag)
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.languages[sxng_tag] = eng_tag
engine_traits.languages['zh_Hans'] = 'zh'
engine_traits.languages['zh_Hant'] = 'zh'

View File

@ -34,7 +34,9 @@ import babel
from searx.exceptions import SearxEngineAPIException
from searx.network import raise_for_httperror
from searx.locales import get_engine_locale
from searx.enginelib.traits import EngineTraits
traits: EngineTraits
# about
about = {
@ -49,7 +51,6 @@ about = {
# engine dependent config
categories = []
paging = True
supported_languages_url = about['website']
qwant_categ = None # web|news|inages|videos
safesearch = True
@ -95,7 +96,7 @@ def request(query, params):
)
# add quant's locale
q_locale = get_engine_locale(params['language'], supported_languages, default='en_US')
q_locale = traits.get_region(params["searxng_locale"], default='en_US')
params['url'] += '&locale=' + q_locale
# add safesearch option
@ -243,15 +244,20 @@ def response(resp):
return results
def _fetch_supported_languages(resp):
def fetch_traits(engine_traits: EngineTraits):
# pylint: disable=import-outside-toplevel
from searx import network
from searx.locales import region_tag
resp = network.get(about['website'])
text = resp.text
text = text[text.find('INITIAL_PROPS') :]
text = text[text.find('{') : text.find('</script>')]
q_initial_props = loads(text)
q_locales = q_initial_props.get('locales')
q_valid_locales = []
eng_tag_list = set()
for country, v in q_locales.items():
for lang in v['langs']:
@ -261,25 +267,18 @@ def _fetch_supported_languages(resp):
# qwant-news does not support all locales from qwant-web:
continue
q_valid_locales.append(_locale)
eng_tag_list.add(_locale)
supported_languages = {}
for q_locale in q_valid_locales:
for eng_tag in eng_tag_list:
try:
locale = babel.Locale.parse(q_locale, sep='_')
except babel.core.UnknownLocaleError:
print("ERROR: can't determine babel locale of quant's locale %s" % q_locale)
sxng_tag = region_tag(babel.Locale.parse(eng_tag, sep='_'))
except babel.UnknownLocaleError:
print("ERROR: can't determine babel locale of quant's locale %s" % eng_tag)
continue
# note: supported_languages (dict)
#
# dict's key is a string build up from a babel.Locale object / the
# notation 'xx-XX' (and 'xx') conforms to SearXNG's locale (and
# language) notation and dict's values are the locale strings used by
# the engine.
searxng_locale = locale.language + '-' + locale.territory # --> params['language']
supported_languages[searxng_locale] = q_locale
return supported_languages
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag

View File

@ -1,70 +1,80 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
SepiaSearch (Videos)
# lint: pylint
"""SepiaSearch uses the same languages as :py:obj:`Peertube
<searx.engines.peertube>` and the response is identical to the response from the
peertube engines.
"""
from json import loads
from dateutil import parser, relativedelta
from typing import TYPE_CHECKING
from urllib.parse import urlencode
from datetime import datetime
# about
from searx.engines.peertube import fetch_traits # pylint: disable=unused-import
from searx.engines.peertube import (
# pylint: disable=unused-import
video_response,
safesearch_table,
time_range_table,
)
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
about = {
# pylint: disable=line-too-long
"website": 'https://sepiasearch.org',
"wikidata_id": None,
"official_api_documentation": "https://framagit.org/framasoft/peertube/search-index/-/tree/master/server/controllers/api", # NOQA
"official_api_documentation": 'https://docs.joinpeertube.org/api-rest-reference.html#tag/Search/operation/searchVideos',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
# engine dependent config
categories = ['videos']
paging = True
base_url = 'https://sepiasearch.org'
time_range_support = True
safesearch = True
supported_languages = [
# fmt: off
'en', 'fr', 'ja', 'eu', 'ca', 'cs', 'eo', 'el',
'de', 'it', 'nl', 'es', 'oc', 'gd', 'zh', 'pt',
'sv', 'pl', 'fi', 'ru'
# fmt: on
]
base_url = 'https://sepiasearch.org/api/v1/search/videos'
safesearch_table = {0: 'both', 1: 'false', 2: 'false'}
time_range_table = {
'day': relativedelta.relativedelta(),
'week': relativedelta.relativedelta(weeks=-1),
'month': relativedelta.relativedelta(months=-1),
'year': relativedelta.relativedelta(years=-1),
}
def minute_to_hm(minute):
if isinstance(minute, int):
return "%d:%02d" % (divmod(minute, 60))
return None
def request(query, params):
"""Assemble request for the SepiaSearch API"""
if not query:
return False
# eng_region = traits.get_region(params['searxng_locale'], 'en_US')
eng_lang = traits.get_language(params['searxng_locale'], None)
params['url'] = (
base_url
+ '?'
base_url.rstrip("/")
+ "/api/v1/search/videos?"
+ urlencode(
{
'search': query,
'start': (params['pageno'] - 1) * 10,
'count': 10,
'sort': '-match',
# -createdAt: sort by date ascending / createdAt: date descending
'sort': '-match', # sort by *match descending*
'nsfw': safesearch_table[params['safesearch']],
}
)
)
language = params['language'].split('-')[0]
if language in supported_languages:
params['url'] += '&languageOneOf[]=' + language
if eng_lang is not None:
params['url'] += '&languageOneOf[]=' + eng_lang
params['url'] += '&boostLanguages[]=' + eng_lang
if params['time_range'] in time_range_table:
time = datetime.now().date() + time_range_table[params['time_range']]
params['url'] += '&startDate=' + time.isoformat()
@ -73,34 +83,4 @@ def request(query, params):
def response(resp):
results = []
search_results = loads(resp.text)
if 'data' not in search_results:
return []
for result in search_results['data']:
title = result['name']
content = result['description']
thumbnail = result['thumbnailUrl']
publishedDate = parser.parse(result['publishedAt'])
author = result.get('account', {}).get('displayName')
length = minute_to_hm(result.get('duration'))
url = result['url']
results.append(
{
'url': url,
'title': title,
'content': content,
'author': author,
'length': length,
'template': 'videos.html',
'publishedDate': publishedDate,
'iframe_src': result.get('embedUrl'),
'thumbnail': thumbnail,
}
)
return results
return video_response(resp)

View File

@ -1,28 +1,108 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Startpage (Web)
"""Startpage's language & region selectors are a mess ..
.. _startpage regions:
Startpage regions
=================
In the list of regions there are tags we need to map to common region tags::
pt-BR_BR --> pt_BR
zh-CN_CN --> zh_Hans_CN
zh-TW_TW --> zh_Hant_TW
zh-TW_HK --> zh_Hant_HK
en-GB_GB --> en_GB
and there is at least one tag with a three letter language tag (ISO 639-2)::
fil_PH --> fil_PH
The locale code ``no_NO`` from Startpage does not exists and is mapped to
``nb-NO``::
babel.core.UnknownLocaleError: unknown locale 'no_NO'
For reference see languages-subtag at iana; ``no`` is the macrolanguage [1]_ and
W3C recommends subtag over macrolanguage [2]_.
.. [1] `iana: language-subtag-registry
<https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry>`_ ::
type: language
Subtag: nb
Description: Norwegian Bokmål
Added: 2005-10-16
Suppress-Script: Latn
Macrolanguage: no
.. [2]
Use macrolanguages with care. Some language subtags have a Scope field set to
macrolanguage, i.e. this primary language subtag encompasses a number of more
specific primary language subtags in the registry. ... As we recommended for
the collection subtags mentioned above, in most cases you should try to use
the more specific subtags ... `W3: The primary language subtag
<https://www.w3.org/International/questions/qa-choosing-language-tags#langsubtag>`_
.. _startpage languages:
Startpage languages
===================
:py:obj:`send_accept_language_header`:
The displayed name in Startpage's settings page depend on the location of the
IP when ``Accept-Language`` HTTP header is unset. In :py:obj:`fetch_traits`
we use::
'Accept-Language': "en-US,en;q=0.5",
..
to get uniform names independent from the IP).
.. _startpage categories:
Startpage categories
====================
Startpage's category (for Web-search, News, Videos, ..) is set by
:py:obj:`startpage_categ` in settings.yml::
- name: startpage
engine: startpage
startpage_categ: web
...
.. hint::
The default category is ``web`` .. and other categories than ``web`` are not
yet implemented.
"""
from typing import TYPE_CHECKING
from collections import OrderedDict
import re
from time import time
from urllib.parse import urlencode
from unicodedata import normalize, combining
from time import time
from datetime import datetime, timedelta
from dateutil import parser
from lxml import html
from babel import Locale
from babel.localedata import locale_identifiers
import dateutil.parser
import lxml.html
import babel
from searx.network import get
from searx.utils import extract_text, eval_xpath, match_language
from searx.exceptions import (
SearxEngineResponseException,
SearxEngineCaptchaException,
)
from searx import network
from searx.utils import extract_text, eval_xpath, gen_useragent
from searx.exceptions import SearxEngineCaptchaException
from searx.locales import region_tag
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
@ -34,18 +114,28 @@ about = {
"results": 'HTML',
}
startpage_categ = 'web'
"""Startpage's category, visit :ref:`startpage categories`.
"""
send_accept_language_header = True
"""Startpage tries to guess user's language and territory from the HTTP
``Accept-Language``. Optional the user can select a search-language (can be
different to the UI language) and a region filter.
"""
# engine dependent config
categories = ['general', 'web']
# there is a mechanism to block "bot" search
# (probably the parameter qid), require
# storing of qid's between mulitble search-calls
paging = True
supported_languages_url = 'https://www.startpage.com/do/settings'
time_range_support = True
safesearch = True
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
safesearch_dict = {0: '0', 1: '1', 2: '1'}
# search-url
base_url = 'https://startpage.com/'
search_url = base_url + 'sp/search?'
base_url = 'https://www.startpage.com'
search_url = base_url + '/sp/search'
# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
@ -53,92 +143,193 @@ search_url = base_url + 'sp/search?'
results_xpath = '//div[@class="w-gl__result__main"]'
link_xpath = './/a[@class="w-gl__result-title result-link"]'
content_xpath = './/p[@class="w-gl__description"]'
search_form_xpath = '//form[@id="search"]'
"""XPath of Startpage's origin search form
.. code: html
<form action="/sp/search" method="post">
<input type="text" name="query" value="" ..>
<input type="hidden" name="t" value="device">
<input type="hidden" name="lui" value="english">
<input type="hidden" name="sc" value="Q7Mt5TRqowKB00">
<input type="hidden" name="cat" value="web">
<input type="hidden" class="abp" id="abp-input" name="abp" value="1">
</form>
"""
# timestamp of the last fetch of 'sc' code
sc_code_ts = 0
sc_code = ''
sc_code_cache_sec = 30
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""
def raise_captcha(resp):
def get_sc_code(searxng_locale, params):
"""Get an actual ``sc`` argument from Startpage's search form (HTML page).
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
raise SearxEngineCaptchaException()
Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
<search_form_xpath>`. Without this argument Startpage considers the request
is from a bot. We do not know what is encoded in the value of the ``sc``
argument, but it seems to be a kind of a *time-stamp*.
def get_sc_code(headers):
"""Get an actual `sc` argument from startpage's home page.
Startpage puts a `sc` argument on every link. Without this argument
startpage considers the request is from a bot. We do not know what is
encoded in the value of the `sc` argument, but it seems to be a kind of a
*time-stamp*. This *time-stamp* is valid for a few hours.
This function scrap a new *time-stamp* from startpage's home page every hour
(3000 sec).
Startpage's search form generates a new sc-code on each request. This
function scrap a new sc-code from Startpage's home page every
:py:obj:`sc_code_cache_sec` seconds.
"""
global sc_code_ts, sc_code # pylint: disable=global-statement
if time() > (sc_code_ts + 3000):
logger.debug("query new sc time-stamp ...")
if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)):
logger.debug("get_sc_code: reuse '%s'", sc_code)
return sc_code
resp = get(base_url, headers=headers)
raise_captcha(resp)
dom = html.fromstring(resp.text)
headers = {**params['headers']}
headers['Origin'] = base_url
headers['Referer'] = base_url + '/'
# headers['Connection'] = 'keep-alive'
# headers['Accept-Encoding'] = 'gzip, deflate, br'
# headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
# headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0'
# add Accept-Language header
if searxng_locale == 'all':
searxng_locale = 'en-US'
locale = babel.Locale.parse(searxng_locale, sep='-')
if send_accept_language_header:
ac_lang = locale.language
if locale.territory:
ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
locale.language,
locale.territory,
locale.language,
)
headers['Accept-Language'] = ac_lang
get_sc_url = base_url + '/?sc=%s' % (sc_code)
logger.debug("query new sc time-stamp ... %s", get_sc_url)
logger.debug("headers: %s", headers)
resp = network.get(get_sc_url, headers=headers)
# ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers)
# ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
# ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21
if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
raise SearxEngineCaptchaException(
message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
)
dom = lxml.html.fromstring(resp.text)
try:
# <input type="hidden" name="sc" value="...">
sc_code = eval_xpath(dom, '//input[@name="sc"]/@value')[0]
sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
except IndexError as exc:
# suspend startpage API --> https://github.com/searxng/searxng/pull/695
raise SearxEngineResponseException(
suspended_time=7 * 24 * 3600, message="PR-695: query new sc time-stamp failed!"
logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
raise SearxEngineCaptchaException(
message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url,
) from exc
sc_code_ts = time()
logger.debug("new value is: %s", sc_code)
logger.debug("get_sc_code: new value is: %s", sc_code)
return sc_code
# do search-request
def request(query, params):
"""Assemble a Startpage request.
# pylint: disable=line-too-long
# The format string from Startpage's FFox add-on [1]::
#
# https://www.startpage.com/do/dsearch?query={searchTerms}&cat=web&pl=ext-ff&language=__MSG_extensionUrlLanguage__&extVersion=1.3.0
#
# [1] https://addons.mozilla.org/en-US/firefox/addon/startpage-private-search/
To avoid CAPTCHA we need to send a well formed HTTP POST request with a
cookie. We need to form a request that is identical to the request build by
Startpage's search form:
- in the cookie the **region** is selected
- in the HTTP POST data the **language** is selected
Additionally the arguments form Startpage's search form needs to be set in
HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.
"""
if startpage_categ == 'web':
return _request_cat_web(query, params)
logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
return params
def _request_cat_web(query, params):
engine_region = traits.get_region(params['searxng_locale'], 'en-US')
engine_language = traits.get_language(params['searxng_locale'], 'en')
# build arguments
args = {
'query': query,
'page': params['pageno'],
'cat': 'web',
# 'pl': 'ext-ff',
# 'extVersion': '1.3.0',
# 'abp': "-1",
'sc': get_sc_code(params['headers']),
't': 'device',
'sc': get_sc_code(params['searxng_locale'], params), # hint: this func needs HTTP headers,
'with_date': time_range_dict.get(params['time_range'], ''),
}
# set language if specified
if params['language'] != 'all':
lang_code = match_language(params['language'], supported_languages, fallback=None)
if lang_code:
language_name = supported_languages[lang_code]['alias']
args['language'] = language_name
args['lui'] = language_name
if engine_language:
args['language'] = engine_language
args['lui'] = engine_language
args['abp'] = '1'
if params['pageno'] > 1:
args['page'] = params['pageno']
# build cookie
lang_homepage = 'en'
cookie = OrderedDict()
cookie['date_time'] = 'world'
cookie['disable_family_filter'] = safesearch_dict[params['safesearch']]
cookie['disable_open_in_new_window'] = '0'
cookie['enable_post_method'] = '1' # hint: POST
cookie['enable_proxy_safety_suggest'] = '1'
cookie['enable_stay_control'] = '1'
cookie['instant_answers'] = '1'
cookie['lang_homepage'] = 's/device/%s/' % lang_homepage
cookie['num_of_results'] = '10'
cookie['suggestions'] = '1'
cookie['wt_unit'] = 'celsius'
if engine_language:
cookie['language'] = engine_language
cookie['language_ui'] = engine_language
if engine_region:
cookie['search_results_region'] = engine_region
params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()])
logger.debug('cookie preferences: %s', params['cookies']['preferences'])
# POST request
logger.debug("data: %s", args)
params['data'] = args
params['method'] = 'POST'
params['url'] = search_url
params['headers']['Origin'] = base_url
params['headers']['Referer'] = base_url + '/'
# is the Accept header needed?
# params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
params['url'] = search_url + urlencode(args)
return params
# get response from search-request
def response(resp):
results = []
dom = lxml.html.fromstring(resp.text)
dom = html.fromstring(resp.text)
if startpage_categ == 'web':
return _response_cat_web(dom)
logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
return []
def _response_cat_web(dom):
results = []
# parse results
for result in eval_xpath(dom, results_xpath):
@ -173,7 +364,7 @@ def response(resp):
content = content[date_pos:]
try:
published_date = parser.parse(date_string, dayfirst=True)
published_date = dateutil.parser.parse(date_string, dayfirst=True)
except ValueError:
pass
@ -199,62 +390,103 @@ def response(resp):
return results
# get supported languages from their site
def _fetch_supported_languages(resp):
# startpage's language selector is a mess each option has a displayed name
# and a value, either of which may represent the language name in the native
# script, the language name in English, an English transliteration of the
# native name, the English name of the writing script used by the language,
# or occasionally something else entirely.
def fetch_traits(engine_traits: EngineTraits):
"""Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
regions>` from Startpage."""
# pylint: disable=too-many-branches
# this cases are so special they need to be hardcoded, a couple of them are misspellings
language_names = {
'english_uk': 'en-GB',
'fantizhengwen': ['zh-TW', 'zh-HK'],
'hangul': 'ko',
'malayam': 'ml',
'norsk': 'nb',
'sinhalese': 'si',
'sudanese': 'su',
headers = {
'User-Agent': gen_useragent(),
'Accept-Language': "en-US,en;q=0.5", # bing needs to set the English language
}
resp = network.get('https://www.startpage.com/do/settings', headers=headers)
# get the English name of every language known by babel
language_names.update(
{
# fmt: off
name.lower(): lang_code
# pylint: disable=protected-access
for lang_code, name in Locale('en')._data['languages'].items()
# fmt: on
}
)
if not resp.ok:
print("ERROR: response from Startpage is not OK.")
dom = lxml.html.fromstring(resp.text)
# regions
sp_region_names = []
for option in dom.xpath('//form[@name="settings"]//select[@name="search_results_region"]/option'):
sp_region_names.append(option.get('value'))
for eng_tag in sp_region_names:
if eng_tag == 'all':
continue
babel_region_tag = {'no_NO': 'nb_NO'}.get(eng_tag, eng_tag) # norway
if '-' in babel_region_tag:
l, r = babel_region_tag.split('-')
r = r.split('_')[-1]
sxng_tag = region_tag(babel.Locale.parse(l + '_' + r, sep='_'))
else:
try:
sxng_tag = region_tag(babel.Locale.parse(babel_region_tag, sep='_'))
except babel.UnknownLocaleError:
print("ERROR: can't determine babel locale of startpage's locale %s" % eng_tag)
continue
conflict = engine_traits.regions.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.regions[sxng_tag] = eng_tag
# languages
catalog_engine2code = {name.lower(): lang_code for lang_code, name in babel.Locale('en').languages.items()}
# get the native name of every language known by babel
for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, locale_identifiers()):
native_name = Locale(lang_code).get_language_name().lower()
for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, babel.localedata.locale_identifiers()):
native_name = babel.Locale(lang_code).get_language_name().lower()
# add native name exactly as it is
language_names[native_name] = lang_code
catalog_engine2code[native_name] = lang_code
# add "normalized" language name (i.e. français becomes francais and español becomes espanol)
unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
if len(unaccented_name) == len(unaccented_name.encode()):
# add only if result is ascii (otherwise "normalization" didn't work)
language_names[unaccented_name] = lang_code
catalog_engine2code[unaccented_name] = lang_code
# values that can't be determined by babel's languages names
catalog_engine2code.update(
{
# traditional chinese used in ..
'fantizhengwen': 'zh_Hant',
# Korean alphabet
'hangul': 'ko',
# Malayalam is one of 22 scheduled languages of India.
'malayam': 'ml',
'norsk': 'nb',
'sinhalese': 'si',
}
)
skip_eng_tags = {
'english_uk', # SearXNG lang 'en' already maps to 'english'
}
dom = html.fromstring(resp.text)
sp_lang_names = []
for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):
sp_lang_names.append((option.get('value'), extract_text(option).lower()))
supported_languages = {}
for sp_option_value, sp_option_text in sp_lang_names:
lang_code = language_names.get(sp_option_value) or language_names.get(sp_option_text)
if isinstance(lang_code, str):
supported_languages[lang_code] = {'alias': sp_option_value}
elif isinstance(lang_code, list):
for _lc in lang_code:
supported_languages[_lc] = {'alias': sp_option_value}
else:
print('Unknown language option in Startpage: {} ({})'.format(sp_option_value, sp_option_text))
eng_tag = option.get('value')
if eng_tag in skip_eng_tags:
continue
name = extract_text(option).lower()
return supported_languages
sxng_tag = catalog_engine2code.get(eng_tag)
if sxng_tag is None:
sxng_tag = catalog_engine2code[name]
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.languages[sxng_tag] = eng_tag

View File

@ -1,9 +1,12 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Wikidata
"""This module implements the Wikidata engine. Some implementations are shared
from :ref:`wikipedia engine`.
"""
# pylint: disable=missing-class-docstring
from typing import TYPE_CHECKING
from hashlib import md5
from urllib.parse import urlencode, unquote
from json import loads
@ -13,12 +16,17 @@ from babel.dates import format_datetime, format_date, format_time, get_datetime_
from searx.data import WIKIDATA_UNITS
from searx.network import post, get
from searx.utils import match_language, searx_useragent, get_string_replaces_function
from searx.utils import searx_useragent, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.engines.wikipedia import ( # pylint: disable=unused-import
_fetch_supported_languages,
supported_languages_url,
)
from searx.engines.wikipedia import fetch_traits as _fetch_traits
from searx.enginelib.traits import EngineTraits
if TYPE_CHECKING:
import logging
logger: logging.Logger
traits: EngineTraits
# about
about = {
@ -154,33 +162,35 @@ def send_wikidata_query(query, method='GET'):
def request(query, params):
language = params['language'].split('-')[0]
if language == 'all':
language = 'en'
else:
language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
# wikidata does not support zh-classical (zh_Hans) / zh-TW, zh-HK and zh-CN
# mapped to zh
sxng_lang = params['searxng_locale'].split('-')[0]
language = traits.get_language(sxng_lang, 'en')
query, attributes = get_query(query, language)
logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))
params['method'] = 'POST'
params['url'] = SPARQL_ENDPOINT_URL
params['data'] = {'query': query}
params['headers'] = get_headers()
params['language'] = language
params['attributes'] = attributes
return params
def response(resp):
results = []
jsonresponse = loads(resp.content.decode())
language = resp.search_params['language'].lower()
language = resp.search_params['language']
attributes = resp.search_params['attributes']
logger.debug("request --> language %s // len(attributes): %s", language, len(attributes))
seen_entities = set()
for result in jsonresponse.get('results', {}).get('bindings', []):
attribute_result = {key: value['value'] for key, value in result.items()}
entity_url = attribute_result['item']
@ -756,3 +766,15 @@ def init(engine_settings=None): # pylint: disable=unused-argument
lang = result['name']['xml:lang']
entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '')
WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize()
def fetch_traits(engine_traits: EngineTraits):
"""Use languages evaluated from :py:obj:`wikipedia.fetch_traits
<searx.engines.wikipedia.fetch_traits>` except zh-classical (zh_Hans) what
is not supported by wikidata."""
_fetch_traits(engine_traits)
# wikidata does not support zh-classical (zh_Hans)
engine_traits.languages.pop('zh_Hans')
# wikidata does not have net-locations for the languages
engine_traits.custom['wiki_netloc'] = {}

View File

@ -1,13 +1,26 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Wikipedia (Web)
# lint: pylint
"""This module implements the Wikipedia engine. Some of this implementations
are shared by other engines:
- :ref:`wikidata engine`
The list of supported languages is fetched from the article linked by
:py:obj:`wikipedia_article_depth`. Unlike traditional search engines, wikipedia
does not support one Wikipedia for all the languages, but there is one Wikipedia
for every language (:py:obj:`fetch_traits`).
"""
from urllib.parse import quote
from json import loads
from lxml.html import fromstring
from searx.utils import match_language, searx_useragent
from searx.network import raise_for_httperror
import urllib.parse
import babel
from lxml import html
from searx import network
from searx.locales import language_tag
from searx.enginelib.traits import EngineTraits
traits: EngineTraits
# about
about = {
@ -19,32 +32,40 @@ about = {
"results": 'JSON',
}
send_accept_language_header = True
# search-url
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
language_variants = {"zh": ("zh-cn", "zh-hk", "zh-mo", "zh-my", "zh-sg", "zh-tw")}
wikipedia_article_depth = 'https://meta.wikimedia.org/wiki/Wikipedia_article_depth'
"""The *editing depth* of Wikipedia is one of several possible rough indicators
of the encyclopedia's collaborative quality, showing how frequently its articles
are updated. The measurement of depth was introduced after some limitations of
the classic measurement of article count were realized.
"""
# example: https://zh-classical.wikipedia.org/api/rest_v1/page/summary/日
rest_v1_summary_url = 'https://{wiki_netloc}/api/rest_v1/page/summary/{title}'
"""`wikipedia rest_v1 summary API`_: The summary response includes an extract of
the first paragraph of the page in plain text and HTML as well as the type of
page. This is useful for page previews (fka. Hovercards, aka. Popups) on the web
and link previews in the apps.
.. _wikipedia rest_v1 summary API: https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_
"""
# set language in base_url
def url_lang(lang):
lang_pre = lang.split('-')[0]
if lang_pre == 'all' or lang_pre not in supported_languages and lang_pre not in language_aliases:
return 'en'
return match_language(lang, supported_languages, language_aliases).split('-')[0]
# do search-request
def request(query, params):
"""Assemble a request (`wikipedia rest_v1 summary API`_)."""
if query.islower():
query = query.title()
language = url_lang(params['language'])
params['url'] = search_url.format(title=quote(query), language=language)
engine_language = traits.get_language(params['searxng_locale'], 'en')
wiki_netloc = traits.custom['wiki_netloc'].get(engine_language, 'https://en.wikipedia.org/wiki/')
title = urllib.parse.quote(query)
# '!wikipedia 日 :zh-TW' --> https://zh-classical.wikipedia.org/
# '!wikipedia 日 :zh' --> https://zh.wikipedia.org/
params['url'] = rest_v1_summary_url.format(wiki_netloc=wiki_netloc, title=title)
params['headers']['User-Agent'] = searx_useragent()
params['raise_for_httperror'] = False
params['soft_max_redirects'] = 2
@ -53,13 +74,14 @@ def request(query, params):
# get response from search-request
def response(resp):
results = []
if resp.status_code == 404:
return []
if resp.status_code == 400:
try:
api_result = loads(resp.text)
except:
api_result = resp.json()
except Exception: # pylint: disable=broad-except
pass
else:
if (
@ -68,20 +90,14 @@ def response(resp):
):
return []
raise_for_httperror(resp)
results = []
api_result = loads(resp.text)
# skip disambiguation pages
if api_result.get('type') != 'standard':
return []
network.raise_for_httperror(resp)
api_result = resp.json()
title = api_result['title']
wikipedia_link = api_result['content_urls']['desktop']['page']
results.append({'url': wikipedia_link, 'title': title, 'content': api_result.get('description', '')})
results.append({'url': wikipedia_link, 'title': title})
if api_result.get('type') == 'standard':
results.append(
{
'infobox': title,
@ -95,22 +111,114 @@ def response(resp):
return results
# get supported languages from their site
def _fetch_supported_languages(resp):
supported_languages = {}
dom = fromstring(resp.text)
tables = dom.xpath('//table[contains(@class,"sortable")]')
for table in tables:
# exclude header row
trs = table.xpath('.//tr')[1:]
for tr in trs:
td = tr.xpath('./td')
code = td[3].xpath('./a')[0].text
name = td[1].xpath('./a')[0].text
english_name = td[1].xpath('./a')[0].text
articles = int(td[4].xpath('./a')[0].text.replace(',', ''))
# exclude languages with too few articles
if articles >= 100:
supported_languages[code] = {"name": name, "english_name": english_name}
# Nonstandard language codes
#
# These Wikipedias use language codes that do not conform to the ISO 639
# standard (which is how wiki subdomains are chosen nowadays).
return supported_languages
lang_map = {
'be-tarask': 'bel',
'ak': 'aka',
'als': 'gsw',
'bat-smg': 'sgs',
'cbk-zam': 'cbk',
'fiu-vro': 'vro',
'map-bms': 'map',
'nrm': 'nrf',
'roa-rup': 'rup',
'nds-nl': 'nds',
#'simple: invented code used for the Simple English Wikipedia (not the official IETF code en-simple)
'zh-min-nan': 'nan',
'zh-yue': 'yue',
'an': 'arg',
'zh-classical': 'zh-Hant', # babel maps classical to zh-Hans (for whatever reason)
}
unknown_langs = [
'an', # Aragonese
'ba', # Bashkir
'bar', # Bavarian
'bcl', # Central Bicolano
'be-tarask', # Belarusian variant / Belarusian is already covered by 'be'
'bpy', # Bishnupriya Manipuri is unknown by babel
'hif', # Fiji Hindi
'ilo', # Ilokano
'li', # Limburgish
'sco', # Scots (sco) is not known by babel, Scottish Gaelic (gd) is known by babel
'sh', # Serbo-Croatian
'simple', # simple english is not know as a natural language different to english (babel)
'vo', # Volapük
'wa', # Walloon
]
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages from Wikipedia.
The location of the Wikipedia address of a language is mapped in a
:py:obj:`custom field <searx.enginelib.traits.EngineTraits.custom>`
(``wiki_netloc``). Here is a reduced example:
.. code:: python
traits.custom['wiki_netloc'] = {
"en": "en.wikipedia.org",
..
"gsw": "als.wikipedia.org",
..
"zh": "zh.wikipedia.org",
"zh-classical": "zh-classical.wikipedia.org"
}
"""
engine_traits.custom['wiki_netloc'] = {}
# insert alias to map from a region like zh-CN to a language zh_Hans
engine_traits.languages['zh_Hans'] = 'zh'
resp = network.get(wikipedia_article_depth)
if not resp.ok:
print("ERROR: response from Wikipedia is not OK.")
dom = html.fromstring(resp.text)
for row in dom.xpath('//table[contains(@class,"sortable")]//tbody/tr'):
cols = row.xpath('./td')
if not cols:
continue
cols = [c.text_content().strip() for c in cols]
depth = float(cols[3].replace('-', '0').replace(',', ''))
articles = int(cols[4].replace(',', '').replace(',', ''))
if articles < 10000:
# exclude languages with too few articles
continue
if int(depth) < 20:
# Rough indicator of a Wikipedias quality, showing how frequently
# its articles are updated.
continue
eng_tag = cols[2]
wiki_url = row.xpath('./td[3]/a/@href')[0]
wiki_url = urllib.parse.urlparse(wiki_url)
if eng_tag in unknown_langs:
continue
try:
sxng_tag = language_tag(babel.Locale.parse(lang_map.get(eng_tag, eng_tag), sep='-'))
except babel.UnknownLocaleError:
print("ERROR: %s [%s] is unknown by babel" % (cols[0], eng_tag))
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.languages[sxng_tag] = eng_tag
engine_traits.custom['wiki_netloc'][eng_tag] = wiki_url.netloc

View File

@ -17,8 +17,10 @@ from searx.utils import (
eval_xpath_getindex,
eval_xpath_list,
extract_text,
match_language,
)
from searx.enginelib.traits import EngineTraits
traits: EngineTraits
# about
about = {
@ -34,8 +36,7 @@ about = {
categories = ['general', 'web']
paging = True
time_range_support = True
supported_languages_url = 'https://search.yahoo.com/preferences/languages'
"""Supported languages are read from Yahoo preference page."""
# send_accept_language_header = True
time_range_dict = {
'day': ('1d', 'd'),
@ -43,15 +44,10 @@ time_range_dict = {
'month': ('1m', 'm'),
}
language_aliases = {
'zh-HK': 'zh_chs',
'zh-CN': 'zh_chs', # dead since 2015 / routed to hk.search.yahoo.com
'zh-TW': 'zh_cht',
}
lang2domain = {
'zh_chs': 'hk.search.yahoo.com',
'zh_cht': 'tw.search.yahoo.com',
'any': 'search.yahoo.com',
'en': 'search.yahoo.com',
'bg': 'search.yahoo.com',
'cs': 'search.yahoo.com',
@ -67,21 +63,23 @@ lang2domain = {
}
"""Map language to domain"""
def _get_language(params):
lang = language_aliases.get(params['language'])
if lang is None:
lang = match_language(params['language'], supported_languages, language_aliases)
lang = lang.split('-')[0]
logger.debug("params['language']: %s --> %s", params['language'], lang)
return lang
locale_aliases = {
'zh': 'zh_Hans',
'zh-HK': 'zh_Hans',
'zh-CN': 'zh_Hans', # dead since 2015 / routed to hk.search.yahoo.com
'zh-TW': 'zh_Hant',
}
def request(query, params):
"""build request"""
lang = locale_aliases.get(params['language'], None)
if not lang:
lang = params['language'].split('-')[0]
lang = traits.get_language(lang, traits.all_locale)
offset = (params['pageno'] - 1) * 7 + 1
lang = _get_language(params)
age, btf = time_range_dict.get(params['time_range'], ('', ''))
args = urlencode(
@ -154,13 +152,37 @@ def response(resp):
return results
# get supported languages from their site
def _fetch_supported_languages(resp):
supported_languages = []
def fetch_traits(engine_traits: EngineTraits):
"""Fetch languages from yahoo"""
# pylint: disable=import-outside-toplevel
import babel
from searx import network
from searx.locales import language_tag
engine_traits.all_locale = 'any'
resp = network.get('https://search.yahoo.com/preferences/languages')
if not resp.ok:
print("ERROR: response from peertube is not OK.")
dom = html.fromstring(resp.text)
offset = len('lang_')
for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'):
supported_languages.append(val[offset:])
eng2sxng = {'zh_chs': 'zh_Hans', 'zh_cht': 'zh_Hant'}
return supported_languages
for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'):
eng_tag = val[offset:]
try:
sxng_tag = language_tag(babel.Locale.parse(eng2sxng.get(eng_tag, eng_tag)))
except babel.UnknownLocaleError:
print('ERROR: unknown language --> %s' % eng_tag)
continue
conflict = engine_traits.languages.get(sxng_tag)
if conflict:
if conflict != eng_tag:
print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_tag))
continue
engine_traits.languages[sxng_tag] = eng_tag

View File

@ -4,11 +4,11 @@
"""Initialize :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`.
"""
from typing import Set
from typing import Set, Optional, List
import os
import pathlib
from babel import Locale
import babel
from babel.support import Translations
import babel.languages
import babel.core
@ -134,7 +134,7 @@ def locales_initialize(directory=None):
flask_babel.get_translations = get_translations
for tag, descr in ADDITIONAL_TRANSLATIONS.items():
locale = Locale.parse(LOCALE_BEST_MATCH[tag], sep='-')
locale = babel.Locale.parse(LOCALE_BEST_MATCH[tag], sep='-')
LOCALE_NAMES[tag] = descr
if locale.text_direction == 'rtl':
RTL_LOCALES.add(tag)
@ -142,7 +142,7 @@ def locales_initialize(directory=None):
for tag in LOCALE_BEST_MATCH:
descr = LOCALE_NAMES.get(tag)
if not descr:
locale = Locale.parse(tag, sep='-')
locale = babel.Locale.parse(tag, sep='-')
LOCALE_NAMES[tag] = get_locale_descr(locale, tag.replace('-', '_'))
if locale.text_direction == 'rtl':
RTL_LOCALES.add(tag)
@ -154,12 +154,77 @@ def locales_initialize(directory=None):
tag = dirname.replace('_', '-')
descr = LOCALE_NAMES.get(tag)
if not descr:
locale = Locale.parse(dirname)
locale = babel.Locale.parse(dirname)
LOCALE_NAMES[tag] = get_locale_descr(locale, dirname)
if locale.text_direction == 'rtl':
RTL_LOCALES.add(tag)
def region_tag(locale: babel.Locale) -> str:
"""Returns SearXNG's region tag from the locale (e.g. zh-TW , en-US)."""
if not locale.territory:
raise ValueError('%s missed a territory')
return locale.language + '-' + locale.territory
def language_tag(locale: babel.Locale) -> str:
"""Returns SearXNG's language tag from the locale and if exits, the tag
includes the script name (e.g. en, zh_Hant).
"""
sxng_lang = locale.language
if locale.script:
sxng_lang += '_' + locale.script
return sxng_lang
def get_locale(locale_tag: str) -> Optional[babel.Locale]:
"""Returns a :py:obj:`babel.Locale` object parsed from argument
``locale_tag``"""
try:
locale = babel.Locale.parse(locale_tag, sep='-')
return locale
except babel.core.UnknownLocaleError:
return None
def get_offical_locales(
territory: str, languages=None, regional: bool = False, de_facto: bool = True
) -> Set[babel.Locale]:
"""Returns a list of :py:obj:`babel.Locale` with languages from
:py:obj:`babel.languages.get_official_languages`.
:param territory: The territory (country or region) code.
:param languages: A list of language codes the languages from
:py:obj:`babel.languages.get_official_languages` should be in
(intersection). If this argument is ``None``, all official languages in
this territory are used.
:param regional: If the regional flag is set, then languages which are
regionally official are also returned.
:param de_facto: If the de_facto flag is set to `False`, then languages
which are de facto official are not returned.
"""
ret_val = set()
o_languages = babel.languages.get_official_languages(territory, regional=regional, de_facto=de_facto)
if languages:
languages = [l.lower() for l in languages]
o_languages = set(l for l in o_languages if l.lower() in languages)
for lang in o_languages:
try:
locale = babel.Locale.parse(lang + '_' + territory)
ret_val.add(locale)
except babel.UnknownLocaleError:
continue
return ret_val
def get_engine_locale(searxng_locale, engine_locales, default=None):
"""Return engine's language (aka locale) string that best fits to argument
``searxng_locale``.
@ -177,6 +242,10 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
...
'pl-PL' : 'pl_PL',
'pt-PT' : 'pt_PT'
..
'zh' : 'zh'
'zh_Hans' : 'zh'
'zh_Hant' : 'zh-classical'
}
.. hint::
@ -210,13 +279,13 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
engine.
"""
# pylint: disable=too-many-branches
# pylint: disable=too-many-branches, too-many-return-statements
engine_locale = engine_locales.get(searxng_locale)
if engine_locale is not None:
# There was a 1:1 mapping (e.g. "fr-BE --> fr_BE" or "fr --> fr_FR"), no
# need to narrow language nor territory.
# There was a 1:1 mapping (e.g. a region "fr-BE --> fr_BE" or a language
# "zh --> zh"), no need to narrow language-script nor territory.
return engine_locale
try:
@ -227,6 +296,12 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
except babel.core.UnknownLocaleError:
return default
searxng_lang = language_tag(locale)
engine_locale = engine_locales.get(searxng_lang)
if engine_locale is not None:
# There was a 1:1 mapping (e.g. "zh-HK --> zh_Hant" or "zh-CN --> zh_Hans")
return engine_locale
# SearXNG's selected locale is not supported by the engine ..
if locale.territory:
@ -247,10 +322,6 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
if locale.language:
searxng_lang = locale.language
if locale.script:
searxng_lang += '_' + locale.script
terr_lang_dict = {}
for territory, langs in babel.core.get_global("territory_languages").items():
if not langs.get(searxng_lang, {}).get('official_status'):
@ -303,3 +374,98 @@ def get_engine_locale(searxng_locale, engine_locales, default=None):
engine_locale = default
return default
def match_locale(searxng_locale: str, locale_tag_list: List[str], fallback: Optional[str] = None) -> Optional[str]:
"""Return tag from ``locale_tag_list`` that best fits to ``searxng_locale``.
:param str searxng_locale: SearXNG's internal representation of locale (de,
de-DE, fr-BE, zh, zh-CN, zh-TW ..).
:param list locale_tag_list: The list of locale tags to select from
:param str fallback: fallback locale tag (if unset --> ``None``)
The rules to find a match are implemented in :py:obj:`get_engine_locale`,
the ``engine_locales`` is build up by :py:obj:`build_engine_locales`.
.. hint::
The *SearXNG locale* string and the members of ``locale_tag_list`` has to
be known by babel! The :py:obj:`ADDITIONAL_TRANSLATIONS` are used in the
UI and are not known by babel --> will be ignored.
"""
# searxng_locale = 'es'
# locale_tag_list = ['es-AR', 'es-ES', 'es-MX']
if not searxng_locale:
return fallback
locale = get_locale(searxng_locale)
if locale is None:
return fallback
# normalize to a SearXNG locale that can be passed to get_engine_locale
searxng_locale = language_tag(locale)
if locale.territory:
searxng_locale = region_tag(locale)
# clean up locale_tag_list
tag_list = []
for tag in locale_tag_list:
if tag in ('all', 'auto') or tag in ADDITIONAL_TRANSLATIONS:
continue
tag_list.append(tag)
# emulate fetch_traits
engine_locales = build_engine_locales(tag_list)
return get_engine_locale(searxng_locale, engine_locales, default=fallback)
def build_engine_locales(tag_list: List[str]):
"""From a list of locale tags a dictionary is build that can be passed by
argument ``engine_locales`` to :py:obj:`get_engine_locale`. This function
is mainly used by :py:obj:`match_locale` and is similar to what the
``fetch_traits(..)`` function of engines do.
If there are territory codes in the ``tag_list`` that have a *script code*
additional keys are added to the returned dictionary.
.. code:: python
>>> import locales
>>> engine_locales = locales.build_engine_locales(['en', 'en-US', 'zh', 'zh-CN', 'zh-TW'])
>>> engine_locales
{
'en': 'en', 'en-US': 'en-US',
'zh': 'zh', 'zh-CN': 'zh-CN', 'zh_Hans': 'zh-CN',
'zh-TW': 'zh-TW', 'zh_Hant': 'zh-TW'
}
>>> get_engine_locale('zh-Hans', engine_locales)
'zh-CN'
This function is a good example to understand the language/region model
of SearXNG:
SearXNG only distinguishes between **search languages** and **search
regions**, by adding the *script-tags*, languages with *script-tags* can
be assigned to the **regions** that SearXNG supports.
"""
engine_locales = {}
for tag in tag_list:
locale = get_locale(tag)
if locale is None:
logger.warn("build_engine_locales: skip locale tag %s / unknown by babel", tag)
continue
if locale.territory:
engine_locales[region_tag(locale)] = tag
if locale.script:
engine_locales[language_tag(locale)] = tag
else:
engine_locales[language_tag(locale)] = tag
return engine_locales

View File

@ -13,7 +13,7 @@ from typing import Iterable, Dict, List
import flask
from searx import settings, autocomplete
from searx.engines import Engine
from searx.enginelib import Engine
from searx.plugins import Plugin
from searx.locales import LOCALE_NAMES
from searx.webutils import VALID_LANGUAGE_CODE

View File

@ -4,7 +4,7 @@ from abc import abstractmethod, ABC
import re
from searx import settings
from searx.languages import language_codes
from searx.sxng_locales import sxng_locales
from searx.engines import categories, engines, engine_shortcuts
from searx.external_bang import get_bang_definition_and_autocomplete
from searx.search import EngineRef
@ -84,7 +84,7 @@ class LanguageParser(QueryPartParser):
found = False
# check if any language-code is equal with
# declared language-codes
for lc in language_codes:
for lc in sxng_locales:
lang_id, lang_name, country, english_name, _flag = map(str.lower, lc)
# if correct language-code is found
@ -125,7 +125,7 @@ class LanguageParser(QueryPartParser):
self.raw_text_query.autocomplete_list.append(lang)
return
for lc in language_codes:
for lc in sxng_locales:
if lc[0] not in settings['search']['languages']:
continue
lang_id, lang_name, country, english_name, _flag = map(str.lower, lc)

View File

@ -30,7 +30,10 @@ from .abstract import EngineProcessor
logger = logger.getChild('search.processors')
PROCESSORS: Dict[str, EngineProcessor] = {}
"""Cache request processores, stored by *engine-name* (:py:func:`initialize`)"""
"""Cache request processores, stored by *engine-name* (:py:func:`initialize`)
:meta hide-value:
"""
def get_processor_class(engine_type):

View File

@ -138,7 +138,8 @@ class EngineProcessor(ABC):
return False
def get_params(self, search_query, engine_category):
"""Returns a set of *request params* or ``None`` if request is not supported.
"""Returns a set of (see :ref:`request params <engine request arguments>`) or
``None`` if request is not supported.
Not supported conditions (``None`` is returned):
@ -159,11 +160,20 @@ class EngineProcessor(ABC):
params['safesearch'] = search_query.safesearch
params['time_range'] = search_query.time_range
params['engine_data'] = search_query.engine_data.get(self.engine_name, {})
params['searxng_locale'] = search_query.lang
# deprecated / vintage --> use params['searxng_locale']
#
# Conditions related to engine's traits are implemented in engine.traits
# module. Don't do 'locale' decissions here in the abstract layer of the
# search processor, just pass the value from user's choice unchanged to
# the engine request.
if hasattr(self.engine, 'language') and self.engine.language:
params['language'] = self.engine.language
else:
params['language'] = search_query.lang
return params
@abstractmethod

View File

@ -51,6 +51,9 @@ class OnlineProcessor(EngineProcessor):
super().initialize()
def get_params(self, search_query, engine_category):
"""Returns a set of :ref:`request params <engine request online>` or ``None``
if request is not supported.
"""
params = super().get_params(search_query, engine_category)
if params is None:
return None
@ -184,11 +187,6 @@ class OnlineProcessor(EngineProcessor):
self.handle_exception(result_container, e, suspend=True)
self.logger.exception('CAPTCHA')
except SearxEngineTooManyRequestsException as e:
if "google" in self.engine_name:
self.logger.warn(
"Set to 'true' the use_mobile_ui parameter in the 'engines:'"
" section of your settings.yml file if google is blocked for you."
)
self.handle_exception(result_container, e, suspend=True)
self.logger.exception('Too many requests')
except SearxEngineAccessDeniedException as e:
@ -223,7 +221,7 @@ class OnlineProcessor(EngineProcessor):
'test': ['unique_results'],
}
if getattr(self.engine, 'supported_languages', []):
if getattr(self.engine, 'traits', False):
tests['lang_fr'] = {
'matrix': {'query': 'paris', 'lang': 'fr'},
'result_container': ['not_empty', ('has_language', 'fr')],

View File

@ -38,8 +38,8 @@ class OnlineCurrencyProcessor(OnlineProcessor):
engine_type = 'online_currency'
def get_params(self, search_query, engine_category):
"""Returns a set of *request params* or ``None`` if search query does not match
to :py:obj:`parser_re`."""
"""Returns a set of :ref:`request params <engine request online_currency>`
or ``None`` if search query does not match to :py:obj:`parser_re`."""
params = super().get_params(search_query, engine_category)
if params is None:

View File

@ -18,8 +18,9 @@ class OnlineDictionaryProcessor(OnlineProcessor):
engine_type = 'online_dictionary'
def get_params(self, search_query, engine_category):
"""Returns a set of *request params* or ``None`` if search query does not match
to :py:obj:`parser_re`."""
"""Returns a set of :ref:`request params <engine request online_dictionary>` or
``None`` if search query does not match to :py:obj:`parser_re`.
"""
params = super().get_params(search_query, engine_category)
if params is None:
return None

View File

@ -20,9 +20,10 @@ class OnlineUrlSearchProcessor(OnlineProcessor):
engine_type = 'online_url_search'
def get_params(self, search_query, engine_category):
"""Returns a set of *request params* or ``None`` if search query does not match
to at least one of :py:obj:`re_search_urls`.
"""Returns a set of :ref:`request params <engine request online>` or ``None`` if
search query does not match to :py:obj:`re_search_urls`.
"""
params = super().get_params(search_query, engine_category)
if params is None:
return None

View File

@ -731,22 +731,9 @@ engines:
- name: google
engine: google
shortcut: go
# see https://docs.searxng.org/src/searx.engines.google.html#module-searx.engines.google
use_mobile_ui: false
# additional_tests:
# android: *test_android
# - name: google italian
# engine: google
# shortcut: goit
# use_mobile_ui: false
# language: it
# - name: google mobile ui
# engine: google
# shortcut: gomui
# use_mobile_ui: true
- name: google images
engine: google_images
shortcut: goi
@ -1758,9 +1745,8 @@ engines:
engine: peertube
shortcut: ptb
paging: true
# https://instances.joinpeertube.org/instances
base_url: https://peertube.biz/
# base_url: https://tube.tardis.world/
# alternatives see: https://instances.joinpeertube.org/instances
# base_url: https://tube.4aem.com
categories: videos
disabled: true
timeout: 6.0

View File

@ -12,13 +12,13 @@ import logging
from base64 import b64decode
from os.path import dirname, abspath
from searx.languages import language_codes as languages
from .sxng_locales import sxng_locales
searx_dir = abspath(dirname(__file__))
logger = logging.getLogger('searx')
OUTPUT_FORMATS = ['html', 'csv', 'json', 'rss']
LANGUAGE_CODES = ['all', 'auto'] + list(l[0] for l in languages)
SXNG_LOCALE_TAGS = ['all', 'auto'] + list(l[0] for l in sxng_locales)
SIMPLE_STYLE = ('auto', 'light', 'dark')
CATEGORIES_AS_TABS = {
'general': {},
@ -156,8 +156,8 @@ SCHEMA = {
'safe_search': SettingsValue((0, 1, 2), 0),
'autocomplete': SettingsValue(str, ''),
'autocomplete_min': SettingsValue(int, 4),
'default_lang': SettingsValue(tuple(LANGUAGE_CODES + ['']), ''),
'languages': SettingSublistValue(LANGUAGE_CODES, LANGUAGE_CODES),
'default_lang': SettingsValue(tuple(SXNG_LOCALE_TAGS + ['']), ''),
'languages': SettingSublistValue(SXNG_LOCALE_TAGS, SXNG_LOCALE_TAGS),
'ban_time_on_fail': SettingsValue(numbers.Real, 5),
'max_ban_time_on_fail': SettingsValue(numbers.Real, 120),
'suspended_times': {

View File

@ -1,73 +1,120 @@
# -*- coding: utf-8 -*-
# list of language codes
# this file is generated automatically by utils/fetch_languages.py
language_codes = (
('af-ZA', 'Afrikaans', 'Suid-Afrika', 'Afrikaans', '\U0001f1ff\U0001f1e6'),
('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'),
('be-BY', 'Беларуская', 'Беларусь', 'Belarusian', '\U0001f1e7\U0001f1fe'),
'''List of SearXNG's locale codes.
This file is generated automatically by::
./manage pyenv.cmd searxng_extra/update/update_engine_traits.py
'''
sxng_locales = (
('ar', 'العربية', '', 'Arabic', '\U0001f310'),
('bg', 'Български', '', 'Bulgarian', '\U0001f310'),
('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'),
('ca', 'Català', '', 'Catalan', '\U0001f310'),
('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'),
('cs', 'Čeština', '', 'Czech', '\U0001f310'),
('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'),
('da', 'Dansk', '', 'Danish', '\U0001f310'),
('da-DK', 'Dansk', 'Danmark', 'Danish', '\U0001f1e9\U0001f1f0'),
('de', 'Deutsch', '', 'German', '\U0001f310'),
('de-AT', 'Deutsch', 'Österreich', 'German', '\U0001f1e6\U0001f1f9'),
('de-CH', 'Deutsch', 'Schweiz', 'German', '\U0001f1e8\U0001f1ed'),
('de-DE', 'Deutsch', 'Deutschland', 'German', '\U0001f1e9\U0001f1ea'),
('el', 'Ελληνικά', '', 'Greek', '\U0001f310'),
('el-GR', 'Ελληνικά', 'Ελλάδα', 'Greek', '\U0001f1ec\U0001f1f7'),
('en', 'English', '', 'English', '\U0001f310'),
('en-AU', 'English', 'Australia', 'English', '\U0001f1e6\U0001f1fa'),
('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'),
('en-GB', 'English', 'United Kingdom', 'English', '\U0001f1ec\U0001f1e7'),
('en-IE', 'English', 'Ireland', 'English', '\U0001f1ee\U0001f1ea'),
('en-IN', 'English', 'India', 'English', '\U0001f1ee\U0001f1f3'),
('en-MY', 'English', 'Malaysia', 'English', '\U0001f1f2\U0001f1fe'),
('en-NZ', 'English', 'New Zealand', 'English', '\U0001f1f3\U0001f1ff'),
('en-PH', 'English', 'Philippines', 'English', '\U0001f1f5\U0001f1ed'),
('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
('en-ZA', 'English', 'South Africa', 'English', '\U0001f1ff\U0001f1e6'),
('es', 'Español', '', 'Spanish', '\U0001f310'),
('es-AR', 'Español', 'Argentina', 'Spanish', '\U0001f1e6\U0001f1f7'),
('es-CL', 'Español', 'Chile', 'Spanish', '\U0001f1e8\U0001f1f1'),
('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'),
('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'),
('es-US', 'Español', 'Estados Unidos', 'Spanish', '\U0001f1fa\U0001f1f8'),
('et', 'Eesti', '', 'Estonian', '\U0001f310'),
('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'),
('fa-IR', 'فارسی', 'ایران', 'Persian', '\U0001f1ee\U0001f1f7'),
('fi', 'Suomi', '', 'Finnish', '\U0001f310'),
('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'),
('fil-PH', 'Filipino', 'Pilipinas', 'Filipino', '\U0001f1f5\U0001f1ed'),
('fr', 'Français', '', 'French', '\U0001f310'),
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'),
('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'),
('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'),
('hi-IN', 'हिन्दी', 'भारत', 'Hindi', '\U0001f1ee\U0001f1f3'),
('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'),
('he', 'עברית', '', 'Hebrew', '\U0001f1ee\U0001f1f7'),
('hi', 'हिन्दी', '', 'Hindi', '\U0001f310'),
('hr', 'Hrvatski', '', 'Croatian', '\U0001f310'),
('hu', 'Magyar', '', 'Hungarian', '\U0001f310'),
('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'),
('id', 'Indonesia', '', 'Indonesian', '\U0001f310'),
('id-ID', 'Indonesia', 'Indonesia', 'Indonesian', '\U0001f1ee\U0001f1e9'),
('is-IS', 'Íslenska', 'Ísland', 'Icelandic', '\U0001f1ee\U0001f1f8'),
('is', 'Íslenska', '', 'Icelandic', '\U0001f310'),
('it', 'Italiano', '', 'Italian', '\U0001f310'),
('it-CH', 'Italiano', 'Svizzera', 'Italian', '\U0001f1e8\U0001f1ed'),
('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'),
('ja', '日本語', '', 'Japanese', '\U0001f310'),
('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'),
('ko', '한국어', '', 'Korean', '\U0001f310'),
('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'),
('lt-LT', 'Lietuvių', 'Lietuva', 'Lithuanian', '\U0001f1f1\U0001f1f9'),
('lv-LV', 'Latviešu', 'Latvija', 'Latvian', '\U0001f1f1\U0001f1fb'),
('lt', 'Lietuvių', '', 'Lithuanian', '\U0001f310'),
('lv', 'Latviešu', '', 'Latvian', '\U0001f310'),
('nb', 'Norsk Bokmål', '', 'Norwegian Bokmål', '\U0001f310'),
('nb-NO', 'Norsk Bokmål', 'Norge', 'Norwegian Bokmål', '\U0001f1f3\U0001f1f4'),
('nl', 'Nederlands', '', 'Dutch', '\U0001f310'),
('nl-BE', 'Nederlands', 'België', 'Dutch', '\U0001f1e7\U0001f1ea'),
('nl-NL', 'Nederlands', 'Nederland', 'Dutch', '\U0001f1f3\U0001f1f1'),
('no-NO', 'Norsk', '', 'Norwegian (Bokmål)', '\U0001f1f3\U0001f1f4'),
('pl', 'Polski', '', 'Polish', '\U0001f310'),
('pl-PL', 'Polski', 'Polska', 'Polish', '\U0001f1f5\U0001f1f1'),
('pt', 'Português', '', 'Portuguese', '\U0001f310'),
('pt-BR', 'Português', 'Brasil', 'Portuguese', '\U0001f1e7\U0001f1f7'),
('pt-PT', 'Português', 'Portugal', 'Portuguese', '\U0001f1f5\U0001f1f9'),
('ro', 'Română', '', 'Romanian', '\U0001f310'),
('ro-RO', 'Română', 'România', 'Romanian', '\U0001f1f7\U0001f1f4'),
('ru', 'Русский', '', 'Russian', '\U0001f310'),
('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'),
('sk-SK', 'Slovenčina', 'Slovensko', 'Slovak', '\U0001f1f8\U0001f1f0'),
('sl-SI', 'Slovenščina', 'Slovenija', 'Slovenian', '\U0001f1f8\U0001f1ee'),
('sr-RS', 'Српски', 'Србија', 'Serbian', '\U0001f1f7\U0001f1f8'),
('sk', 'Slovenčina', '', 'Slovak', '\U0001f310'),
('sl', 'Slovenščina', '', 'Slovenian', '\U0001f310'),
('sr', 'Српски', '', 'Serbian', '\U0001f310'),
('sv', 'Svenska', '', 'Swedish', '\U0001f310'),
('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'),
('sw-TZ', 'Kiswahili', 'Tanzania', 'Swahili', '\U0001f1f9\U0001f1ff'),
('th', 'ไทย', '', 'Thai', '\U0001f310'),
('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'),
('tr', 'Türkçe', '', 'Turkish', '\U0001f310'),
('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'),
('uk-UA', 'Українська', 'Україна', 'Ukrainian', '\U0001f1fa\U0001f1e6'),
('vi-VN', 'Tiếng Việt', 'Việt Nam', 'Vietnamese', '\U0001f1fb\U0001f1f3'),
('uk', 'Українська', '', 'Ukrainian', '\U0001f310'),
('vi', 'Tiếng Việt', '', 'Vietnamese', '\U0001f310'),
('zh', '中文', '', 'Chinese', '\U0001f310'),
('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'),
('zh-HK', '中文', '中國香港', 'Chinese', '\U0001f1ed\U0001f1f0'),
('zh-HK', '中文', '中國香港特別行政區', 'Chinese', '\U0001f1ed\U0001f1f0'),
('zh-TW', '中文', '台灣', 'Chinese', '\U0001f1f9\U0001f1fc'),
)
'''
A list of five-digit tuples:
0. SearXNG's internal locale tag (a language or region tag)
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
Empty string for language tags.
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
are represented by a globe (🌐)
.. code:: python
('en', 'English', '', 'English', '🌐'),
('en-CA', 'English', 'Canada', 'English', '🇨🇦'),
('en-US', 'English', 'United States', 'English', '🇺🇸'),
..
('fr', 'Français', '', 'French', '🌐'),
('fr-BE', 'Français', 'Belgique', 'French', '🇧🇪'),
('fr-CA', 'Français', 'Canada', 'French', '🇨🇦'),
:meta hide-value:
'''

View File

@ -1,12 +1,12 @@
<select class="language" id="language" name="language" aria-label="{{ _('Search language') }}">{{- '' -}}
<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option>
<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }} [all]</option>
<option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>
{{- _('Auto-detect') -}}
{%- if current_language == 'auto' %} ({{ search_language }}){%- endif -%}
</option>
{%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%}
<option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}>
{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %}
{%- for sxng_tag,lang_name,country_name,english_name,flag in sxng_locales | sort(attribute=1) -%}
<option value="{{ sxng_tag }}" {% if sxng_tag == current_language %}selected="selected"{% endif %}>
{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %} - {{ country_name }} {% endif %} [{{sxng_tag}}]
</option>
{%- endfor -%}
</select>

View File

@ -115,10 +115,10 @@
<legend id="pref_language">{{ _('Search language') }}</legend>
<p class="value">{{- '' -}}
<select name='language' aria-labelledby="pref_language" aria-describedby="desc_language">{{- '' -}}
<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }}</option>
<option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>{{ _('Auto-detect') }}</option>
{%- for lang_id,lang_name,country_name,english_name,flag in language_codes | sort(attribute=1) -%}
<option value="{{ lang_id }}" {% if lang_id == current_language %}selected="selected"{% endif %}>{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %}({{ country_name }}) {% endif %}</option>
<option value="all" {% if current_language == 'all' %}selected="selected"{% endif %}>{{ _('Default language') }} [all]</option>
<option value="auto" {% if current_language == 'auto' %}selected="selected"{% endif %}>{{ _('Auto-detect') }} [auto]</option>
{%- for sxng_tag,lang_name,country_name,english_name,flag in sxng_locales | sort(attribute=1) -%}
<option value="{{ sxng_tag }}" {% if sxng_tag == current_language %}selected="selected"{% endif %}>{% if flag %}{{ flag }} {% endif%} {{- lang_name }} {% if country_name %} - {{ country_name }} {% endif %} [{{sxng_tag}}]</option>
{%- endfor -%}
</select>{{- '' -}}
</p>

View File

@ -18,13 +18,11 @@ from urllib.parse import urljoin, urlparse
from lxml import html
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
from babel.core import get_global
from searx import settings
from searx.data import USER_AGENTS, data_dir
from searx.version import VERSION_TAG
from searx.languages import language_codes
from searx.sxng_locales import sxng_locales
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
from searx import logger
@ -53,8 +51,8 @@ _LANG_TO_LC_CACHE: Dict[str, Dict[str, str]] = {}
_FASTTEXT_MODEL: Optional["fasttext.FastText._FastText"] = None
"""fasttext model to predict laguage of a search term"""
SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in language_codes])
"""Languages supported by most searxng engines (:py:obj:`searx.languages.language_codes`)."""
SEARCH_LANGUAGE_CODES = frozenset([searxng_locale[0].split('-')[0] for searxng_locale in sxng_locales])
"""Languages supported by most searxng engines (:py:obj:`searx.sxng_locales.sxng_locales`)."""
class _NotSetClass: # pylint: disable=too-few-public-methods
@ -355,102 +353,16 @@ def is_valid_lang(lang) -> Optional[Tuple[bool, str, str]]:
is_abbr = len(lang) == 2
lang = lang.lower()
if is_abbr:
for l in language_codes:
for l in sxng_locales:
if l[0][:2] == lang:
return (True, l[0][:2], l[3].lower())
return None
for l in language_codes:
for l in sxng_locales:
if l[1].lower() == lang or l[3].lower() == lang:
return (True, l[0][:2], l[3].lower())
return None
def _get_lang_to_lc_dict(lang_list: List[str]) -> Dict[str, str]:
key = str(lang_list)
value = _LANG_TO_LC_CACHE.get(key, None)
if value is None:
value = {}
for lang in lang_list:
value.setdefault(lang.split('-')[0], lang)
_LANG_TO_LC_CACHE[key] = value
return value
# babel's get_global contains all sorts of miscellaneous locale and territory related data
# see get_global in: https://github.com/python-babel/babel/blob/master/babel/core.py
def _get_from_babel(lang_code: str, key):
match = get_global(key).get(lang_code.replace('-', '_'))
# for some keys, such as territory_aliases, match may be a list
if isinstance(match, str):
return match.replace('_', '-')
return match
def _match_language(lang_code: str, lang_list=[], custom_aliases={}) -> Optional[str]: # pylint: disable=W0102
"""auxiliary function to match lang_code in lang_list"""
# replace language code with a custom alias if necessary
if lang_code in custom_aliases:
lang_code = custom_aliases[lang_code]
if lang_code in lang_list:
return lang_code
# try to get the most likely country for this language
subtags = _get_from_babel(lang_code, 'likely_subtags')
if subtags:
if subtags in lang_list:
return subtags
subtag_parts = subtags.split('-')
new_code = subtag_parts[0] + '-' + subtag_parts[-1]
if new_code in custom_aliases:
new_code = custom_aliases[new_code]
if new_code in lang_list:
return new_code
# try to get the any supported country for this language
return _get_lang_to_lc_dict(lang_list).get(lang_code)
def match_language( # pylint: disable=W0102
locale_code, lang_list=[], custom_aliases={}, fallback: Optional[str] = 'en-US'
) -> Optional[str]:
"""get the language code from lang_list that best matches locale_code"""
# try to get language from given locale_code
language = _match_language(locale_code, lang_list, custom_aliases)
if language:
return language
locale_parts = locale_code.split('-')
lang_code = locale_parts[0]
# if locale_code has script, try matching without it
if len(locale_parts) > 2:
language = _match_language(lang_code + '-' + locale_parts[-1], lang_list, custom_aliases)
if language:
return language
# try to get language using an equivalent country code
if len(locale_parts) > 1:
country_alias = _get_from_babel(locale_parts[-1], 'territory_aliases')
if country_alias:
language = _match_language(lang_code + '-' + country_alias[0], lang_list, custom_aliases)
if language:
return language
# try to get language using an equivalent language code
alias = _get_from_babel(lang_code, 'language_aliases')
if alias:
language = _match_language(alias, lang_list, custom_aliases)
if language:
return language
if lang_code != locale_code:
# try to get language from given language without giving the country
language = _match_language(lang_code, lang_list, custom_aliases)
return language or fallback
def load_module(filename: str, module_dir: str) -> types.ModuleType:
modname = splitext(filename)[0]
modpath = join(module_dir, filename)

View File

@ -89,7 +89,6 @@ from searx.utils import (
html_to_text,
gen_useragent,
dict_subset,
match_language,
)
from searx.version import VERSION_STRING, GIT_URL, GIT_BRANCH
from searx.query import RawTextQuery
@ -117,12 +116,13 @@ from searx.locales import (
RTL_LOCALES,
localeselector,
locales_initialize,
match_locale,
)
# renaming names from searx imports ...
from searx.autocomplete import search_autocomplete, backends as autocomplete_backends
from searx.languages import language_codes as languages
from searx.redisdb import initialize as redis_initialize
from searx.sxng_locales import sxng_locales
from searx.search import SearchWithPlugins, initialize as search_initialize
from searx.network import stream as http_stream, set_context_network_name
from searx.search.checker import get_result as checker_get_result
@ -227,7 +227,7 @@ def _get_browser_language(req, lang_list):
if '-' in lang:
lang_parts = lang.split('-')
lang = "{}-{}".format(lang_parts[0], lang_parts[-1].upper())
locale = match_language(lang, lang_list, fallback=None)
locale = match_locale(lang, lang_list, fallback=None)
if locale is not None:
return locale
return 'en'
@ -407,7 +407,7 @@ def get_client_settings():
def render(template_name: str, **kwargs):
# pylint: disable=too-many-statements
kwargs['client_settings'] = str(
base64.b64encode(
bytes(
@ -438,17 +438,20 @@ def render(template_name: str, **kwargs):
kwargs['OTHER_CATEGORY'] = OTHER_CATEGORY
# i18n
kwargs['language_codes'] = [l for l in languages if l[0] in settings['search']['languages']]
kwargs['sxng_locales'] = [l for l in sxng_locales if l[0] in settings['search']['languages']]
locale = request.preferences.get_value('locale')
kwargs['locale_rfc5646'] = _get_locale_rfc5646(locale)
if locale in RTL_LOCALES and 'rtl' not in kwargs:
kwargs['rtl'] = True
if 'current_language' not in kwargs:
kwargs['current_language'] = match_language(
request.preferences.get_value('language'), settings['search']['languages']
)
_locale = request.preferences.get_value('language')
if _locale in ('auto', 'all'):
kwargs['current_language'] = _locale
else:
kwargs['current_language'] = match_locale(_locale, settings['search']['languages'])
# values from settings
kwargs['search_formats'] = [x for x in settings['search']['formats'] if x != 'html']
@ -810,6 +813,13 @@ def search():
)
)
if search_query.lang in ('auto', 'all'):
current_language = search_query.lang
else:
current_language = match_locale(
search_query.lang, settings['search']['languages'], fallback=request.preferences.get_value("language")
)
# search_query.lang contains the user choice (all, auto, en, ...)
# when the user choice is "auto", search.search_query.lang contains the detected language
# otherwise it is equals to search_query.lang
@ -832,12 +842,8 @@ def search():
result_container.unresponsive_engines
),
current_locale = request.preferences.get_value("locale"),
current_language = match_language(
search_query.lang,
settings['search']['languages'],
fallback=request.preferences.get_value("language")
),
search_language = match_language(
current_language = current_language,
search_language = match_locale(
search.search_query.lang,
settings['search']['languages'],
fallback=request.preferences.get_value("language")
@ -907,16 +913,11 @@ def autocompleter():
# and there is a query part
if len(raw_text_query.autocomplete_list) == 0 and len(sug_prefix) > 0:
# get language from cookie
language = request.preferences.get_value('language')
if not language or language == 'all':
language = 'en'
else:
language = language.split('-')[0]
# get SearXNG's locale and autocomplete backend from cookie
sxng_locale = request.preferences.get_value('language')
backend_name = request.preferences.get_value('autocomplete')
# run autocompletion
raw_results = search_autocomplete(request.preferences.get_value('autocomplete'), sug_prefix, language)
for result in raw_results:
for result in search_autocomplete(backend_name, sug_prefix, sxng_locale):
# attention: this loop will change raw_text_query object and this is
# the reason why the sug_prefix was stored before (see above)
if result != sug_prefix:
@ -1001,7 +1002,9 @@ def preferences():
'rate80': rate80,
'rate95': rate95,
'warn_timeout': e.timeout > settings['outgoing']['request_timeout'],
'supports_selected_language': _is_selected_language_supported(e, request.preferences),
'supports_selected_language': e.traits.is_locale_supported(
str(request.preferences.get_value('language') or 'all')
),
'result_count': result_count,
}
# end of stats
@ -1052,7 +1055,9 @@ def preferences():
# supports
supports = {}
for _, e in filtered_engines.items():
supports_selected_language = _is_selected_language_supported(e, request.preferences)
supports_selected_language = e.traits.is_locale_supported(
str(request.preferences.get_value('language') or 'all')
)
safesearch = e.safesearch
time_range_support = e.time_range_support
for checker_test_name in checker_results.get(e.name, {}).get('errors', {}):
@ -1099,16 +1104,6 @@ def preferences():
)
def _is_selected_language_supported(engine, preferences: Preferences): # pylint: disable=redefined-outer-name
language = preferences.get_value('language')
if language == 'all':
return True
x = match_language(
language, getattr(engine, 'supported_languages', []), getattr(engine, 'language_aliases', {}), None
)
return bool(x)
@app.route('/image_proxy', methods=['GET'])
def image_proxy():
# pylint: disable=too-many-return-statements, too-many-branches
@ -1327,10 +1322,7 @@ def config():
if not request.preferences.validate_token(engine):
continue
supported_languages = engine.supported_languages
if isinstance(engine.supported_languages, dict):
supported_languages = list(engine.supported_languages.keys())
_languages = engine.traits.languages.keys()
_engines.append(
{
'name': name,
@ -1339,7 +1331,8 @@ def config():
'enabled': not engine.disabled,
'paging': engine.paging,
'language_support': engine.language_support,
'supported_languages': supported_languages,
'languages': list(_languages),
'regions': list(engine.traits.regions.keys()),
'safesearch': engine.safesearch,
'time_range_support': engine.time_range_support,
'timeout': engine.timeout,

View File

@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
import os
import pathlib
import csv
@ -8,7 +10,7 @@ import re
import inspect
import itertools
from datetime import datetime, timedelta
from typing import Iterable, List, Tuple, Dict
from typing import Iterable, List, Tuple, Dict, TYPE_CHECKING
from io import StringIO
from codecs import getincrementalencoder
@ -16,7 +18,10 @@ from codecs import getincrementalencoder
from flask_babel import gettext, format_date
from searx import logger, settings
from searx.engines import Engine, OTHER_CATEGORY
from searx.engines import OTHER_CATEGORY
if TYPE_CHECKING:
from searx.enginelib import Engine
VALID_LANGUAGE_CODE = re.compile(r'^[a-z]{2,3}(-[a-zA-Z]{2})?$')

View File

@ -18,8 +18,8 @@ from os.path import join
from lxml.html import fromstring
from searx.engines import wikidata, set_loggers
from searx.utils import extract_text, match_language
from searx.locales import LOCALE_NAMES, locales_initialize
from searx.utils import extract_text
from searx.locales import LOCALE_NAMES, locales_initialize, match_locale
from searx import searx_dir
from searx.utils import gen_useragent, detect_language
import searx.search
@ -225,9 +225,9 @@ def fetch_website_description(engine_name, website):
fetched_lang, desc = get_website_description(website, lang, WIKIPEDIA_LANGUAGES[lang])
if fetched_lang is None or desc is None:
continue
matched_lang = match_language(fetched_lang, LANGUAGES, fallback=None)
matched_lang = match_locale(fetched_lang, LANGUAGES, fallback=None)
if matched_lang is None:
fetched_wikipedia_lang = match_language(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None)
fetched_wikipedia_lang = match_locale(fetched_lang, WIKIPEDIA_LANGUAGES.values(), fallback=None)
matched_lang = wikipedia_languages_r.get(fetched_wikipedia_lang)
if matched_lang is not None:
update_description(engine_name, matched_lang, desc, website, replace=False)

View File

@ -0,0 +1,198 @@
#!/usr/bin/env python
# lint: pylint
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Update :py:obj:`searx.enginelib.traits.EngineTraitsMap` and :origin:`searx/languages.py`
:py:obj:`searx.enginelib.traits.EngineTraitsMap.ENGINE_TRAITS_FILE`:
Persistence of engines traits, fetched from the engines.
:origin:`searx/languages.py`
Is generated from intersecting each engine's supported traits.
The script :origin:`searxng_extra/update/update_engine_traits.py` is called in
the :origin:`CI Update data ... <.github/workflows/data-update.yml>`
"""
# pylint: disable=invalid-name
from unicodedata import lookup
from pathlib import Path
from pprint import pformat
import babel
from searx import settings, searx_dir
from searx import network
from searx.engines import load_engines
from searx.enginelib.traits import EngineTraitsMap
# Output files.
languages_file = Path(searx_dir) / 'sxng_locales.py'
languages_file_header = """\
# -*- coding: utf-8 -*-
'''List of SearXNG's locale codes.
This file is generated automatically by::
./manage pyenv.cmd searxng_extra/update/update_engine_traits.py
'''
sxng_locales = (
"""
languages_file_footer = """,
)
'''
A list of five-digit tuples:
0. SearXNG's internal locale tag (a language or region tag)
1. Name of the language (:py:obj:`babel.core.Locale.get_language_name`)
2. For region tags the name of the region (:py:obj:`babel.core.Locale.get_territory_name`).
Empty string for language tags.
3. English language name (from :py:obj:`babel.core.Locale.english_name`)
4. Unicode flag (emoji) that fits to SearXNG's internal region tag. Languages
are represented by a globe (\U0001F310)
.. code:: python
('en', 'English', '', 'English', '\U0001f310'),
('en-CA', 'English', 'Canada', 'English', '\U0001f1e8\U0001f1e6'),
('en-US', 'English', 'United States', 'English', '\U0001f1fa\U0001f1f8'),
..
('fr', 'Français', '', 'French', '\U0001f310'),
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
:meta hide-value:
'''
"""
lang2emoji = {
'ha': '\U0001F1F3\U0001F1EA', # Hausa / Niger
'bs': '\U0001F1E7\U0001F1E6', # Bosnian / Bosnia & Herzegovina
'jp': '\U0001F1EF\U0001F1F5', # Japanese
'ua': '\U0001F1FA\U0001F1E6', # Ukrainian
'he': '\U0001F1EE\U0001F1F7', # Hebrew
}
def main():
load_engines(settings['engines'])
# traits_map = EngineTraitsMap.from_data()
traits_map = fetch_traits_map()
sxng_tag_list = filter_locales(traits_map)
write_languages_file(sxng_tag_list)
def fetch_traits_map():
"""Fetchs supported languages for each engine and writes json file with those."""
network.set_timeout_for_thread(10.0)
def log(msg):
print(msg)
traits_map = EngineTraitsMap.fetch_traits(log=log)
print("fetched properties from %s engines" % len(traits_map))
print("write json file: %s" % traits_map.ENGINE_TRAITS_FILE)
traits_map.save_data()
return traits_map
def filter_locales(traits_map: EngineTraitsMap):
"""Filter language & region tags by a threshold."""
min_eng_per_region = 11
min_eng_per_lang = 13
_ = {}
for eng in traits_map.values():
for reg in eng.regions.keys():
_[reg] = _.get(reg, 0) + 1
regions = set(k for k, v in _.items() if v >= min_eng_per_region)
lang_from_region = set(k.split('-')[0] for k in regions)
_ = {}
for eng in traits_map.values():
for lang in eng.languages.keys():
# ignore script types like zh_Hant, zh_Hans or sr_Latin, pa_Arab (they
# already counted by existence of 'zh' or 'sr', 'pa')
if '_' in lang:
# print("ignore %s" % lang)
continue
_[lang] = _.get(lang, 0) + 1
languages = set(k for k, v in _.items() if v >= min_eng_per_lang)
sxng_tag_list = set()
sxng_tag_list.update(regions)
sxng_tag_list.update(lang_from_region)
sxng_tag_list.update(languages)
return sxng_tag_list
def write_languages_file(sxng_tag_list):
language_codes = []
for sxng_tag in sorted(sxng_tag_list):
sxng_locale: babel.Locale = babel.Locale.parse(sxng_tag, sep='-')
flag = get_unicode_flag(sxng_locale) or ''
item = (
sxng_tag,
sxng_locale.get_language_name().title(),
sxng_locale.get_territory_name() or '',
sxng_locale.english_name.split(' (')[0],
UnicodeEscape(flag),
)
language_codes.append(item)
language_codes = tuple(language_codes)
with open(languages_file, 'w', encoding='utf-8') as new_file:
file_content = "{header} {language_codes}{footer}".format(
header=languages_file_header,
language_codes=pformat(language_codes, width=120, indent=4)[1:-1],
footer=languages_file_footer,
)
new_file.write(file_content)
new_file.close()
class UnicodeEscape(str):
"""Escape unicode string in :py:obj:`pprint.pformat`"""
def __repr__(self):
return "'" + "".join([chr(c) for c in self.encode('unicode-escape')]) + "'"
def get_unicode_flag(locale: babel.Locale):
"""Determine a unicode flag (emoji) that fits to the ``locale``"""
emoji = lang2emoji.get(locale.language)
if emoji:
return emoji
if not locale.territory:
return '\U0001F310'
emoji = lang2emoji.get(locale.territory.lower())
if emoji:
return emoji
try:
c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[0])
c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + locale.territory[1])
# print("OK : %s --> %s%s" % (locale, c1, c2))
except KeyError as exc:
print("ERROR: %s --> %s" % (locale, exc))
return None
return c1 + c2
if __name__ == "__main__":
main()

View File

@ -1,313 +0,0 @@
#!/usr/bin/env python
# lint: pylint
# SPDX-License-Identifier: AGPL-3.0-or-later
"""This script generates languages.py from intersecting each engine's supported
languages.
Output files: :origin:`searx/data/engines_languages.json` and
:origin:`searx/languages.py` (:origin:`CI Update data ...
<.github/workflows/data-update.yml>`).
"""
# pylint: disable=invalid-name
from unicodedata import lookup
import json
from pathlib import Path
from pprint import pformat
from babel import Locale, UnknownLocaleError
from babel.languages import get_global
from babel.core import parse_locale
from searx import settings, searx_dir
from searx.engines import load_engines, engines
from searx.network import set_timeout_for_thread
# Output files.
engines_languages_file = Path(searx_dir) / 'data' / 'engines_languages.json'
languages_file = Path(searx_dir) / 'languages.py'
# Fetches supported languages for each engine and writes json file with those.
def fetch_supported_languages():
set_timeout_for_thread(10.0)
engines_languages = {}
names = list(engines)
names.sort()
for engine_name in names:
if hasattr(engines[engine_name], 'fetch_supported_languages'):
engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
print("fetched %s languages from engine %s" % (len(engines_languages[engine_name]), engine_name))
if type(engines_languages[engine_name]) == list: # pylint: disable=unidiomatic-typecheck
engines_languages[engine_name] = sorted(engines_languages[engine_name])
print("fetched languages from %s engines" % len(engines_languages))
# write json file
with open(engines_languages_file, 'w', encoding='utf-8') as f:
json.dump(engines_languages, f, indent=2, sort_keys=True)
return engines_languages
# Get babel Locale object from lang_code if possible.
def get_locale(lang_code):
try:
locale = Locale.parse(lang_code, sep='-')
return locale
except (UnknownLocaleError, ValueError):
return None
lang2emoji = {
'ha': '\U0001F1F3\U0001F1EA', # Hausa / Niger
'bs': '\U0001F1E7\U0001F1E6', # Bosnian / Bosnia & Herzegovina
'jp': '\U0001F1EF\U0001F1F5', # Japanese
'ua': '\U0001F1FA\U0001F1E6', # Ukrainian
'he': '\U0001F1EE\U0001F1F7', # Hebrew
}
def get_unicode_flag(lang_code):
"""Determine a unicode flag (emoji) that fits to the ``lang_code``"""
emoji = lang2emoji.get(lang_code.lower())
if emoji:
return emoji
if len(lang_code) == 2:
return '\U0001F310'
language = territory = script = variant = ''
try:
language, territory, script, variant = parse_locale(lang_code, '-')
except ValueError as exc:
print(exc)
# https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
if not territory:
# https://www.unicode.org/emoji/charts/emoji-list.html#country-flag
emoji = lang2emoji.get(language)
if not emoji:
print(
"%s --> language: %s / territory: %s / script: %s / variant: %s"
% (lang_code, language, territory, script, variant)
)
return emoji
emoji = lang2emoji.get(territory.lower())
if emoji:
return emoji
try:
c1 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + territory[0])
c2 = lookup('REGIONAL INDICATOR SYMBOL LETTER ' + territory[1])
# print("%s --> territory: %s --> %s%s" %(lang_code, territory, c1, c2 ))
except KeyError as exc:
print("%s --> territory: %s --> %s" % (lang_code, territory, exc))
return None
return c1 + c2
def get_territory_name(lang_code):
country_name = None
locale = get_locale(lang_code)
try:
if locale is not None:
country_name = locale.get_territory_name()
except FileNotFoundError as exc:
print("ERROR: %s --> %s" % (locale, exc))
return country_name
# Join all language lists.
def join_language_lists(engines_languages):
language_list = {}
for engine_name in engines_languages:
for lang_code in engines_languages[engine_name]:
# apply custom fixes if necessary
if lang_code in getattr(engines[engine_name], 'language_aliases', {}).values():
lang_code = next(
lc for lc, alias in engines[engine_name].language_aliases.items() if lang_code == alias
)
locale = get_locale(lang_code)
# ensure that lang_code uses standard language and country codes
if locale and locale.territory:
lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory)
short_code = lang_code.split('-')[0]
# add language without country if not in list
if short_code not in language_list:
if locale:
# get language's data from babel's Locale object
language_name = locale.get_language_name().title()
english_name = locale.english_name.split(' (')[0]
elif short_code in engines_languages['wikipedia']:
# get language's data from wikipedia if not known by babel
language_name = engines_languages['wikipedia'][short_code]['name']
english_name = engines_languages['wikipedia'][short_code]['english_name']
else:
language_name = None
english_name = None
# add language to list
language_list[short_code] = {
'name': language_name,
'english_name': english_name,
'counter': set(),
'countries': {},
}
# add language with country if not in list
if lang_code != short_code and lang_code not in language_list[short_code]['countries']:
country_name = ''
if locale:
# get country name from babel's Locale object
try:
country_name = locale.get_territory_name()
except FileNotFoundError as exc:
print("ERROR: %s --> %s" % (locale, exc))
locale = None
language_list[short_code]['countries'][lang_code] = {
'country_name': country_name,
'counter': set(),
}
# count engine for both language_country combination and language alone
language_list[short_code]['counter'].add(engine_name)
if lang_code != short_code:
language_list[short_code]['countries'][lang_code]['counter'].add(engine_name)
return language_list
# Filter language list so it only includes the most supported languages and countries
def filter_language_list(all_languages):
min_engines_per_lang = 12
min_engines_per_country = 7
# pylint: disable=consider-using-dict-items, consider-iterating-dictionary
main_engines = [
engine_name
for engine_name in engines.keys()
if 'general' in engines[engine_name].categories
and engines[engine_name].supported_languages
and not engines[engine_name].disabled
]
# filter list to include only languages supported by most engines or all default general engines
filtered_languages = {
code: lang
for code, lang in all_languages.items()
if (
len(lang['counter']) >= min_engines_per_lang
or all(main_engine in lang['counter'] for main_engine in main_engines)
)
}
def _copy_lang_data(lang, country_name=None):
new_dict = {}
new_dict['name'] = all_languages[lang]['name']
new_dict['english_name'] = all_languages[lang]['english_name']
if country_name:
new_dict['country_name'] = country_name
return new_dict
# for each language get country codes supported by most engines or at least one country code
filtered_languages_with_countries = {}
for lang, lang_data in filtered_languages.items():
countries = lang_data['countries']
filtered_countries = {}
# get language's country codes with enough supported engines
for lang_country, country_data in countries.items():
if len(country_data['counter']) >= min_engines_per_country:
filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name'])
# add language without countries too if there's more than one country to choose from
if len(filtered_countries) > 1:
filtered_countries[lang] = _copy_lang_data(lang, None)
elif len(filtered_countries) == 1:
lang_country = next(iter(filtered_countries))
# if no country has enough engines try to get most likely country code from babel
if not filtered_countries:
lang_country = None
subtags = get_global('likely_subtags').get(lang)
if subtags:
country_code = subtags.split('_')[-1]
if len(country_code) == 2:
lang_country = "{lang}-{country}".format(lang=lang, country=country_code)
if lang_country:
filtered_countries[lang_country] = _copy_lang_data(lang, None)
else:
filtered_countries[lang] = _copy_lang_data(lang, None)
filtered_languages_with_countries.update(filtered_countries)
return filtered_languages_with_countries
class UnicodeEscape(str):
"""Escape unicode string in :py:obj:`pprint.pformat`"""
def __repr__(self):
return "'" + "".join([chr(c) for c in self.encode('unicode-escape')]) + "'"
# Write languages.py.
def write_languages_file(languages):
file_headers = (
"# -*- coding: utf-8 -*-",
"# list of language codes",
"# this file is generated automatically by utils/fetch_languages.py",
"language_codes = (\n",
)
language_codes = []
for code in sorted(languages):
name = languages[code]['name']
if name is None:
print("ERROR: languages['%s'] --> %s" % (code, languages[code]))
continue
flag = get_unicode_flag(code) or ''
item = (
code,
languages[code]['name'].split(' (')[0],
get_territory_name(code) or '',
languages[code].get('english_name') or '',
UnicodeEscape(flag),
)
language_codes.append(item)
language_codes = tuple(language_codes)
with open(languages_file, 'w', encoding='utf-8') as new_file:
file_content = "{file_headers} {language_codes},\n)\n".format(
# fmt: off
file_headers = '\n'.join(file_headers),
language_codes = pformat(language_codes, indent=4)[1:-1]
# fmt: on
)
new_file.write(file_content)
new_file.close()
if __name__ == "__main__":
load_engines(settings['engines'])
_engines_languages = fetch_supported_languages()
_all_languages = join_language_lists(_engines_languages)
_filtered_languages = filter_language_list(_all_languages)
write_languages_file(_filtered_languages)

View File

@ -50,7 +50,7 @@ from pathlib import Path
from searx import searx_dir
from searx.network import set_timeout_for_thread
from searx.engines import wikidata, set_loggers
from searx.languages import language_codes
from searx.sxng_locales import sxng_locales
from searx.engines.openstreetmap import get_key_rank, VALUE_TO_LINK
set_loggers(wikidata, 'wikidata')
@ -76,7 +76,7 @@ GROUP BY ?key ?item ?itemLabel
ORDER BY ?key ?item ?itemLabel
"""
LANGUAGES = [l[0].lower() for l in language_codes]
LANGUAGES = [l[0].lower() for l in sxng_locales]
PRESET_KEYS = {
('wikidata',): {'en': 'Wikidata'},

111
tests/unit/test_locales.py Normal file
View File

@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Test some code from module :py:obj:`searx.locales`"""
from searx import locales
from searx.sxng_locales import sxng_locales
from tests import SearxTestCase
class TestLocales(SearxTestCase):
"""Implemented tests:
- :py:obj:`searx.locales.match_locale`
"""
def test_match_locale(self):
locale_tag_list = [x[0] for x in sxng_locales]
# Test SearXNG search languages
self.assertEqual(locales.match_locale('de', locale_tag_list), 'de')
self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr')
self.assertEqual(locales.match_locale('zh', locale_tag_list), 'zh')
# Test SearXNG search regions
self.assertEqual(locales.match_locale('ca-es', locale_tag_list), 'ca-ES')
self.assertEqual(locales.match_locale('de-at', locale_tag_list), 'de-AT')
self.assertEqual(locales.match_locale('de-de', locale_tag_list), 'de-DE')
self.assertEqual(locales.match_locale('en-UK', locale_tag_list), 'en-GB')
self.assertEqual(locales.match_locale('fr-be', locale_tag_list), 'fr-BE')
self.assertEqual(locales.match_locale('fr-be', locale_tag_list), 'fr-BE')
self.assertEqual(locales.match_locale('fr-ca', locale_tag_list), 'fr-CA')
self.assertEqual(locales.match_locale('fr-ch', locale_tag_list), 'fr-CH')
self.assertEqual(locales.match_locale('zh-cn', locale_tag_list), 'zh-CN')
self.assertEqual(locales.match_locale('zh-tw', locale_tag_list), 'zh-TW')
self.assertEqual(locales.match_locale('zh-hk', locale_tag_list), 'zh-HK')
# Test language script code
self.assertEqual(locales.match_locale('zh-hans', locale_tag_list), 'zh-CN')
self.assertEqual(locales.match_locale('zh-hans-cn', locale_tag_list), 'zh-CN')
self.assertEqual(locales.match_locale('zh-hant', locale_tag_list), 'zh-TW')
self.assertEqual(locales.match_locale('zh-hant-tw', locale_tag_list), 'zh-TW')
# Test individual locale lists
self.assertEqual(locales.match_locale('es', [], fallback='fallback'), 'fallback')
self.assertEqual(locales.match_locale('de', ['de-CH', 'de-DE']), 'de-DE')
self.assertEqual(locales.match_locale('de', ['de-CH', 'de-DE']), 'de-DE')
self.assertEqual(locales.match_locale('es', ['ES']), 'ES')
self.assertEqual(locales.match_locale('es', ['es-AR', 'es-ES', 'es-MX']), 'es-ES')
self.assertEqual(locales.match_locale('es-AR', ['es-AR', 'es-ES', 'es-MX']), 'es-AR')
self.assertEqual(locales.match_locale('es-CO', ['es-AR', 'es-ES']), 'es-ES')
self.assertEqual(locales.match_locale('es-CO', ['es-AR']), 'es-AR')
# Tests from the commit message of 9ae409a05a
# Assumption:
# A. When a user selects a language the results should be optimized according to
# the selected language.
#
# B. When user selects a language and a territory the results should be
# optimized with first priority on territory and second on language.
# Assume we have an engine that supports the follwoing locales:
locale_tag_list = ['zh-CN', 'zh-HK', 'nl-BE', 'fr-CA']
# Examples (Assumption A.)
# ------------------------
# A user selects region 'zh-TW' which should end in zh_HK.
# hint: CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')
self.assertEqual(locales.match_locale('zh-TW', locale_tag_list), 'zh-HK')
# A user selects only the language 'zh' which should end in CN
self.assertEqual(locales.match_locale('zh', locale_tag_list), 'zh-CN')
# A user selects only the language 'fr' which should end in fr_CA
self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-CA')
# The difference in priority on the territory is best shown with a
# engine that supports the following locales:
locale_tag_list = ['fr-FR', 'fr-CA', 'en-GB', 'nl-BE']
# A user selects only a language
self.assertEqual(locales.match_locale('en', locale_tag_list), 'en-GB')
# hint: the engine supports fr_FR and fr_CA since no territory is given,
# fr_FR takes priority ..
self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-FR')
# Examples (Assumption B.)
# ------------------------
# A user selects region 'fr-BE' which should end in nl-BE
self.assertEqual(locales.match_locale('fr-BE', locale_tag_list), 'nl-BE')
# If the user selects a language and there are two locales like the
# following:
locale_tag_list = ['fr-BE', 'fr-CH']
# The get_engine_locale selects the locale by looking at the "population
# percent" and this percentage has an higher amount in BE (68.%)
# compared to CH (21%)
self.assertEqual(locales.match_locale('fr', locale_tag_list), 'fr-BE')

View File

@ -87,39 +87,6 @@ class TestUtils(SearxTestCase):
html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'
self.assertEqual(utils.html_to_text(html), "Lorem ipsum")
def test_match_language(self):
self.assertEqual(utils.match_language('es', ['es']), 'es')
self.assertEqual(utils.match_language('es', [], fallback='fallback'), 'fallback')
self.assertEqual(utils.match_language('ja', ['jp'], {'ja': 'jp'}), 'jp')
# handle script tags
self.assertEqual(utils.match_language('zh-CN', ['zh-Hans-CN', 'zh-Hant-TW']), 'zh-Hans-CN')
self.assertEqual(utils.match_language('zh-TW', ['zh-Hans-CN', 'zh-Hant-TW']), 'zh-Hant-TW')
self.assertEqual(utils.match_language('zh-Hans-CN', ['zh-CN', 'zh-TW']), 'zh-CN')
self.assertEqual(utils.match_language('zh-Hant-TW', ['zh-CN', 'zh-TW']), 'zh-TW')
self.assertEqual(utils.match_language('zh-Hans', ['zh-CN', 'zh-TW', 'zh-HK']), 'zh-CN')
self.assertEqual(utils.match_language('zh-Hant', ['zh-CN', 'zh-TW', 'zh-HK']), 'zh-TW')
aliases = {'en-GB': 'en-UK', 'he': 'iw'}
# guess country
self.assertEqual(utils.match_language('de-DE', ['de']), 'de')
self.assertEqual(utils.match_language('de', ['de-DE']), 'de-DE')
self.assertEqual(utils.match_language('es-CO', ['es-AR', 'es-ES', 'es-MX']), 'es-ES')
self.assertEqual(utils.match_language('es-CO', ['es-MX']), 'es-MX')
self.assertEqual(utils.match_language('en-UK', ['en-AU', 'en-GB', 'en-US']), 'en-GB')
self.assertEqual(utils.match_language('en-GB', ['en-AU', 'en-UK', 'en-US'], aliases), 'en-UK')
# language aliases
self.assertEqual(utils.match_language('iw', ['he']), 'he')
self.assertEqual(utils.match_language('he', ['iw'], aliases), 'iw')
self.assertEqual(utils.match_language('iw-IL', ['he']), 'he')
self.assertEqual(utils.match_language('he-IL', ['iw'], aliases), 'iw')
self.assertEqual(utils.match_language('iw', ['he-IL']), 'he-IL')
self.assertEqual(utils.match_language('he', ['iw-IL'], aliases), 'iw-IL')
self.assertEqual(utils.match_language('iw-IL', ['he-IL']), 'he-IL')
self.assertEqual(utils.match_language('he-IL', ['iw-IL'], aliases), 'iw-IL')
def test_ecma_unscape(self):
self.assertEqual(utils.ecma_unescape('text%20with%20space'), 'text with space')
self.assertEqual(utils.ecma_unescape('text using %xx: %F3'), 'text using %xx: ó')

View File

@ -52,9 +52,6 @@ enabled_plugins:
engines:
- name: google
use_mobile_ui: true
# - name: fdroid
# disabled: false
#