searxngRebrandZaclys/searx/engines/bing.py

# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Bing (Web)

- https://github.com/searx/searx/issues/2019#issuecomment-648227442
"""
# pylint: disable=too-many-branches

import re
from urllib.parse import urlencode, urlparse, parse_qs
from lxml import html
from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language, eval_xpath_getindex
from searx.network import multi_requests, Request

from searx.enginelib.traits import EngineTraits

traits: EngineTraits

# Engine metadata: results are scraped from Bing's HTML pages; neither the
# official API nor an API key is used.
about = {
    "website": 'https://www.bing.com',
    "wikidata_id": 'Q182496',
    "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['general', 'web']
paging = True
time_range_support = False
safesearch = False
send_accept_language_header = True
# Same page that fetch_traits() downloads to read the language / region lists.
supported_languages_url = 'https://www.bing.com/account/general'
# Passed to match_language() in request(); empty, i.e. no aliases are defined.
language_aliases = {}

# search-url
base_url = 'https://www.bing.com/'

# URL template of the first result page; request() also uses it to build the
# Referer header of follow-up pages.  NOTE: the name is misspelled ("inital"),
# but request() refers to it under exactly this name.
# initial query:     https://www.bing.com/search?q=foo&search=&form=QBLH
inital_query = 'search?{query}&search=&form=QBLH'

# URL template of every later page; {offset} is the index of the first result.
# following queries: https://www.bing.com/search?q=foo&search=&first=11&FORM=PERE
page_query = 'search?{query}&search=&first={offset}&FORM=PERE'


def _get_offset_from_pageno(pageno):
    return (pageno - 1) * 10 + 1


def request(query, params):
    """Fill ``params`` with the URL and headers of the Bing search request.

    The query is prefixed with a ``language:XX`` filter (``EN`` when the
    selected language is ``all``).  The first page uses ``inital_query``;
    other pages use ``page_query`` with the result offset, and pages with an
    offset greater than 1 additionally get a ``Referer`` header pointing at
    the first page of the same query.

    :param query: search terms entered by the user
    :param params: request parameters; ``language`` and ``headers`` are
        required, ``pageno`` defaults to 1
    :return: the same ``params`` dict, updated in place
    """
    first_result = _get_offset_from_pageno(params.get('pageno', 1))
    url_template = inital_query if first_result == 1 else page_query

    language = params['language']
    if language != 'all':
        lang = match_language(language, supported_languages, language_aliases)
    else:
        lang = 'EN'

    # only the primary subtag is sent to Bing: 'en-US' --> 'EN'
    primary_subtag = lang.split('-')[0].upper()
    query = 'language:{} {}'.format(primary_subtag, query)
    encoded_query = urlencode({'q': query})

    search_path = url_template.format(query=encoded_query, offset=first_result)

    if first_result > 1:
        # follow-up pages claim to come from the first result page
        referer = base_url + inital_query.format(query=encoded_query)
        params['headers']['Referer'] = referer
        logger.debug("headers.Referer --> %s", referer)

    params['url'] = base_url + search_path
    params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    return params


def response(resp):
    """Parse a Bing result page into a list of result dicts.

    Every ``li.b_algo`` item of the ``ol#b_results`` list that has a heading
    link with an ``href`` becomes ``{'url', 'title', 'content'}``.  Bing
    redirect links (``https://www.bing.com/ck/a?...``) are replaced by the URL
    shown in the ``<cite>`` element when that URL is complete, otherwise they
    are resolved in parallel with one extra HTTP request each.  If resolving
    fails (exception, or no ``Location`` header in the answer) the result
    keeps ``url = None``.

    :param resp: HTTP response of the search request; ``resp.text`` and
        ``resp.search_params`` are read
    :return: list of result dicts followed by one ``{'number_of_results': n}``
        entry, or an empty list when the requested offset lies beyond the
        number of results reported by Bing
    """
    results = []
    result_len = 0

    dom = html.fromstring(resp.text)

    # Bing redirect URLs that need an extra HTTP request, and for each of them
    # the position in ``results`` of the entry it belongs to.
    url_to_resolve = []
    url_to_resolve_index = []

    for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):

        link = eval_xpath_getindex(result, './/h2/a', 0, None)
        if link is None:
            continue
        url = link.attrib.get('href')
        if url is None:
            # an anchor without href can't be turned into a result
            continue
        title = extract_text(link)

        # Make sure that the element is free of <a href> links and <span class='algoSlug_icon'>
        content = eval_xpath(result, '(.//p)[1]')
        for p in content:
            for e in p.xpath('.//a'):
                e.getparent().remove(e)
            for e in p.xpath('.//span[@class="algoSlug_icon"]'):
                e.getparent().remove(e)
        content = extract_text(content)

        # get the real URL either using the URL shown to user or following the Bing URL
        if url.startswith('https://www.bing.com/ck/a?'):
            url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
            # Bing can shorten the URL either at the end or in the middle of the string
            if (
                url_cite.startswith('https://')
                and '…' not in url_cite
                and '...' not in url_cite
                and '›' not in url_cite
            ):
                # no need for an additional HTTP request
                url = url_cite
            else:
                # resolve the URL with an additional HTTP request
                url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
                # the entry is appended right below, so its index is the current length
                url_to_resolve_index.append(len(results))
                url = None  # stays None if the Bing redirect can't be resolved

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    # resolve all Bing redirections in parallel
    request_list = [
        Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
    ]
    response_list = multi_requests(request_list)
    for result_index, redirect_response in zip(url_to_resolve_index, response_list):
        if isinstance(redirect_response, Exception):
            continue
        # Bing may answer without a redirect; don't fail the whole page then
        location = redirect_response.headers.get('location')
        if location:
            results[result_index]['url'] = location

    # get number_of_results
    try:
        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
        if "-" in result_len_container:

            # Remove the part "from-to" for paginated request ...
            # (skips twice the length of "from" plus the two separator characters)
            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :]

        result_len_container = re.sub('[^0-9]', '', result_len_container)

        if result_len_container:
            result_len = int(result_len_container)

    except Exception as e:  # pylint: disable=broad-except
        # the result count is best effort, parsing problems must not drop the results
        logger.debug('result error :\n%s', e)

    # a requested offset beyond the reported number of results: discard the page
    if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
        return []

    results.append({'number_of_results': result_len})
    return results


# get supported languages from their site
def _fetch_supported_languages(resp):
    """Collect the language tags offered in Bing's language section.

    Each ``<li>`` below ``div#language-section`` is expected to contain a link
    whose query string has a ``setlang`` parameter (e.g. ``mn-Cyrl-MN``).
    Items without a link or without a ``setlang`` value are skipped.

    :param resp: HTTP response whose ``text`` is the HTML page to parse
    :return: list of unique language tags, in no particular order
    """
    lang_tags = set()

    dom = html.fromstring(resp.text)
    lang_links = eval_xpath(dom, '//div[@id="language-section"]//li')

    for _li in lang_links:

        hrefs = eval_xpath(_li, './/@href')
        if not hrefs:
            # list item without a link
            continue
        query = parse_qs(urlparse(hrefs[0]).query, keep_blank_values=True)

        setlang = query.get('setlang', [None])[0]
        if not setlang:
            # link without (or with a blank) setlang parameter
            continue

        # example: 'mn-Cyrl-MN' --> ('mn', 'Cyrl-MN')
        lang, _sep, nation = setlang.partition('-')

        # a trailing '-' without a nation part is dropped: 'en-' --> 'en'
        tag = lang + '-' + nation if nation else lang
        lang_tags.add(tag)

    return list(lang_tags)


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages and regions from bing.

    Downloads ``https://www.bing.com/account/general`` and fills

    - ``engine_traits.languages``: SearXNG language tag --> Bing language code,
      read from the inputs below ``div#limit-languages``
    - ``engine_traits.regions``: SearXNG region tag --> Bing country code
      (``cc``), read from the links below ``div#region-section-content``

    Problems (bad HTTP status, locales unknown to babel, conflicting
    mappings) are reported with ``print`` and do not raise.

    :param engine_traits: traits object that is updated in place
    """

    # pylint: disable=import-outside-toplevel, disable=too-many-branches,
    # pylint: disable=too-many-locals, too-many-statements

    engine_traits.data_type = 'supported_languages'  # deprecated

    import babel
    import babel.languages
    from searx import network
    from searx.locales import get_offical_locales, language_tag, region_tag
    from searx.utils import gen_useragent

    headers = {
        'User-Agent': gen_useragent(),
        'Accept-Language': "en-US,en;q=0.5",  # bing needs to set the English language
    }
    resp = network.get('https://www.bing.com/account/general', headers=headers)

    if not resp.ok:
        print("ERROR: response from bing is not OK.")

    dom = html.fromstring(resp.text)

    # languages

    # Bing codes that have to be replaced before babel can parse them
    lang_map = {
        'prs': 'fa',  # Persian
        'pt_BR': 'pt',  # Portuguese (Brasil)
        'pt_PT': 'pt',  # Portuguese (Portugal)
        'ca-ES-VALENCIA': 'ca',  # Catalan (Spain, Valencian)
    }

    # Bing codes that are skipped entirely
    unknown_langs = [
        'quc',  # K'iche'
        'nso',  # Sesotho sa Leboa
        'tn',  # Setswana
    ]

    for div in eval_xpath(dom, '//div[@id="limit-languages"]//input/..'):

        eng_lang = eval_xpath(div, './/input/@value')[0]
        if eng_lang in unknown_langs:
            continue

        eng_lang = lang_map.get(eng_lang, eng_lang)
        label = extract_text(eval_xpath(div, './/label'))

        # The 'language:xx' query string in the request function (above) only
        # supports the language codes from the "Display languages" list.
        # Examples of items from the "Display languages" not supported in the
        # query string: zh_Hans --> zh / sr_latn --> sr
        # (reducing the code with ``eng_lang.split('_')[0]`` is deliberately
        # not done here)

        try:
            sxng_tag = language_tag(babel.Locale.parse(eng_lang.replace('-', '_'), sep='_'))
        except babel.UnknownLocaleError:
            print("ERROR: %s (%s) is unknown by babel" % (label, eng_lang))
            continue

        # the first Bing code mapped to a SearXNG tag wins
        conflict = engine_traits.languages.get(sxng_tag)
        if conflict:
            if conflict != eng_lang:
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
            continue
        engine_traits.languages[sxng_tag] = eng_lang

    # set unconditionally, replacing whatever the loop above mapped 'zh' to
    engine_traits.languages['zh'] = 'zh_Hans'

    # regions

    for a in eval_xpath(dom, '//div[@id="region-section-content"]//li/a'):
        href = eval_xpath(a, './/@href')[0]
        # lang_name = extract_text(a)
        query = urlparse(href)[4]
        query = parse_qs(query, keep_blank_values=True)
        cc_values = query.get('cc')
        if not cc_values:
            # link without a country code
            continue
        cc = cc_values[0]  # pylint:disable=invalid-name
        if cc == 'clear':
            continue

        # Make sure babel supports these locales
        sxng_locales = get_offical_locales(cc.upper(), engine_traits.languages.keys())

        if not sxng_locales:
            # print("ERROR: can't map from bing country %s (%s) to a babel region." % (a.text_content().strip(), cc))
            continue

        for sxng_locale in sxng_locales:
            engine_traits.regions[region_tag(sxng_locale)] = cc
-												[enh] engines: add about variable

move meta information from comment to the about variable
so the preferences, the documentation can show these information

											
										
										
											2021-01-13 10:31:25 +00:00
+								# SPDX-License-Identifier: AGPL-3.0-or-later
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 10:40:12 +00:00
+								# lint: pylint
 								"""Bing (Web)
-												[fix] bing engine: fix paging support, show inital page.

Follow up queries for the pages needed to be fixed.

- Split search-term in one for initial query and one for following queries.
- Set some headers in HTTP requests, bing needs for paging support.
- IMO //div[@class="sa_cc"] does no longer match in a bing response.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 12:41:12 +00:00
 								- https://github.com/searx/searx/issues/2019#issuecomment-648227442
-												update versions.cfg to use the current up-to-date packages

											
										
										
											2015-05-02 13:45:17 +00:00
+								"""
-												[fix] Bing-Web engine: XPath to get the wikipedia result

Modify the XPath selector to get the wikipedia result plus small fixes.

About result content: especially with the Wikipedia result, we'd get several
paragraph elements, only the first paragraph would be taken and displayed on the
search result

											
										
										
											2023-01-03 21:59:01 +00:00
+								# pylint: disable=too-many-branches
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 12:38:59 +00:00
-												Fix bing engine results count (#1387)

This PR fixes the result count from Bing, which was throwing a (hidden) error, and adds a validation to avoid reading more results than available.

For example :
If there are 100 results for some search and we try to get results 120 to 130, Bing will send back results 0 to 10 and no error. If we compare the result count with the `first` parameter of the request, we can avoid these "invalid" results.
											
										
										
											2019-08-05 14:15:40 +00:00
+								import re
-												[fix] bing engines: fetch_supported_languages

The Request to and the Response from https://www.bing.com/account/general has
been changed.

[1] https://github.com/searxng/searxng/pull/672#discussion_r777104919

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-01-01 15:47:47 +00:00
+								from urllib.parse import urlencode, urlparse, parse_qs
-												Improves PEP8 compatibility.

											
										
										
											2014-02-05 19:24:31 +00:00
+								from lxml import html
-												[fix] bing: parsing result; check to see if the element contains links

This patch is to hardening the parsing of the bing response:

1. To fix [2087] check if the selected result item contains a link, otherwise
   skip result item and continue in the result loop.  Increment the result
   pointer when a result has been added / the enumerate that counts for skipped
   items is no longer valid when result items are skipped.

   To test the bugfix use:   ``!bi :all cerbot``

2. Limit the XPath selection of result items to direct children nodes (list
   items ``li``) of the ordered list (``ol``).

   To test the selector use: ``!bi :en pontiac aztek wiki``

   .. in the result list you should find the wikipedia entry on top,
   compare [2068]

[2087] https://github.com/searxng/searxng/issues/2087
[2068] https://github.com/searxng/searxng/issues/2068

											
										
										
											2023-01-08 18:12:52 +00:00
+								from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language, eval_xpath_getindex
-												bing.py: resolve bing.com/ck/a redirections

add a new function searx.network.multi_requests to send multiple HTTP requests at once

											
										
										
											2022-05-21 16:24:47 +00:00
+								from searx.network import multi_requests, Request
-												[enh] bing engine added

											
										
										
											2013-10-24 21:52:57 +00:00
-												[mod] bing: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Bing engines.

.. note::

   Does not include migration of the request methode from 'supported_languages'
   to 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-03 08:06:17 +00:00
+								from searx.enginelib.traits import EngineTraits
 								traits: EngineTraits
-												[enh] engines: add about variable

move meta information from comment to the about variable
so the preferences, the documentation can show these information

											
										
										
											2021-01-13 10:31:25 +00:00
+								about = {
 								    "website": 'https://www.bing.com',
 								    "wikidata_id": 'Q182496',
 								    "official_api_documentation": 'https://www.microsoft.com/en-us/bing/apis/bing-web-search-api',
 								    "use_official_api": False,
 								    "require_api_key": False,
 								    "results": 'HTML',
 								}
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 12:38:59 +00:00
+								# engine dependent config
-												[enh] add more categories

											
										
										
											2021-12-22 15:58:52 +00:00
+								categories = ['general', 'web']
-												[enh] bing, google paging support

											
										
										
											2014-01-29 20:14:38 +00:00
+								paging = True
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 10:40:12 +00:00
+								time_range_support = False
 								safesearch = False
-												[mod] add 'Accept-Language' HTTP header to online processores

Most engines that support languages (and regions) use the Accept-Language from
the WEB browser to build a response that fits to the language (and region).

- add new engine option: send_accept_language_header

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-01 15:01:59 +00:00
+								send_accept_language_header = True
-												[mod] fetch supported languages for several engines
utils/fetch_languages.py gets languages supported by each engine and
generates engines_languages.json with each engine's supported language.

											
										
										
											2016-11-06 02:51:38 +00:00
+								supported_languages_url = 'https://www.bing.com/account/general'
-												[fix] bing engines: fetch_supported_languages

The Request to and the Response from https://www.bing.com/account/general has
been changed.

[1] https://github.com/searxng/searxng/pull/672#discussion_r777104919

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-01-01 15:47:47 +00:00
+								language_aliases = {}
-												[enh] bing, google paging support

											
										
										
											2014-01-29 20:14:38 +00:00
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 12:38:59 +00:00
+								# search-url
 								base_url = 'https://www.bing.com/'
-												[fix] bing engine: fix paging support, show inital page.

Follow up queries for the pages needed to be fixed.

- Split search-term in one for initial query and one for following queries.
- Set some headers in HTTP requests, bing needs for paging support.
- IMO //div[@class="sa_cc"] does no longer match in a bing response.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 12:41:12 +00:00
 								# initial query:     https://www.bing.com/search?q=foo&search=&form=QBLH
 								inital_query = 'search?{query}&search=&form=QBLH'
 								# following queries: https://www.bing.com/search?q=foo&search=&first=11&FORM=PERE
 								page_query = 'search?{query}&search=&first={offset}&FORM=PERE'
-												[enh] bing engine added

											
										
										
											2013-10-24 21:52:57 +00:00
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 08:26:22 +00:00
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing which was throwing an (hidden) error and add a validation to avoid reading more results than avalaible.

For example :
If there is 100 results from some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare results count with the first parameter of the request we can avoid this "invalid" results.
											
										
										
											2019-08-05 14:15:40 +00:00
+								def _get_offset_from_pageno(pageno):
 								    return (pageno - 1) * 10 + 1
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 08:26:22 +00:00
-												[enh] bing engine added

											
										
										
											2013-10-24 21:52:57 +00:00
+								def request(query, params):
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 12:38:59 +00:00
-												[fix] bing engine: fix paging support, show inital page.

Follow up queries for the pages needed to be fixed.

- Split search-term in one for initial query and one for following queries.
- Set some headers in HTTP requests, bing needs for paging support.
- IMO //div[@class="sa_cc"] does no longer match in a bing response.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 12:41:12 +00:00
+								    offset = _get_offset_from_pageno(params.get('pageno', 1))
 								    # logger.debug("params['pageno'] --> %s", params.get('pageno'))
 								    # logger.debug("          offset --> %s", offset)
 								    search_string = page_query
 								    if offset == 1:
 								        search_string = inital_query
-												Revert "remove 'all' option from search languages"

This reverts commit 4d1770398a6af8902e75c0bd885781584d39e796.

											
										
										
											2019-01-06 14:27:46 +00:00
+								    if params['language'] == 'all':
 								        lang = 'EN'
 								    else:
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 08:26:22 +00:00
+								        lang = match_language(params['language'], supported_languages, language_aliases)
-												[fix] use english as default language in bing

If no language is specified, bing returns results with multiple languages
for one query which isn't really useful. Setting english as default
insted if nothing.

											
										
										
											2016-12-30 17:17:14 +00:00
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 08:26:22 +00:00
+								    query = 'language:{} {}'.format(lang.split('-')[0].upper(), query)
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 12:38:59 +00:00
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 08:26:22 +00:00
+								    search_path = search_string.format(query=urlencode({'q': query}), offset=offset)
-												fix bing "garbage" results (issue #1275)

											
										
										
											2018-05-20 23:10:22 +00:00
-												[fix] bing engine: fix paging support, show inital page.

Follow up queries for the pages needed to be fixed.

- Split search-term in one for initial query and one for following queries.
- Set some headers in HTTP requests, bing needs for paging support.
- IMO //div[@class="sa_cc"] does no longer match in a bing response.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 12:41:12 +00:00
+								    if offset > 1:
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 08:26:22 +00:00
+								        referer = base_url + inital_query.format(query=urlencode({'q': query}))
-												[fix] bing engine: fix paging support, show inital page.

Follow up queries for the pages needed to be fixed.

- Split search-term in one for initial query and one for following queries.
- Set some headers in HTTP requests, bing needs for paging support.
- IMO //div[@class="sa_cc"] does no longer match in a bing response.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 12:41:12 +00:00
+								        params['headers']['Referer'] = referer
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 08:26:22 +00:00
+								        logger.debug("headers.Referer --> %s", referer)
-												[enh] bing engine added

											
										
										
											2013-10-24 21:52:57 +00:00
-												[fix] bing engine: fix paging support, show inital page.

Follow up queries for the pages needed to be fixed.

- Split search-term in one for initial query and one for following queries.
- Set some headers in HTTP requests, bing needs for paging support.
- IMO //div[@class="sa_cc"] does no longer match in a bing response.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 12:41:12 +00:00
+								    params['url'] = base_url + search_path
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 08:26:22 +00:00
+								    params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
-												[fix] bing engine: fix paging support, show inital page.

Follow up queries for the pages needed to be fixed.

- Split search-term in one for initial query and one for following queries.
- Set some headers in HTTP requests, bing needs for paging support.
- IMO //div[@class="sa_cc"] does no longer match in a bing response.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 12:41:12 +00:00
+								    return params
-												[enh] bing engine added

											
										
										
											2013-10-24 21:52:57 +00:00
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 08:26:22 +00:00
-												[enh] bing engine added

											
										
										
											2013-10-24 21:52:57 +00:00
+								def response(resp):
 								    results = []
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing which was throwing an (hidden) error and add a validation to avoid reading more results than avalaible.

For example :
If there is 100 results from some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare results count with the first parameter of the request we can avoid this "invalid" results.
											
										
										
											2019-08-05 14:15:40 +00:00
+								    result_len = 0
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 12:38:59 +00:00
-												[fix] bing unicode encode error - fixes #408

											
										
										
											2015-08-28 12:51:32 +00:00
+								    dom = html.fromstring(resp.text)
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 10:40:12 +00:00
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 12:38:59 +00:00
+								    # parse results again if nothing is found yet
-												bing.py: resolve bing.com/ck/a redirections

add a new function searx.network.multi_requests to send multiple HTTP requests at once

											
										
										
											2022-05-21 16:24:47 +00:00
 								    url_to_resolve = []
 								    url_to_resolve_index = []
-												[fix] bing: parsing result; check to see if the element contains links

This patch is to hardening the parsing of the bing response:

1. To fix [2087] check if the selected result item contains a link, otherwise
   skip result item and continue in the result loop.  Increment the result
   pointer when a result has been added / the enumerate that counts for skipped
   items is no longer valid when result items are skipped.

   To test the bugfix use:   ``!bi :all cerbot``

2. Limit the XPath selection of result items to direct children nodes (list
   items ``li``) of the ordered list (``ol``).

   To test the selector use: ``!bi :en pontiac aztek wiki``

   .. in the result list you should find the wikipedia entry on top,
   compare [2068]

[2087] https://github.com/searxng/searxng/issues/2087
[2068] https://github.com/searxng/searxng/issues/2068

											
										
										
											2023-01-08 18:12:52 +00:00
+								    i = 0
 								    for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 10:40:12 +00:00
-												[fix] bing: parsing result; check to see if the element contains links

This patch is to hardening the parsing of the bing response:

1. To fix [2087] check if the selected result item contains a link, otherwise
   skip result item and continue in the result loop.  Increment the result
   pointer when a result has been added / the enumerate that counts for skipped
   items is no longer valid when result items are skipped.

   To test the bugfix use:   ``!bi :all cerbot``

2. Limit the XPath selection of result items to direct children nodes (list
   items ``li``) of the ordered list (``ol``).

   To test the selector use: ``!bi :en pontiac aztek wiki``

   .. in the result list you should find the wikipedia entry on top,
   compare [2068]

[2087] https://github.com/searxng/searxng/issues/2087
[2068] https://github.com/searxng/searxng/issues/2068

											
										
										
											2023-01-08 18:12:52 +00:00
+								        link = eval_xpath_getindex(result, './/h2/a', 0, None)
 								        if link is None:
 								            continue
-												[enh] bing updates ++ language support

											
										
										
											2013-10-24 23:37:48 +00:00
+								        url = link.attrib.get('href')
-												Add bing in the test units

											
										
										
											2015-01-25 19:14:37 +00:00
+								        title = extract_text(link)
-												[fix] Bing-Web engine: XPath to get the wikipedia result

Modify the XPath selector to get the wikipedia result plus small fixes.

About result content: especially with the Wikipedia result, we'd get several
paragraph elements, only the first paragraph would be taken and displayed on the
search result

											
										
										
											2023-01-03 21:59:01 +00:00
 								        # Make sure that the element is free of <a href> links and <span class='algoSlug_icon'>
 								        content = eval_xpath(result, '(.//p)[1]')
 								        for p in content:
 								            for e in p.xpath('.//a'):
 								                e.getparent().remove(e)
 								            for e in p.xpath('.//span[@class="algoSlug_icon"]'):
 								                e.getparent().remove(e)
 								        content = extract_text(content)
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 12:38:59 +00:00
-												bing.py: resolve bing.com/ck/a redirections

add a new function searx.network.multi_requests to send multiple HTTP requests at once

											
										
										
											2022-05-21 16:24:47 +00:00
+								        # get the real URL either using the URL shown to user or following the Bing URL
 								        if url.startswith('https://www.bing.com/ck/a?'):
 								            url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
 								            # Bing can shorten the URL either at the end or in the middle of the string
 								            if (
 								                url_cite.startswith('https://')
 								                and '…' not in url_cite
 								                and '...' not in url_cite
 								                and '›' not in url_cite
 								            ):
 								                # no need for an additional HTTP request
 								                url = url_cite
 								            else:
 								                # resolve the URL with an additional HTTP request
 								                url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
 								                url_to_resolve_index.append(i)
 								                url = None  # remove the result if the HTTP Bing redirect raise an exception
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 12:38:59 +00:00
+								        # append result
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 08:26:22 +00:00
+								        results.append({'url': url, 'title': title, 'content': content})
-												[fix] bing: parsing result; check to see if the element contains links

This patch hardens the parsing of the bing response:

1. To fix [2087] check if the selected result item contains a link, otherwise
   skip result item and continue in the result loop.  Increment the result
   pointer when a result has been added / the enumerate that counts for skipped
   items is no longer valid when result items are skipped.

   To test the bugfix use:   ``!bi :all cerbot``

2. Limit the XPath selection of result items to direct children nodes (list
   items ``li``) of the ordered list (``ol``).

   To test the selector use: ``!bi :en pontiac aztek wiki``

   .. in the result list you should find the wikipedia entry on top,
   compare [2068]

[2087] https://github.com/searxng/searxng/issues/2087
[2068] https://github.com/searxng/searxng/issues/2068

											
										
										
											2023-01-08 18:12:52 +00:00
+								        # increment result pointer for the next iteration in this loop
 								        i += 1
-												update bing engines and fix bing_news

											
										
										
											2014-09-01 12:38:59 +00:00
-												bing.py: resolve bing.com/ck/a redirections

add a new function searx.network.multi_requests to send multiple HTTP requests at once

											
										
										
											2022-05-21 16:24:47 +00:00
+								    # resolve all Bing redirections in parallel
 								    request_list = [
 								        Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
 								    ]
 								    response_list = multi_requests(request_list)
 								    for i, redirect_response in enumerate(response_list):
 								        if not isinstance(redirect_response, Exception):
 								            results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
 								    # get number_of_results
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing, which was throwing a (hidden) error, and adds a validation to avoid reading more results than available.

For example:
If there are 100 results for some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare the results count with the first parameter of the request we can avoid these "invalid" results.
											
										
										
											2019-08-05 14:15:40 +00:00
+								    try:
-												[fix] handle missing result size

											
										
										
											2020-01-02 21:28:47 +00:00
+								        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing, which was throwing a (hidden) error, and adds a validation to avoid reading more results than available.

For example:
If there are 100 results for some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare the results count with the first parameter of the request we can avoid these "invalid" results.
											
										
										
											2019-08-05 14:15:40 +00:00
+								        if "-" in result_len_container:
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 10:40:12 +00:00
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing, which was throwing a (hidden) error, and adds a validation to avoid reading more results than available.

For example:
If there are 100 results for some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare the results count with the first parameter of the request we can avoid these "invalid" results.
											
										
										
											2019-08-05 14:15:40 +00:00
+								            # Remove the part "from-to" for paginated request ...
-												[format.python] initial formatting of the python code

This patch was generated by black [1]::

    make format.python

[1] https://github.com/psf/black

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-27 08:26:22 +00:00
+								            result_len_container = result_len_container[result_len_container.find("-") * 2 + 2 :]
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing, which was throwing a (hidden) error, and adds a validation to avoid reading more results than available.

For example:
If there are 100 results for some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare the results count with the first parameter of the request we can avoid these "invalid" results.
											
										
										
											2019-08-05 14:15:40 +00:00
 								        result_len_container = re.sub('[^0-9]', '', result_len_container)
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 10:40:12 +00:00
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing, which was throwing a (hidden) error, and adds a validation to avoid reading more results than available.

For example:
If there are 100 results for some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare the results count with the first parameter of the request we can avoid these "invalid" results.
											
										
										
											2019-08-05 14:15:40 +00:00
+								        if len(result_len_container) > 0:
 								            result_len = int(result_len_container)
-												[pylint] Bing (Web) engine

Fix remarks from pylint and improved code-style.  In preparation for a bug-fix
of the Bing (Web) engine I add this engine to the pylint-list.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-12-18 10:40:12 +00:00
 								    except Exception as e:  # pylint: disable=broad-except
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing, which was throwing a (hidden) error, and adds a validation to avoid reading more results than available.

For example:
If there are 100 results for some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare the results count with the first parameter of the request we can avoid these "invalid" results.
											
										
										
											2019-08-05 14:15:40 +00:00
+								        logger.debug('result error :\n%s', e)
-												[fix] handle missing result size

											
										
										
											2020-01-02 21:28:47 +00:00
+								    if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len:
-												Fix bing engine results count (#1387)

This PR fixes the result count from bing, which was throwing a (hidden) error, and adds a validation to avoid reading more results than available.

For example:
If there are 100 results for some search and we try to get results from 120 to 130, Bing will send back the results from 0 to 10 and no error. If we compare the results count with the first parameter of the request we can avoid these "invalid" results.
											
										
										
											2019-08-05 14:15:40 +00:00
+								        return []
 								    results.append({'number_of_results': result_len})
-												[enh] bing engine added

											
										
										
											2013-10-24 21:52:57 +00:00
+								    return results
-												[mod] fetch supported languages for several engines
utils/fetch_languages.py gets languages supported by each engine and
generates engines_languages.json with each engine's supported language.

											
										
										
											2016-11-06 02:51:38 +00:00
 								# get supported languages from their site
-												tests for _fetch_supported_languages in engines
and refactor method to make it testable without making requests

											
										
										
											2016-12-15 06:34:43 +00:00
+								def _fetch_supported_languages(resp):
-												[fix] bing engines: fetch_supported_languages

The Request to and the Response from https://www.bing.com/account/general has
been changed.

[1] https://github.com/searxng/searxng/pull/672#discussion_r777104919

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-01-01 15:47:47 +00:00
-												bugfix: fetch_supported_languages bing, -news, -videos, -images

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2020-03-01 07:01:36 +00:00
+								    lang_tags = set()
-												tests for _fetch_supported_languages in engines
and refactor method to make it testable without making requests

											
										
										
											2016-12-15 06:34:43 +00:00
+								    dom = html.fromstring(resp.text)
-												[fix] bing engines: fetch_supported_languages

The Request to and the Response from https://www.bing.com/account/general has
been changed.

[1] https://github.com/searxng/searxng/pull/672#discussion_r777104919

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-01-01 15:47:47 +00:00
+								    lang_links = eval_xpath(dom, '//div[@id="language-section"]//li')
 								    for _li in lang_links:
 								        href = eval_xpath(_li, './/@href')[0]
 								        (_scheme, _netloc, _path, _params, query, _fragment) = urlparse(href)
 								        query = parse_qs(query, keep_blank_values=True)
 								        # fmt: off
 								        setlang = query.get('setlang', [None, ])[0]
 								        # example: 'mn-Cyrl-MN' --> '['mn', 'Cyrl-MN']
 								        lang, nation = (setlang.split('-', maxsplit=1) + [None,])[:2]  # fmt: skip
 								        # fmt: on
-												bing engine: _fetch_supported_languages: don't use the language code as a country

ref #1029

											
										
										
											2022-03-31 20:03:34 +00:00
+								        tag = lang + '-' + nation if nation else lang
-												[fix] bing engines: fetch_supported_languages

The Request to and the Response from https://www.bing.com/account/general has
been changed.

[1] https://github.com/searxng/searxng/pull/672#discussion_r777104919

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-01-01 15:47:47 +00:00
+								        lang_tags.add(tag)
-												bugfix: fetch_supported_languages bing, -news, -videos, -images

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2020-03-01 07:01:36 +00:00
 								    return list(lang_tags)
-												[mod] bing: fetch engine traits (data_type: supported_languages)

Implements a fetch_traits function for the Bing engines.

.. note::

   Does not include migration of the request method from 'supported_languages'
   to the 'traits' (EngineTraits) object!

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-10-03 08:06:17 +00:00
 								def fetch_traits(engine_traits: EngineTraits):
 								    """Fetch languages and regions from bing.
 								
 								    Requests ``https://www.bing.com/account/general`` (with an English
 								    ``Accept-Language`` header) and fills the given ``engine_traits``:
 								
 								    - ``engine_traits.languages`` maps a SearXNG language tag to the language
 								      code used by Bing.  Codes listed in ``unknow_langs`` are skipped, codes in
 								      ``lang_map`` are replaced, and codes babel can't parse are reported and
 								      skipped.  The first Bing code seen for a SearXNG tag wins; a differing
 								      later code is reported as a conflict.
 								
 								    - ``engine_traits.regions`` maps a SearXNG region tag to the value of the
 								      ``cc`` URL parameter found in Bing's region links.
 								
 								    :param engine_traits: traits object that is updated in place.
 								    """
 								    # pylint: disable=import-outside-toplevel, disable=too-many-branches,
 								    # pylint: disable=too-many-locals, too-many-statements
 								    engine_traits.data_type = 'supported_languages'  # deprecated
 								    import babel
 								    import babel.languages
 								    from searx import network
 								    from searx.locales import get_offical_locales, language_tag, region_tag
 								    from searx.utils import gen_useragent
 								    headers = {
 								        'User-Agent': gen_useragent(),
 								        'Accept-Language': "en-US,en;q=0.5",  # bing needs to set the English language
 								    }
 								    resp = network.get('https://www.bing.com/account/general', headers=headers)
 								    if not resp.ok:
 								        # only report the problem, the (possibly empty) response is still parsed
 								        print("ERROR: response from bing is not OK.")
 								    dom = html.fromstring(resp.text)
 								    # Selector to get items from "Display language"
 								    lang_map = {
 								        'prs': 'fa',  # Persian
 								        'pt_BR': 'pt',  # Portuguese (Brasil)
 								        'pt_PT': 'pt',  # Portuguese (Portugal)
 								        'ca-ES-VALENCIA': 'ca',  # Catalan (Spain, Valencian)
 								    }
 								    unknow_langs = [
 								        'quc',  # K'iche'
 								        'nso',  # Sesotho sa Leboa
 								        'tn',  # Setswana
 								    ]
 								    for div in eval_xpath(dom, '//div[@id="limit-languages"]//input/..'):
 								        eng_lang = eval_xpath(div, './/input/@value')[0]
 								        if eng_lang in unknow_langs:
 								            continue
 								        eng_lang = lang_map.get(eng_lang, eng_lang)
 								        label = extract_text(eval_xpath(div, './/label'))
 								        # The 'language:xx' query string in the request function (above) does
 								        # only support the language codes from the "Display languages" list.
 								        # Examples of items from the "Display languages" not supported in the
 								        # query string: zh_Hans --> zh / sr_latn --> sr
 								        #
 								        # eng_lang = eng_lang.split('_')[0]
 								        try:
 								            sxng_tag = language_tag(babel.Locale.parse(eng_lang.replace('-', '_'), sep='_'))
 								        except babel.UnknownLocaleError:
 								            print("ERROR: %s (%s) is unknown by babel" % (label, eng_lang))
 								            continue
 								        conflict = engine_traits.languages.get(sxng_tag)
 								        if conflict:
 								            # keep the first mapping, only report when the codes really differ
 								            if conflict != eng_lang:
 								                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, conflict, eng_lang))
 								            continue
 								        engine_traits.languages[sxng_tag] = eng_lang
 								    engine_traits.languages['zh'] = 'zh_Hans'
 								    # regions
 								    for a in eval_xpath(dom, '//div[@id="region-section-content"]//li/a'):
 								        href = eval_xpath(a, './/@href')[0]
 								        # lang_name = extract_text(a)
 								        query = urlparse(href)[4]
 								        query = parse_qs(query, keep_blank_values=True)
 								        # a link without a 'cc' parameter carries no region --> skip it
 								        # instead of failing with a TypeError on ``None[0]``
 								        cc = query.get('cc', [None])[0]  # pylint:disable=invalid-name
 								        if cc is None or cc == 'clear':
 								            continue
 								        # Assert babel supports this locales
 								        sxng_locales = get_offical_locales(cc.upper(), engine_traits.languages.keys())
 								        if not sxng_locales:
 								            # print("ERROR: can't map from bing country %s (%s) to a babel region." % (a.text_content().strip(), cc))
 								            continue
 								        for sxng_locale in sxng_locales:
 								            engine_traits.regions[region_tag(sxng_locale)] = cc