mirror of https://github.com/searxng/searxng (synced 2024-01-01)

	[mod] presearch: add language & region support
In Presearch there are languages for the UI and regions for narrowing down the
search.  With this change the SearXNG engine supports a search by region.  The
details can be found in the documentation of the source code.

To test, you can search for terms like::

   !presearch bmw :zh-TW
   !presearch bmw :en-CA

1. You should get results corresponding to the region (Taiwan, Canada)
2. and in the matching language (Chinese, English).
3. The content of the info box is in the same language.

Exceptions:

1. Region or language is not supported by Presearch, or
2. the SearXNG user did not select a region tag, for example::

    !presearch bmw :en
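
The region tag is turned into an ``Accept-Language`` header.  A minimal sketch of
that mapping (illustration only; it uses babel directly, while the engine goes
through searx.locales.get_locale)::

    from babel import Locale

    def accept_language(searxng_locale):
        # hypothetical helper, not part of this patch
        loc = Locale.parse(searxng_locale, sep='-')
        if not loc.territory:
            return None  # plain language tag like 'en' --> no region to narrow down
        return f"{loc.language}-{loc.territory},{loc.language};q=0.9,*;q=0.5"

    print(accept_language('zh-TW'))  # zh-TW,zh;q=0.9,*;q=0.5
    print(accept_language('en-CA'))  # en-CA,en;q=0.9,*;q=0.5
    print(accept_language('en'))     # None --> the header is not sent
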
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
			
			
This commit is contained in:

    parent a2c269bbac
    commit e560d7e373

2 changed files with 113 additions and 17 deletions

docs/dev/engines/online/presearch.rst (new file, 13 lines)

@@ -0,0 +1,13 @@
+.. _engine presearch:
+
+================
+Presearch Engine
+================
+
+.. contents::
+   :depth: 2
+   :local:
+   :backlinks: entry
+
+.. automodule:: searx.engines.presearch
+   :members:

searx/engines/presearch.py

@@ -1,23 +1,72 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""Presearch (general, images, videos, news)
+"""Presearch supports the search types listed in :py:obj:`search_type` (general,
+images, videos, news).
+
+Configured ``presearch`` engines:
+
+.. code:: yaml
+
+  - name: presearch
+    engine: presearch
+    search_type: search
+    categories: [general, web]
+
+  - name: presearch images
+    ...
+    search_type: images
+    categories: [images, web]
+
+  - name: presearch videos
+    ...
+    search_type: videos
+    categories: [general, web]
+
+  - name: presearch news
+    ...
+    search_type: news
+    categories: [news, web]
 
 .. hint::
 
-   The results in the video category are most often links to pages that contain
-   a video, for instance many links from presearch's video category link
-   content from facebook (aka Meta) or Twitter (aka X).  Since these are not
-   real links to video streams SearXNG can't use the video template for this and
-   if SearXNG can't use this template, then the user doesn't want to see these
-   hits in the videos category.
-
-   TL;DR; by default presearch's video category is placed into categories::
+   By default Presearch's video category is intentionally placed into::
 
        categories: [general, web]
 
+
+Search type ``video``
+=====================
+
+The results in the video category are most often links to pages that contain a
+video, for instance many links from Presearch's video category link content
+from facebook (aka Meta) or Twitter (aka X).  Since these are not real links to
+video streams SearXNG can't use the video template for this and if SearXNG can't
+use this template, then the user doesn't want to see these hits in the videos
+category.
+
+
+Languages & Regions
+===================
+
+In Presearch there are languages for the UI and regions for narrowing down the
+search.  If we set "auto" for the region in the WEB-UI of Presearch and cookie
+``use_local_search_results=false``, then the defaults are set for both (the
+language and the region) from the ``Accept-Language`` header.
+
+Since the region is already "auto" by default, we only need to set the
+``use_local_search_results`` cookie and send the ``Accept-Language`` header.  We
+have to set these values in both requests we send to Presearch; in the first
+request to get the request-ID from Presearch and in the final request to get the
+result list (see ``send_accept_language_header``).
+
+
+Implementations
+===============
+
 """
 
 from urllib.parse import urlencode
+from searx import locales
 from searx.network import get
 from searx.utils import gen_useragent, html_to_text
 
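
The ``Languages & Regions`` section above can be exercised outside of SearXNG.
A rough sketch with a plain HTTP client; the endpoint
``https://presearch.com/search`` and the query parameters are assumptions, only
the cookie and the ``Accept-Language`` header come from this patch::

    import httpx  # any HTTP client works; httpx is what searx.network builds on

    resp = httpx.get(
        "https://presearch.com/search",  # assumed public endpoint
        params={"q": "bmw", "page": 1},
        headers={
            # keep the region on "auto" server-side, let Accept-Language decide
            "Accept-Language": "en-CA,en;q=0.9,*;q=0.5",
            "Cookie": "b=1; presearch_session=; use_local_search_results=false",
        },
        follow_redirects=True,
    )
    print(resp.status_code)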
				
			
@@ -32,6 +81,7 @@ about = {
 paging = True
 safesearch = True
 time_range_support = True
+send_accept_language_header = True
 categories = ["general", "web"]  # general, images, videos, news
 
 search_type = "search"

@@ -46,19 +96,43 @@ def init(_):
         raise ValueError(f'presearch search_type: {search_type}')
 
 
-def _get_request_id(query, page, time_range, safesearch_param):
+def _get_request_id(query, params):
+
     args = {
         "q": query,
-        "page": page,
+        "page": params["pageno"],
     }
-    if time_range:
-        args["time"] = time_range
+
+    if params["time_range"]:
+        args["time"] = params["time_range"]
 
     url = f"{base_url}/{search_type}?{urlencode(args)}"
+
     headers = {
         'User-Agent': gen_useragent(),
-        'Cookie': f"b=1;presearch_session=;use_safe_search={safesearch_map[safesearch_param]}",
+        'Cookie': (
+            f"b=1;"
+            f" presearch_session=;"
+            f" use_local_search_results=false;"
+            f" use_safe_search={safesearch_map[params['safesearch']]}"
+        ),
     }
+    if params['searxng_locale'] != 'all':
+        l = locales.get_locale(params['searxng_locale'])
+
+        # Presearch narrows down the search by region.  In SearXNG when the user
+        # does not set a region (e.g. 'en-CA' / canada) we cannot hand over a
+        # region.
+
+        # We could possibly use searx.locales.get_official_locales to determine
+        # in which regions this language is an official one, but then we still
+        # wouldn't know which region should be given more weight / Presearch
+        # performs an IP-based geolocation of the user, we don't want that in
+        # SearXNG ;-)
+
+        if l.territory:
+            headers['Accept-Language'] = f"{l.language}-{l.territory},{l.language};" "q=0.9,*;" "q=0.5"
+
     resp_text = get(url, headers=headers).text  # type: ignore
 
     for line in resp_text.split("\n"):
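
With the new signature ``_get_request_id()`` reads everything it needs from the
``params`` dict.  A minimal sketch of the keys this hunk touches and of how the
parenthesized f-strings collapse into one ``Cookie`` value; the safesearch value
``true`` is an assumption, ``safesearch_map`` itself is not shown in this hunk::

    # keys read by _get_request_id() according to this hunk
    params = {
        "pageno": 1,
        "time_range": "",
        "safesearch": 1,
        "searxng_locale": "en-CA",
    }

    use_safe_search = "true"  # assumed result of safesearch_map[params['safesearch']]
    cookie = (
        f"b=1;"
        f" presearch_session=;"
        f" use_local_search_results=false;"
        f" use_safe_search={use_safe_search}"
    )
    print(cookie)
    # b=1; presearch_session=; use_local_search_results=false; use_safe_search=true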
				
			
@@ -69,8 +143,7 @@ def _get_request_id(query, page, time_range, safesearch_param):
 
 
 def request(query, params):
-    request_id = _get_request_id(query, params["pageno"], params["time_range"], params["safesearch"])
-
+    request_id = _get_request_id(query, params)
     params["headers"]["Accept"] = "application/json"
     params["url"] = f"{base_url}/results?id={request_id}"
 

@@ -109,7 +182,17 @@ def parse_search_query(json_results):
     if info:
         attributes = []
         for item in info.get('about', []):
-            label, value = html_to_text(item).split(':', 1)
+
+            text = html_to_text(item)
+            if ':' in text:
+                # split text into key / value
+                label, value = text.split(':', 1)
+            else:
+                # In other languages (tested with zh-TW) a colon is represented
+                # by a different symbol --> then we split at the first space.
+                label, value = text.split(' ', 1)
+                label = label[:-1]
+
             value = _strip_leading_strings(value)
             attributes.append({'label': label, 'value': value})
         content = []
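
The fallback branch above can be replayed in isolation.  A minimal sketch; the
sample info-box strings are invented for illustration and a plain ``strip()``
stands in for ``_strip_leading_strings()``::

    def split_info(text):
        if ':' in text:
            label, value = text.split(':', 1)
        else:
            # e.g. zh-TW glues a fullwidth colon "：" onto the label
            label, value = text.split(' ', 1)
            label = label[:-1]  # drop the trailing "："
        return label.strip(), value.strip()

    print(split_info("Headquarters: Munich, Germany"))  # ('Headquarters', 'Munich, Germany')
    print(split_info("總部： 德國慕尼黑"))                # ('總部', '德國慕尼黑')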
			
		||||