forked from zaclys/searxng
		
	[mod] implement brave (WEB) engine to replace XPath configuration
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
		
							parent
							
								
									d151497db3
								
							
						
					
					
						commit
						460bbe5b81
					
				
					 3 changed files with 263 additions and 68 deletions
				
			
		
							
								
								
									
										13
									
								
								docs/dev/engines/online/brave.rst
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								docs/dev/engines/online/brave.rst
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,13 @@
 | 
			
		|||
.. _brave engine:
 | 
			
		||||
 | 
			
		||||
=============
 | 
			
		||||
Brave Engines
 | 
			
		||||
=============
 | 
			
		||||
 | 
			
		||||
.. contents:: Contents
 | 
			
		||||
   :depth: 2
 | 
			
		||||
   :local:
 | 
			
		||||
   :backlinks: entry
 | 
			
		||||
 | 
			
		||||
.. automodule:: searx.engines.brave
 | 
			
		||||
  :members:
 | 
			
		||||
| 
						 | 
				
			
			@ -1,10 +1,56 @@
 | 
			
		|||
# SPDX-License-Identifier: AGPL-3.0-or-later
 | 
			
		||||
"""
 | 
			
		||||
 Brave (General, news, videos, images)
 | 
			
		||||
"""
 | 
			
		||||
# lint: pylint
 | 
			
		||||
"""Brave supports the categories listed in :py:obj:`brave_category` (General,
 | 
			
		||||
news, videos, images).  The support of :py:obj:`paging` and :py:obj:`time range
 | 
			
		||||
<time_range_support>` is limited (see remarks).
 | 
			
		||||
 | 
			
		||||
Configured ``brave`` engines:
 | 
			
		||||
 | 
			
		||||
.. code:: yaml
 | 
			
		||||
 | 
			
		||||
  - name: brave
 | 
			
		||||
    engine: brave
 | 
			
		||||
    ...
 | 
			
		||||
    brave_category: search
 | 
			
		||||
    time_range_support: true
 | 
			
		||||
    paging: true
 | 
			
		||||
 | 
			
		||||
  - name: brave.images
 | 
			
		||||
    engine: brave
 | 
			
		||||
    ...
 | 
			
		||||
    brave_category: images
 | 
			
		||||
 | 
			
		||||
  - name: brave.videos
 | 
			
		||||
    engine: brave
 | 
			
		||||
    ...
 | 
			
		||||
    brave_category: videos
 | 
			
		||||
 | 
			
		||||
  - name: brave.news
 | 
			
		||||
    engine: brave
 | 
			
		||||
    ...
 | 
			
		||||
    brave_category: news
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Implementations
 | 
			
		||||
===============
 | 
			
		||||
 | 
			
		||||
"""
 | 
			
		||||
# pylint: disable=fixme
 | 
			
		||||
 | 
			
		||||
from urllib.parse import (
 | 
			
		||||
    urlencode,
 | 
			
		||||
    urlparse,
 | 
			
		||||
    parse_qs,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
from urllib.parse import urlencode
 | 
			
		||||
import chompjs
 | 
			
		||||
from lxml import html
 | 
			
		||||
 | 
			
		||||
from searx.utils import (
 | 
			
		||||
    extract_text,
 | 
			
		||||
    eval_xpath_list,
 | 
			
		||||
    eval_xpath_getindex,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
about = {
 | 
			
		||||
    "website": 'https://search.brave.com/',
 | 
			
		||||
| 
						 | 
				
			
			@ -14,41 +60,87 @@ about = {
 | 
			
		|||
    "require_api_key": False,
 | 
			
		||||
    "results": 'HTML',
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
base_url = "https://search.brave.com/"

# Left empty here; the engine entries in the settings assign the categories
# per configured engine (e.g. ``categories: [general, web]``).
categories = []

brave_category = 'search'
"""Brave supports common web search as well as video, image and news search.

- ``search``: Common WEB search
- ``videos``: search for videos
- ``images``: search for images
- ``news``: search for news
"""

brave_spellcheck = False
"""Brave supports some kind of spell checking.  When activated, Brave tries to
fix typos, e.g. it searches for ``food`` when the user queries for ``fooh``.  In
the UI of Brave the user gets warned about this.  Since we can not warn the
user in SearXNG, the spellchecking is disabled by default.
"""

send_accept_language_header = True

paging = False
"""Brave only supports paging in :py:obj:`brave_category` ``search`` (UI
category All)."""

safesearch = True
safesearch_map = {2: 'strict', 1: 'moderate', 0: 'off'}  # cookie: safesearch=off

time_range_support = False
"""Brave only supports time-range in :py:obj:`brave_category` ``search`` (UI
category All)."""

# Maps the time-range names used in the request parameters to the values sent
# in Brave's ``tf`` URL argument.
time_range_map = {
    'day': 'pd',
    'week': 'pw',
    'month': 'pm',
    'year': 'py',
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def request(query, params):
    """Fill ``params`` with the URL, headers and cookies of a Brave request.

    :param query: the search terms, sent as the ``q`` URL argument.
    :param params: request parameters, modified in place.  The keys
        ``headers`` and ``cookies`` must already hold dicts; ``time_range``
        and ``safesearch`` are read, ``pageno`` and ``searxng_locale`` are
        optional.

    The function returns nothing, the result is the mutated ``params``.
    """

    # Don't accept br encoding / see https://github.com/searxng/searxng/pull/1787
    params['headers']['Accept-Encoding'] = 'gzip, deflate'

    args = {'q': query}
    if brave_spellcheck:
        args['spellcheck'] = '1'

    # offset and time filter are only sent for the ``search`` category
    if brave_category == 'search':
        offset = params.get('pageno', 1) - 1
        if offset:
            args['offset'] = offset
        time_filter = time_range_map.get(params['time_range'])
        if time_filter:
            args['tf'] = time_filter

    # build the URL only after all URL arguments have been collected
    params["url"] = f"{base_url}{brave_category}?{urlencode(args)}"

    # set preferences in cookie
    params['cookies']['safesearch'] = safesearch_map.get(params['safesearch'], 'off')

    # ToDo: we need a fetch_traits(..) implementation / the ui_lang of Brave are
    #       limited and the country handling has its quirks

    eng_locale = params.get('searxng_locale')
    params['cookies']['useLocation'] = '0'  # the useLocation is IP based, we use 'country'
    params['cookies']['summarizer'] = '0'

    if not eng_locale or eng_locale == 'all':
        params['cookies']['country'] = 'all'  # country=all
    else:
        # e.g. a locale 'de-CH' gives country 'ch' and ui_lang 'de'; a locale
        # without '-' yields the same value for both cookies
        params['cookies']['country'] = eng_locale.split('-')[-1].lower()
        params['cookies']['ui_lang'] = eng_locale.split('-')[0].lower()

    # logger.debug("cookies %s", params['cookies'])


def get_video_results(json_data):
    """Convert an iterable of Brave video entries into ``videos.html`` results.

    NOTE(review): nothing in the visible part of this module calls this helper
    any more -- confirm whether it can be dropped.

    :param json_data: iterable of dicts with the keys ``url``, ``title``,
        ``description``, ``source``, ``thumbnail['src']``,
        ``properties['url']`` and ``video['duration']``.
    :return: list of result dicts, one per entry.
    """
    results = []

    for result in json_data:
        results.append(
            {
                'template': 'videos.html',
                'url': result['url'],
                'thumbnail_src': result['thumbnail']['src'],
                'img_src': result['properties']['url'],
                'content': result['description'],
                'title': result['title'],
                'source': result['source'],
                'duration': result['video']['duration'],
            }
        )

    return results
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def response(resp):
 | 
			
		||||
    results = []
 | 
			
		||||
 | 
			
		||||
    if brave_category == 'search':
 | 
			
		||||
        return _parse_search(resp)
 | 
			
		||||
 | 
			
		||||
    datastr = ""
 | 
			
		||||
    for line in resp.text.split("\n"):
 | 
			
		||||
| 
						 | 
				
			
			@ -57,10 +149,81 @@ def response(resp):
 | 
			
		|||
            break
 | 
			
		||||
 | 
			
		||||
    json_data = chompjs.parse_js_object(datastr)
 | 
			
		||||
 | 
			
		||||
    json_resp = json_data[1]['data']['body']['response']
 | 
			
		||||
    if categories[0] == 'news':
 | 
			
		||||
 | 
			
		||||
    if brave_category == 'news':
 | 
			
		||||
        json_resp = json_resp['news']
 | 
			
		||||
        return _parse_news(json_resp)
 | 
			
		||||
 | 
			
		||||
    if brave_category == 'images':
 | 
			
		||||
        return _parse_images(json_resp)
 | 
			
		||||
    if brave_category == 'videos':
 | 
			
		||||
        return _parse_videos(json_resp)
 | 
			
		||||
 | 
			
		||||
    return []
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _parse_search(resp):
    """Parse the HTML page of a Brave WEB search into a list of result dicts.

    :param resp: response object; only ``resp.text`` (the HTML) is read.
    :return: list of dicts.  An optional first item ``{'answer': ...}`` is
        followed by one item per snippet with the keys ``url``, ``title``,
        ``content`` and ``img_src``.  Snippets that embed a video get either
        ``iframe_src`` / ``template`` / ``thumbnail`` (when an embed URL can
        be derived from the result URL) or a replaced ``img_src``.
    """

    result_list = []
    dom = html.fromstring(resp.text)

    # lxml elements evaluate to False when they have no child elements, so the
    # presence of a tag has to be tested with ``is not None``.
    answer_tag = eval_xpath_getindex(dom, '//div[@class="answer"]', 0, default=None)
    if answer_tag is not None:
        result_list.append({'answer': extract_text(answer_tag)})

    # xpath_results = '//div[contains(@class, "snippet fdb") and @data-type="web"]'
    xpath_results = '//div[contains(@class, "snippet")]'

    for result in eval_xpath_list(dom, xpath_results):

        url = eval_xpath_getindex(result, './/a[@class="result-header"]/@href', 0, default=None)
        title_tag = eval_xpath_getindex(result, './/span[@class="snippet-title"]', 0, default=None)
        # skip snippets without a link or without a title
        if not url or title_tag is None:
            continue

        content_tag = eval_xpath_getindex(result, './/p[@class="snippet-description"]', 0, default='')
        img_src = eval_xpath_getindex(result, './/img[@class="thumb"]/@src', 0, default='')

        item = {
            'url': url,
            'title': extract_text(title_tag),
            'content': extract_text(content_tag),
            'img_src': img_src,
        }

        video_tag = eval_xpath_getindex(
            result, './/div[contains(@class, "video-snippet") and @data-macro="video"]', 0, default=None
        )
        if video_tag is not None:

            # In my tests a video tag in the WEB search was most often not a
            # video, except the ones from youtube ..

            video_img = eval_xpath_getindex(video_tag, './/img/@src', 0, default='')
            iframe_src = _get_iframe_src(url)
            if iframe_src:
                item['iframe_src'] = iframe_src
                item['template'] = 'videos.html'
                item['thumbnail'] = video_img
            else:
                item['img_src'] = video_img

        result_list.append(item)

    return result_list
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _get_iframe_src(url):
 | 
			
		||||
    parsed_url = urlparse(url)
 | 
			
		||||
    if parsed_url.path == '/watch' and parsed_url.query:
 | 
			
		||||
        video_id = parse_qs(parsed_url.query).get('v', [])  # type: ignore
 | 
			
		||||
        if video_id:
 | 
			
		||||
            return 'https://www.youtube-nocookie.com/embed/' + video_id[0]  # type: ignore
 | 
			
		||||
    return None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _parse_news(json_resp):
 | 
			
		||||
    result_list = []
 | 
			
		||||
 | 
			
		||||
    for result in json_resp["results"]:
 | 
			
		||||
        item = {
 | 
			
		||||
| 
						 | 
				
			
			@ -68,18 +231,53 @@ def response(resp):
 | 
			
		|||
            'title': result['title'],
 | 
			
		||||
            'content': result['description'],
 | 
			
		||||
        }
 | 
			
		||||
        if result['thumbnail'] != "null":
 | 
			
		||||
            item['img_src'] = result['thumbnail']['src']
 | 
			
		||||
        result_list.append(item)
 | 
			
		||||
 | 
			
		||||
    return result_list
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _parse_images(json_resp):
 | 
			
		||||
    result_list = []
 | 
			
		||||
 | 
			
		||||
    for result in json_resp["results"]:
 | 
			
		||||
        item = {
 | 
			
		||||
            'url': result['url'],
 | 
			
		||||
            'title': result['title'],
 | 
			
		||||
            'content': result['description'],
 | 
			
		||||
            'template': 'images.html',
 | 
			
		||||
            'img_format': result['properties']['format'],
 | 
			
		||||
            'source': result['source'],
 | 
			
		||||
            'img_src': result['properties']['url'],
 | 
			
		||||
        }
 | 
			
		||||
        result_list.append(item)
 | 
			
		||||
 | 
			
		||||
    return result_list
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _parse_videos(json_resp):
 | 
			
		||||
    result_list = []
 | 
			
		||||
 | 
			
		||||
    for result in json_resp["results"]:
 | 
			
		||||
 | 
			
		||||
        url = result['url']
 | 
			
		||||
        item = {
 | 
			
		||||
            'url': url,
 | 
			
		||||
            'title': result['title'],
 | 
			
		||||
            'content': result['description'],
 | 
			
		||||
            'template': 'videos.html',
 | 
			
		||||
            'length': result['video']['duration'],
 | 
			
		||||
            'duration': result['video']['duration'],
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        if result['thumbnail'] != "null":
 | 
			
		||||
            item['thumbnail'] = result['thumbnail']['src']
 | 
			
		||||
 | 
			
		||||
        if categories[0] == 'images':
 | 
			
		||||
            item['template'] = 'images.html'
 | 
			
		||||
            item['img_format'] = result['properties']['format']
 | 
			
		||||
            item['source'] = result['source']
 | 
			
		||||
            item['img_src'] = result['properties']['url']
 | 
			
		||||
        elif categories[0] == 'videos':
 | 
			
		||||
            item['template'] = 'videos.html'
 | 
			
		||||
            item['length'] = result['video']['duration']
 | 
			
		||||
        iframe_src = _get_iframe_src(url)
 | 
			
		||||
        if iframe_src:
 | 
			
		||||
            item['iframe_src'] = iframe_src
 | 
			
		||||
 | 
			
		||||
        results.append(item)
 | 
			
		||||
        result_list.append(item)
 | 
			
		||||
 | 
			
		||||
    return results
 | 
			
		||||
    return result_list
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1816,50 +1816,34 @@ engines:
 | 
			
		|||
    timeout: 9.0
 | 
			
		||||
 | 
			
		||||
  - name: brave
 | 
			
		||||
    shortcut: brave
 | 
			
		||||
    engine: xpath
 | 
			
		||||
    paging: true
 | 
			
		||||
    engine: brave
 | 
			
		||||
    shortcut: br
 | 
			
		||||
    time_range_support: true
 | 
			
		||||
    first_page_num: 0
 | 
			
		||||
    time_range_url: "&tf={time_range_val}"
 | 
			
		||||
    search_url: https://search.brave.com/search?q={query}&offset={pageno}&spellcheck=1{time_range}
 | 
			
		||||
    url_xpath: //a[@class="result-header"]/@href
 | 
			
		||||
    title_xpath: //span[@class="snippet-title"]
 | 
			
		||||
    content_xpath: //p[1][@class="snippet-description"]
 | 
			
		||||
    suggestion_xpath: //div[@class="text-gray h6"]/a
 | 
			
		||||
    time_range_map:
 | 
			
		||||
      day: 'pd'
 | 
			
		||||
      week: 'pw'
 | 
			
		||||
      month: 'pm'
 | 
			
		||||
      year: 'py'
 | 
			
		||||
    paging: true
 | 
			
		||||
    categories: [general, web]
 | 
			
		||||
    disabled: true
 | 
			
		||||
    headers:
 | 
			
		||||
      Accept-Encoding: gzip, deflate
 | 
			
		||||
    about:
 | 
			
		||||
      website: https://brave.com/search/
 | 
			
		||||
      wikidata_id: Q107355971
 | 
			
		||||
      use_official_api: false
 | 
			
		||||
      require_api_key: false
 | 
			
		||||
      results: HTML
 | 
			
		||||
    brave_category: search
 | 
			
		||||
    # brave_spellcheck: true
 | 
			
		||||
 | 
			
		||||
  - name: brave.images
 | 
			
		||||
    shortcut: braveimg
 | 
			
		||||
    engine: brave
 | 
			
		||||
    categories: images
 | 
			
		||||
    disabled: true
 | 
			
		||||
    network: brave
 | 
			
		||||
    shortcut: brimg
 | 
			
		||||
    categories: [images, web]
 | 
			
		||||
    brave_category: images
 | 
			
		||||
 | 
			
		||||
  - name: brave.videos
 | 
			
		||||
    shortcut: bravevid
 | 
			
		||||
    engine: brave
 | 
			
		||||
    categories: videos
 | 
			
		||||
    disabled: true
 | 
			
		||||
    network: brave
 | 
			
		||||
    shortcut: brvid
 | 
			
		||||
    categories: [videos, web]
 | 
			
		||||
    brave_category: videos
 | 
			
		||||
 | 
			
		||||
  - name: brave.news
 | 
			
		||||
    shortcut: bravenews
 | 
			
		||||
    engine: brave
 | 
			
		||||
    network: brave
 | 
			
		||||
    shortcut: brnews
 | 
			
		||||
    categories: news
 | 
			
		||||
    disabled: true
 | 
			
		||||
    brave_category: news
 | 
			
		||||
 | 
			
		||||
  - name: petalsearch
 | 
			
		||||
    shortcut: pts
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		
		Reference in a new issue