mirror of
				https://github.com/searxng/searxng
				synced 2024-01-01 19:24:07 +01:00 
			
		
		
		
	Merge pull request #91 from return42/xpath-misc
[doc] add documentation about the XPath engine
This commit is contained in:
		
						commit
						703f8c4a8b
					
				
					 5 changed files with 112 additions and 29 deletions
				
			
		| 
						 | 
					@ -100,6 +100,8 @@ example code
 | 
				
			||||||
   paging = True
 | 
					   paging = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.. _engine request:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
making a request
 | 
					making a request
 | 
				
			||||||
================
 | 
					================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -198,6 +200,8 @@ example code
 | 
				
			||||||
       return params
 | 
					       return params
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.. _engine results:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
returned results
 | 
					returned results
 | 
				
			||||||
================
 | 
					================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -9,6 +9,7 @@ Developer documentation
 | 
				
			||||||
   quickstart
 | 
					   quickstart
 | 
				
			||||||
   contribution_guide
 | 
					   contribution_guide
 | 
				
			||||||
   engine_overview
 | 
					   engine_overview
 | 
				
			||||||
 | 
					   xpath_engine
 | 
				
			||||||
   search_api
 | 
					   search_api
 | 
				
			||||||
   plugins
 | 
					   plugins
 | 
				
			||||||
   translation
 | 
					   translation
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										9
									
								
								docs/dev/xpath_engine.rst
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								docs/dev/xpath_engine.rst
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,9 @@
 | 
				
			||||||
 | 
					.. _xpath_engine:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					================
 | 
				
			||||||
 | 
					The XPath engine
 | 
				
			||||||
 | 
					================
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.. automodule:: searx.engines.xpath
 | 
				
			||||||
 | 
					  :members:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4,7 +4,8 @@ Welcome to searxng
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    *Search without being tracked.*
 | 
					    *Search without being tracked.*
 | 
				
			||||||
 | 
					
 | 
				
			||||||
.. warning::
 | 
					.. hint::
 | 
				
			||||||
 | 
					
 | 
				
			||||||
   This is not searx, but searxng.
 | 
					   This is not searx, but searxng.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Searxng is a free internet metasearch engine which aggregates results from more
 | 
					Searxng is a free internet metasearch engine which aggregates results from more
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,51 +1,106 @@
 | 
				
			||||||
# SPDX-License-Identifier: AGPL-3.0-or-later
 | 
					# SPDX-License-Identifier: AGPL-3.0-or-later
 | 
				
			||||||
 | 
					# lint: pylint
 | 
				
			||||||
 | 
					# pylint: disable=missing-function-docstring
 | 
				
			||||||
 | 
					"""The XPath engine is a *generic* engine with which it is possible to configure
 | 
				
			||||||
 | 
					engines in the settings.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Here is a simple example of an XPath engine configured in the
 | 
				
			||||||
 | 
					:ref:`settings engine` section, further read :ref:`engines-dev`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					.. code:: yaml
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  - name : bitbucket
 | 
				
			||||||
 | 
					    engine : xpath
 | 
				
			||||||
 | 
					    paging : True
 | 
				
			||||||
 | 
					    search_url : https://bitbucket.org/repo/all/{pageno}?name={query}
 | 
				
			||||||
 | 
					    url_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]/@href
 | 
				
			||||||
 | 
					    title_xpath : //article[@class="repo-summary"]//a[@class="repo-link"]
 | 
				
			||||||
 | 
					    content_xpath : //article[@class="repo-summary"]/p
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from urllib.parse import urlencode
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from lxml import html
 | 
					from lxml import html
 | 
				
			||||||
from urllib.parse import urlencode
 | 
					 | 
				
			||||||
from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list
 | 
					from searx.utils import extract_text, extract_url, eval_xpath, eval_xpath_list
 | 
				
			||||||
 | 
					from searx import logger
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					logger = logger.getChild('XPath engine')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
search_url = None
 | 
					search_url = None
 | 
				
			||||||
url_xpath = None
 | 
					"""
 | 
				
			||||||
content_xpath = None
 | 
					Search URL of the engine, replacements are:
 | 
				
			||||||
title_xpath = None
 | 
					
 | 
				
			||||||
thumbnail_xpath = False
 | 
					``{query}``:
 | 
				
			||||||
paging = False
 | 
					  Search terms from user.
 | 
				
			||||||
suggestion_xpath = ''
 | 
					
 | 
				
			||||||
 | 
					``{pageno}``:
 | 
				
			||||||
 | 
					  Page number if engine supports paging :py:obj:`paging`.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					soft_max_redirects = 0
 | 
				
			||||||
 | 
					'''Maximum redirects, soft limit. Record an error but don't stop the engine'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
results_xpath = ''
 | 
					results_xpath = ''
 | 
				
			||||||
 | 
					'''XPath selector for the list of result items'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					url_xpath = None
 | 
				
			||||||
 | 
					'''XPath selector of result's ``url``.'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					content_xpath = None
 | 
				
			||||||
 | 
					'''XPath selector of result's ``content``.'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					title_xpath = None
 | 
				
			||||||
 | 
					'''XPath selector of result's ``title``.'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					thumbnail_xpath = False
 | 
				
			||||||
 | 
					'''XPath selector of result's ``img_src``.'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					suggestion_xpath = ''
 | 
				
			||||||
 | 
					'''XPath selector of result's ``suggestion``.'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cached_xpath = ''
 | 
					cached_xpath = ''
 | 
				
			||||||
cached_url = ''
 | 
					cached_url = ''
 | 
				
			||||||
soft_max_redirects = 0
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
# parameters for engines with paging support
 | 
					paging = False
 | 
				
			||||||
#
 | 
					'''Engine supports paging [True or False].'''
 | 
				
			||||||
# number of results on each page
 | 
					
 | 
				
			||||||
# (only needed if the site requires not a page number, but an offset)
 | 
					 | 
				
			||||||
page_size = 1
 | 
					page_size = 1
 | 
				
			||||||
# number of the first page (usually 0 or 1)
 | 
					'''Number of results on each page.  Only needed if the site requires not a page
 | 
				
			||||||
first_page_num = 1
 | 
					number, but an offset.'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					first_page_num = 1
 | 
				
			||||||
 | 
					'''Number of the first page (usually 0 or 1).'''
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def request(query, params):
 | 
					def request(query, params):
 | 
				
			||||||
 | 
					    '''Build request parameters (see :ref:`engine request`).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    '''
 | 
				
			||||||
    query = urlencode({'q': query})[2:]
 | 
					    query = urlencode({'q': query})[2:]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    fp = {'query': query}
 | 
					    fargs = {'query': query}
 | 
				
			||||||
    if paging and search_url.find('{pageno}') >= 0:
 | 
					    if paging and search_url.find('{pageno}') >= 0:
 | 
				
			||||||
        fp['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
 | 
					        fargs['pageno'] = (params['pageno'] - 1) * page_size + first_page_num
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    params['url'] = search_url.format(**fp)
 | 
					    params['url'] = search_url.format(**fargs)
 | 
				
			||||||
    params['query'] = query
 | 
					    params['query'] = query
 | 
				
			||||||
    params['soft_max_redirects'] = soft_max_redirects
 | 
					    params['soft_max_redirects'] = soft_max_redirects
 | 
				
			||||||
 | 
					    logger.debug("query_url --> %s", params['url'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return params
 | 
					    return params
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
def response(resp):
 | 
					def response(resp):
 | 
				
			||||||
 | 
					    '''Scrape *results* from the response (see :ref:`engine results`).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    '''
 | 
				
			||||||
    results = []
 | 
					    results = []
 | 
				
			||||||
    dom = html.fromstring(resp.text)
 | 
					    dom = html.fromstring(resp.text)
 | 
				
			||||||
    is_onion = True if 'onions' in categories else False  # pylint: disable=undefined-variable
 | 
					    is_onion = 'onions' in categories  # pylint: disable=undefined-variable
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if results_xpath:
 | 
					    if results_xpath:
 | 
				
			||||||
        for result in eval_xpath_list(dom, results_xpath):
 | 
					        for result in eval_xpath_list(dom, results_xpath):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
 | 
					            url = extract_url(eval_xpath_list(result, url_xpath, min_len=1), search_url)
 | 
				
			||||||
            title = extract_text(eval_xpath_list(result, title_xpath, min_len=1))
 | 
					            title = extract_text(eval_xpath_list(result, title_xpath, min_len=1))
 | 
				
			||||||
            content = extract_text(eval_xpath_list(result, content_xpath, min_len=1))
 | 
					            content = extract_text(eval_xpath_list(result, content_xpath, min_len=1))
 | 
				
			||||||
| 
						 | 
					@ -59,13 +114,16 @@ def response(resp):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            # add alternative cached url if available
 | 
					            # add alternative cached url if available
 | 
				
			||||||
            if cached_xpath:
 | 
					            if cached_xpath:
 | 
				
			||||||
                tmp_result['cached_url'] = cached_url\
 | 
					                tmp_result['cached_url'] = (
 | 
				
			||||||
 | 
					                    cached_url
 | 
				
			||||||
                    + extract_text(eval_xpath_list(result, cached_xpath, min_len=1))
 | 
					                    + extract_text(eval_xpath_list(result, cached_xpath, min_len=1))
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if is_onion:
 | 
					            if is_onion:
 | 
				
			||||||
                tmp_result['is_onion'] = True
 | 
					                tmp_result['is_onion'] = True
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            results.append(tmp_result)
 | 
					            results.append(tmp_result)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    else:
 | 
					    else:
 | 
				
			||||||
        if cached_xpath:
 | 
					        if cached_xpath:
 | 
				
			||||||
            for url, title, content, cached in zip(
 | 
					            for url, title, content, cached in zip(
 | 
				
			||||||
| 
						 | 
					@ -75,8 +133,12 @@ def response(resp):
 | 
				
			||||||
                map(extract_text, eval_xpath_list(dom, content_xpath)),
 | 
					                map(extract_text, eval_xpath_list(dom, content_xpath)),
 | 
				
			||||||
                map(extract_text, eval_xpath_list(dom, cached_xpath))
 | 
					                map(extract_text, eval_xpath_list(dom, cached_xpath))
 | 
				
			||||||
            ):
 | 
					            ):
 | 
				
			||||||
                results.append({'url': url, 'title': title, 'content': content,
 | 
					                results.append({
 | 
				
			||||||
                                'cached_url': cached_url + cached, 'is_onion': is_onion})
 | 
					                    'url': url,
 | 
				
			||||||
 | 
					                    'title': title,
 | 
				
			||||||
 | 
					                    'content': content,
 | 
				
			||||||
 | 
					                    'cached_url': cached_url + cached, 'is_onion': is_onion
 | 
				
			||||||
 | 
					                })
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
            for url, title, content in zip(
 | 
					            for url, title, content in zip(
 | 
				
			||||||
                (extract_url(x, search_url) for
 | 
					                (extract_url(x, search_url) for
 | 
				
			||||||
| 
						 | 
					@ -84,10 +146,16 @@ def response(resp):
 | 
				
			||||||
                map(extract_text, eval_xpath_list(dom, title_xpath)),
 | 
					                map(extract_text, eval_xpath_list(dom, title_xpath)),
 | 
				
			||||||
                map(extract_text, eval_xpath_list(dom, content_xpath))
 | 
					                map(extract_text, eval_xpath_list(dom, content_xpath))
 | 
				
			||||||
            ):
 | 
					            ):
 | 
				
			||||||
                results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion})
 | 
					                results.append({
 | 
				
			||||||
 | 
					                    'url': url,
 | 
				
			||||||
 | 
					                    'title': title,
 | 
				
			||||||
 | 
					                    'content': content,
 | 
				
			||||||
 | 
					                    'is_onion': is_onion
 | 
				
			||||||
 | 
					                })
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if not suggestion_xpath:
 | 
					    if suggestion_xpath:
 | 
				
			||||||
        return results
 | 
					 | 
				
			||||||
        for suggestion in eval_xpath(dom, suggestion_xpath):
 | 
					        for suggestion in eval_xpath(dom, suggestion_xpath):
 | 
				
			||||||
            results.append({'suggestion': extract_text(suggestion)})
 | 
					            results.append({'suggestion': extract_text(suggestion)})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    logger.debug("found %s results", len(results))
 | 
				
			||||||
    return results
 | 
					    return results
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		
		Reference in a new issue