[fix] bing: parsing result; check to see if the element contains links

This patch hardens the parsing of the bing response:

1. To fix [2087], check if the selected result item contains a link; otherwise
   skip the result item and continue in the result loop.  Increment the result
   pointer only when a result has actually been added -- the enumerate counter
   is no longer valid once result items can be skipped.

   To test the bugfix use:   ``!bi :all cerbot``

2. Limit the XPath selection of result items to direct child nodes (list
   items ``li``) of the ordered list (``ol``).

   To test the selector use: ``!bi :en pontiac aztek wiki``

   .. in the result list you should find the Wikipedia entry on top;
   compare [2068]

[2087] https://github.com/searxng/searxng/issues/2087
[2068] https://github.com/searxng/searxng/issues/2068
This commit is contained in:
Ahmad Alkadri 2023-01-08 19:12:52 +01:00 committed by Markus Heiser
parent a90ed481ed
commit 7fc8d72889
1 changed files with 8 additions and 3 deletions

View File

@ -9,7 +9,7 @@
import re import re
from urllib.parse import urlencode, urlparse, parse_qs from urllib.parse import urlencode, urlparse, parse_qs
from lxml import html from lxml import html
from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language, eval_xpath_getindex
from searx.network import multi_requests, Request from searx.network import multi_requests, Request
about = { about = {
@ -84,9 +84,12 @@ def response(resp):
url_to_resolve = [] url_to_resolve = []
url_to_resolve_index = [] url_to_resolve_index = []
for i, result in enumerate(eval_xpath_list(dom, '//li[contains(@class, "b_algo")]')): i = 0
for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
link = eval_xpath(result, './/h2/a')[0] link = eval_xpath_getindex(result, './/h2/a', 0, None)
if link is None:
continue
url = link.attrib.get('href') url = link.attrib.get('href')
title = extract_text(link) title = extract_text(link)
@ -119,6 +122,8 @@ def response(resp):
# append result # append result
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
# increment result pointer for the next iteration in this loop
i += 1
# resolve all Bing redirections in parallel # resolve all Bing redirections in parallel
request_list = [ request_list = [