mirror of https://github.com/searxng/searxng.git
[fix] bing: parsing result; check to see if the element contains links
This patch hardens the parsing of the bing response: 1. To fix [2087], check whether the selected result item contains a link; otherwise skip the result item and continue with the result loop. Increment the result pointer when a result has been added, because the counter from ``enumerate`` also counts skipped items and is therefore no longer valid when result items are skipped. To test the bugfix use: ``!bi :all cerbot`` 2. Limit the XPath selection of result items to direct child nodes (list items ``li``) of the ordered list (``ol``). To test the selector use: ``!bi :en pontiac aztek wiki`` .. in the result list you should find the wikipedia entry on top, compare [2068] [2087] https://github.com/searxng/searxng/issues/2087 [2068] https://github.com/searxng/searxng/issues/2068
This commit is contained in:
parent
a90ed481ed
commit
7fc8d72889
|
@ -9,7 +9,7 @@
|
||||||
import re
|
import re
|
||||||
from urllib.parse import urlencode, urlparse, parse_qs
|
from urllib.parse import urlencode, urlparse, parse_qs
|
||||||
from lxml import html
|
from lxml import html
|
||||||
from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language
|
from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language, eval_xpath_getindex
|
||||||
from searx.network import multi_requests, Request
|
from searx.network import multi_requests, Request
|
||||||
|
|
||||||
about = {
|
about = {
|
||||||
|
@ -84,9 +84,12 @@ def response(resp):
|
||||||
|
|
||||||
url_to_resolve = []
|
url_to_resolve = []
|
||||||
url_to_resolve_index = []
|
url_to_resolve_index = []
|
||||||
for i, result in enumerate(eval_xpath_list(dom, '//li[contains(@class, "b_algo")]')):
|
i = 0
|
||||||
|
for result in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
|
||||||
|
|
||||||
link = eval_xpath(result, './/h2/a')[0]
|
link = eval_xpath_getindex(result, './/h2/a', 0, None)
|
||||||
|
if link is None:
|
||||||
|
continue
|
||||||
url = link.attrib.get('href')
|
url = link.attrib.get('href')
|
||||||
title = extract_text(link)
|
title = extract_text(link)
|
||||||
|
|
||||||
|
@ -119,6 +122,8 @@ def response(resp):
|
||||||
|
|
||||||
# append result
|
# append result
|
||||||
results.append({'url': url, 'title': title, 'content': content})
|
results.append({'url': url, 'title': title, 'content': content})
|
||||||
|
# increment result pointer for the next iteration in this loop
|
||||||
|
i += 1
|
||||||
|
|
||||||
# resolve all Bing redirections in parallel
|
# resolve all Bing redirections in parallel
|
||||||
request_list = [
|
request_list = [
|
||||||
|
|
Loading…
Reference in New Issue