mirror of https://github.com/searxng/searxng.git
[fix] Bing-Web engine: XPath to get the wikipedia result
Modify the XPath selector to match the Wikipedia result, plus a few small fixes. About the result content: a result (especially the Wikipedia one) can contain several paragraph elements; only the first paragraph is taken and displayed in the search result.
This commit is contained in:
parent
4e355564d2
commit
9ee99423fe
|
@ -4,6 +4,7 @@
|
||||||
|
|
||||||
- https://github.com/searx/searx/issues/2019#issuecomment-648227442
|
- https://github.com/searx/searx/issues/2019#issuecomment-648227442
|
||||||
"""
|
"""
|
||||||
|
# pylint: disable=too-many-branches
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from urllib.parse import urlencode, urlparse, parse_qs
|
from urllib.parse import urlencode, urlparse, parse_qs
|
||||||
|
@ -74,7 +75,6 @@ def request(query, params):
|
||||||
|
|
||||||
|
|
||||||
def response(resp):
|
def response(resp):
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
result_len = 0
|
result_len = 0
|
||||||
|
|
||||||
|
@ -84,12 +84,20 @@ def response(resp):
|
||||||
|
|
||||||
url_to_resolve = []
|
url_to_resolve = []
|
||||||
url_to_resolve_index = []
|
url_to_resolve_index = []
|
||||||
for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')):
|
for i, result in enumerate(eval_xpath_list(dom, '//li[contains(@class, "b_algo")]')):
|
||||||
|
|
||||||
link = eval_xpath(result, './/h2/a')[0]
|
link = eval_xpath(result, './/h2/a')[0]
|
||||||
url = link.attrib.get('href')
|
url = link.attrib.get('href')
|
||||||
title = extract_text(link)
|
title = extract_text(link)
|
||||||
content = extract_text(eval_xpath(result, './/p'))
|
|
||||||
|
# Make sure that the element is free of <a href> links and <span class='algoSlug_icon'>
|
||||||
|
content = eval_xpath(result, '(.//p)[1]')
|
||||||
|
for p in content:
|
||||||
|
for e in p.xpath('.//a'):
|
||||||
|
e.getparent().remove(e)
|
||||||
|
for e in p.xpath('.//span[@class="algoSlug_icon"]'):
|
||||||
|
e.getparent().remove(e)
|
||||||
|
content = extract_text(content)
|
||||||
|
|
||||||
# get the real URL either using the URL shown to user or following the Bing URL
|
# get the real URL either using the URL shown to user or following the Bing URL
|
||||||
if url.startswith('https://www.bing.com/ck/a?'):
|
if url.startswith('https://www.bing.com/ck/a?'):
|
||||||
|
|
Loading…
Reference in New Issue