Merge pull request #1219 from dalf/follow_bing_redirect

bing.py: remove redirection links
Commit 44f2eb50a5 by Alexandre Flament, 2022-07-10 18:06:22 +02:00, committed by GitHub
4 changed files with 151 additions and 55 deletions

searx/engines/bing.py

@@ -8,7 +8,8 @@
 import re
 from urllib.parse import urlencode, urlparse, parse_qs
 from lxml import html
-from searx.utils import eval_xpath, extract_text, match_language
+from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language
+from searx.network import multi_requests, Request
 
 about = {
     "website": 'https://www.bing.com',
@@ -79,30 +80,48 @@ def response(resp):
     dom = html.fromstring(resp.text)
 
-    for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
-        # IMO //div[@class="sa_cc"] does no longer match
-        logger.debug('found //div[@class="sa_cc"] --> %s', result)
-        link = eval_xpath(result, './/h3/a')[0]
-        url = link.attrib.get('href')
-        title = extract_text(link)
-        content = extract_text(eval_xpath(result, './/p'))
-
-        # append result
-        results.append({'url': url, 'title': title, 'content': content})
-
     # parse results again if nothing is found yet
-    for result in eval_xpath(dom, '//li[@class="b_algo"]'):
+    url_to_resolve = []
+    url_to_resolve_index = []
+    for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')):
         link = eval_xpath(result, './/h2/a')[0]
         url = link.attrib.get('href')
         title = extract_text(link)
         content = extract_text(eval_xpath(result, './/p'))
 
+        # get the real URL, either using the URL shown to the user or by following the Bing redirect
+        if url.startswith('https://www.bing.com/ck/a?'):
+            url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
+            # Bing can shorten the URL either at the end or in the middle of the string
+            if (
+                url_cite.startswith('https://')
+                and '…' not in url_cite
+                and '...' not in url_cite
+                and '›' not in url_cite
+            ):
+                # no need for an additional HTTP request
+                url = url_cite
+            else:
+                # resolve the URL with an additional HTTP request
+                url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
+                url_to_resolve_index.append(i)
+                url = None  # remove the result if the Bing redirect request raises an exception
+
         # append result
         results.append({'url': url, 'title': title, 'content': content})
 
+    # resolve all Bing redirections in parallel
+    request_list = [
+        Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
+    ]
+    response_list = multi_requests(request_list)
+    for i, redirect_response in enumerate(response_list):
+        if not isinstance(redirect_response, Exception):
+            results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
+
+    # get number_of_results
     try:
         result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
         if "-" in result_len_container:

searx/network/__init__.py

@@ -8,7 +8,8 @@ import concurrent.futures
 from queue import SimpleQueue
 from types import MethodType
 from timeit import default_timer
-from typing import Iterable, Tuple
+from typing import Iterable, NamedTuple, Tuple, List, Dict, Union
+from contextlib import contextmanager
 
 import httpx
 import anyio
@@ -48,9 +49,23 @@ def get_context_network():
     return THREADLOCAL.__dict__.get('network') or get_network()
 
 
-def request(method, url, **kwargs):
-    """same as requests/requests/api.py request(...)"""
-    # pylint: disable=too-many-branches
+@contextmanager
+def _record_http_time():
     time_before_request = default_timer()
+    start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
+    try:
+        yield start_time
+    finally:
+        # update total_time.
+        # See get_time_for_thread() and reset_time_for_thread()
+        if hasattr(THREADLOCAL, 'total_time'):
+            time_after_request = default_timer()
+            THREADLOCAL.total_time += time_after_request - time_before_request
+
+
+def _get_timeout(start_time, kwargs):
+    # pylint: disable=too-many-branches
     # timeout (httpx)
     if 'timeout' in kwargs:
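
`_record_http_time` factors the per-thread time accounting out of `request()` so the new `multi_requests()` can reuse it; the `finally` clause guarantees `total_time` is updated even when the wrapped call raises. A standalone sketch of the pattern, using a plain `threading.local` in place of searx's `THREADLOCAL`:

# Minimal sketch of the timing context manager above (no searx imports).
import threading
from contextlib import contextmanager
from timeit import default_timer

THREADLOCAL = threading.local()

@contextmanager
def record_time():
    time_before = default_timer()
    start_time = getattr(THREADLOCAL, 'start_time', time_before)
    try:
        yield start_time
    finally:  # runs on success and on exception alike
        if hasattr(THREADLOCAL, 'total_time'):
            THREADLOCAL.total_time += default_timer() - time_before

THREADLOCAL.total_time = 0.0
with record_time():
    sum(range(100_000))  # stand-in for the awaited HTTP call
assert THREADLOCAL.total_time > 0.0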
@@ -65,45 +80,84 @@
     # adjust actual timeout
     timeout += 0.2  # overhead
-    start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
     if start_time:
         timeout -= default_timer() - start_time
 
-    # raise_for_error
-    check_for_httperror = True
-    if 'raise_for_httperror' in kwargs:
-        check_for_httperror = kwargs['raise_for_httperror']
-        del kwargs['raise_for_httperror']
-
-    # requests compatibility
-    if isinstance(url, bytes):
-        url = url.decode()
-
-    # network
-    network = get_context_network()
-
-    # do request
-    future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
-    try:
-        response = future.result(timeout)
-    except concurrent.futures.TimeoutError as e:
-        raise httpx.TimeoutException('Timeout', request=None) from e
-
-    # requests compatibility
-    # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
-    response.ok = not response.is_error
-
-    # update total_time.
-    # See get_time_for_thread() and reset_time_for_thread()
-    if hasattr(THREADLOCAL, 'total_time'):
-        time_after_request = default_timer()
-        THREADLOCAL.total_time += time_after_request - time_before_request
-
-    # raise an exception
-    if check_for_httperror:
-        raise_for_httperror(response)
-
-    return response
+    return timeout
+
+
+def request(method, url, **kwargs):
+    """same as requests/requests/api.py request(...)"""
+    with _record_http_time() as start_time:
+        network = get_context_network()
+        timeout = _get_timeout(start_time, kwargs)
+        future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
+        try:
+            return future.result(timeout)
+        except concurrent.futures.TimeoutError as e:
+            raise httpx.TimeoutException('Timeout', request=None) from e
+
+
+def multi_requests(request_list: List["Request"]) -> List[Union[httpx.Response, Exception]]:
+    """send multiple HTTP requests in parallel. Wait for all requests to finish."""
+    with _record_http_time() as start_time:
+        # send the requests
+        network = get_context_network()
+        loop = get_loop()
+        future_list = []
+        for request_desc in request_list:
+            timeout = _get_timeout(start_time, request_desc.kwargs)
+            future = asyncio.run_coroutine_threadsafe(
+                network.request(request_desc.method, request_desc.url, **request_desc.kwargs), loop
+            )
+            future_list.append((future, timeout))
+
+        # read the responses
+        responses = []
+        for future, timeout in future_list:
+            try:
+                responses.append(future.result(timeout))
+            except concurrent.futures.TimeoutError:
+                responses.append(httpx.TimeoutException('Timeout', request=None))
+            except Exception as e:  # pylint: disable=broad-except
+                responses.append(e)
+        return responses
+
+
+class Request(NamedTuple):
+    """Request description for the multi_requests function"""
+
+    method: str
+    url: str
+    kwargs: Dict[str, str] = {}
+
+    @staticmethod
+    def get(url, **kwargs):
+        return Request('GET', url, kwargs)
+
+    @staticmethod
+    def options(url, **kwargs):
+        return Request('OPTIONS', url, kwargs)
+
+    @staticmethod
+    def head(url, **kwargs):
+        return Request('HEAD', url, kwargs)
+
+    @staticmethod
+    def post(url, **kwargs):
+        return Request('POST', url, kwargs)
+
+    @staticmethod
+    def put(url, **kwargs):
+        return Request('PUT', url, kwargs)
+
+    @staticmethod
+    def patch(url, **kwargs):
+        return Request('PATCH', url, kwargs)
+
+    @staticmethod
+    def delete(url, **kwargs):
+        return Request('DELETE', url, kwargs)
 
 
 def get(url, **kwargs):
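
Together, `Request` describes one HTTP call and `multi_requests` runs a batch of them on the shared event loop. A usage sketch, assuming it runs on a searx worker thread where the network context and loop are already initialized (the URLs are hypothetical):

from searx.network import multi_requests, Request

# Describe the batch first; nothing is sent until multi_requests() is called.
request_list = [
    Request.get('https://example.org/a', allow_redirects=False),
    Request.head('https://example.org/b', timeout=2.0),
]

# multi_requests() never raises for an individual failure: each slot holds
# either the httpx.Response or the Exception that the request produced.
for response in multi_requests(request_list):
    if isinstance(response, Exception):
        print('failed:', response)
    else:
        print(response.status_code, response.url)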

searx/network/network.py

@@ -13,6 +13,7 @@ import httpx
 
 from searx import logger, searx_debug
 from .client import new_client, get_loop, AsyncHTTPTransportNoHttp
+from .raise_for_httperror import raise_for_httperror
 
 logger = logger.getChild('network')
@@ -226,6 +227,27 @@ class Network:
                 kwargs['follow_redirects'] = kwargs.pop('allow_redirects')
         return kwargs_clients
 
+    @staticmethod
+    def extract_do_raise_for_httperror(kwargs):
+        do_raise_for_httperror = True
+        if 'raise_for_httperror' in kwargs:
+            do_raise_for_httperror = kwargs['raise_for_httperror']
+            del kwargs['raise_for_httperror']
+        return do_raise_for_httperror
+
+    @staticmethod
+    def patch_response(response, do_raise_for_httperror):
+        if isinstance(response, httpx.Response):
+            # requests compatibility (response is not streamed)
+            # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
+            response.ok = not response.is_error
+
+            # raise an exception
+            if do_raise_for_httperror:
+                raise_for_httperror(response)
+        return response
+
     def is_valid_response(self, response):
         # pylint: disable=too-many-boolean-expressions
         if (
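
`extract_do_raise_for_httperror` pops the searx-specific flag out of `kwargs` before they are forwarded to httpx (which would reject an unknown keyword), with raising as the default. A standalone sketch of that pop-with-default pattern (a free function mirroring the static method above, no searx imports):

def extract_do_raise_for_httperror(kwargs: dict) -> bool:
    """Pop 'raise_for_httperror' from kwargs, defaulting to True."""
    do_raise_for_httperror = True
    if 'raise_for_httperror' in kwargs:
        do_raise_for_httperror = kwargs['raise_for_httperror']
        del kwargs['raise_for_httperror']
    return do_raise_for_httperror

kwargs = {'timeout': 3.0, 'raise_for_httperror': False}
assert extract_do_raise_for_httperror(kwargs) is False
assert kwargs == {'timeout': 3.0}                  # flag consumed before httpx sees it
assert extract_do_raise_for_httperror({}) is True  # raising is the default behaviour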
@@ -239,6 +261,7 @@ class Network:
     async def call_client(self, stream, method, url, **kwargs):
         retries = self.retries
         was_disconnected = False
+        do_raise_for_httperror = Network.extract_do_raise_for_httperror(kwargs)
         kwargs_clients = Network.extract_kwargs_clients(kwargs)
         while retries >= 0:  # pragma: no cover
             client = await self.get_client(**kwargs_clients)
@@ -248,7 +271,7 @@ class Network:
                 else:
                     response = await client.request(method, url, **kwargs)
                 if self.is_valid_response(response) or retries <= 0:
-                    return response
+                    return Network.patch_response(response, do_raise_for_httperror)
             except httpx.RemoteProtocolError as e:
                 if not was_disconnected:
                     # the server has closed the connection:

tests/unit/network/test_network.py

@@ -141,28 +141,28 @@ class TestNetworkRequestRetries(SearxTestCase):
     async def test_retries_ok(self):
         with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
             network = Network(enable_http=True, retries=1, retry_on_http_error=403)
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.text, TestNetworkRequestRetries.TEXT)
             await network.aclose()
 
     async def test_retries_fail_int(self):
         with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
             network = Network(enable_http=True, retries=0, retry_on_http_error=403)
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 403)
             await network.aclose()
 
     async def test_retries_fail_list(self):
         with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
             network = Network(enable_http=True, retries=0, retry_on_http_error=[403, 429])
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 403)
             await network.aclose()
 
     async def test_retries_fail_bool(self):
         with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
             network = Network(enable_http=True, retries=0, retry_on_http_error=True)
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 403)
             await network.aclose()
 
@@ -178,7 +178,7 @@ class TestNetworkRequestRetries(SearxTestCase):
         with patch.object(httpx.AsyncClient, 'request', new=get_response):
             network = Network(enable_http=True, retries=2)
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 200)
             self.assertEqual(response.text, TestNetworkRequestRetries.TEXT)
             await network.aclose()
 
@@ -190,7 +190,7 @@ class TestNetworkRequestRetries(SearxTestCase):
         with patch.object(httpx.AsyncClient, 'request', new=get_response):
             network = Network(enable_http=True, retries=0)
             with self.assertRaises(httpx.RequestError):
-                await network.request('GET', 'https://example.com/')
+                await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             await network.aclose()
 
@@ -237,6 +237,6 @@ class TestNetworkStreamRetries(SearxTestCase):
         with patch.object(httpx.AsyncClient, 'stream', new=stream):
             network = Network(enable_http=True, retries=0, retry_on_http_error=403)
-            response = await network.stream('GET', 'https://example.com/')
+            response = await network.stream('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 403)
             await network.aclose()
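
These tests all end on a mocked 403; since raising now happens inside `Network.call_client` by default, each call must opt out with `raise_for_httperror=False` to be able to assert on the returned response. A hypothetical companion test, not part of this diff, written as if added to the same test module and reusing its helpers; it assumes searx's `raise_for_httperror` maps HTTP 403 to `SearxEngineAccessDeniedException`:

# Hypothetical companion test (sketch, not from the diff).
from unittest.mock import patch

import httpx

from searx.exceptions import SearxEngineAccessDeniedException
from searx.network.network import Network

class TestRaiseForHttpErrorDefault(SearxTestCase):
    async def test_403_raises_by_default(self):
        # same mocked transport as above: the first response is a 403
        with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
            network = Network(enable_http=True, retries=0, retry_on_http_error=403)
            with self.assertRaises(SearxEngineAccessDeniedException):
                await network.request('GET', 'https://example.com/')  # no opt-out flag
            await network.aclose()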