forked from zaclys/searxng
Merge pull request #1219 from dalf/follow_bing_redirect
bing.py: remove redirection links
This commit is contained in:
commit
44f2eb50a5
|
@ -8,7 +8,8 @@
|
||||||
import re
|
import re
|
||||||
from urllib.parse import urlencode, urlparse, parse_qs
|
from urllib.parse import urlencode, urlparse, parse_qs
|
||||||
from lxml import html
|
from lxml import html
|
||||||
from searx.utils import eval_xpath, extract_text, match_language
|
from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language
|
||||||
|
from searx.network import multi_requests, Request
|
||||||
|
|
||||||
about = {
|
about = {
|
||||||
"website": 'https://www.bing.com',
|
"website": 'https://www.bing.com',
|
||||||
|
@ -79,30 +80,48 @@ def response(resp):
|
||||||
|
|
||||||
dom = html.fromstring(resp.text)
|
dom = html.fromstring(resp.text)
|
||||||
|
|
||||||
for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
|
|
||||||
|
|
||||||
# IMO //div[@class="sa_cc"] no longer matches
|
|
||||||
logger.debug('found //div[@class="sa_cc"] --> %s', result)
|
|
||||||
|
|
||||||
link = eval_xpath(result, './/h3/a')[0]
|
|
||||||
url = link.attrib.get('href')
|
|
||||||
title = extract_text(link)
|
|
||||||
content = extract_text(eval_xpath(result, './/p'))
|
|
||||||
|
|
||||||
# append result
|
|
||||||
results.append({'url': url, 'title': title, 'content': content})
|
|
||||||
|
|
||||||
# parse results again if nothing is found yet
|
# parse results again if nothing is found yet
|
||||||
for result in eval_xpath(dom, '//li[@class="b_algo"]'):
|
|
||||||
|
url_to_resolve = []
|
||||||
|
url_to_resolve_index = []
|
||||||
|
for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')):
|
||||||
|
|
||||||
link = eval_xpath(result, './/h2/a')[0]
|
link = eval_xpath(result, './/h2/a')[0]
|
||||||
url = link.attrib.get('href')
|
url = link.attrib.get('href')
|
||||||
title = extract_text(link)
|
title = extract_text(link)
|
||||||
content = extract_text(eval_xpath(result, './/p'))
|
content = extract_text(eval_xpath(result, './/p'))
|
||||||
|
|
||||||
|
# get the real URL either using the URL shown to user or following the Bing URL
|
||||||
|
if url.startswith('https://www.bing.com/ck/a?'):
|
||||||
|
url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
|
||||||
|
# Bing can shorten the URL either at the end or in the middle of the string
|
||||||
|
if (
|
||||||
|
url_cite.startswith('https://')
|
||||||
|
and '…' not in url_cite
|
||||||
|
and '...' not in url_cite
|
||||||
|
and '›' not in url_cite
|
||||||
|
):
|
||||||
|
# no need for an additional HTTP request
|
||||||
|
url = url_cite
|
||||||
|
else:
|
||||||
|
# resolve the URL with an additional HTTP request
|
||||||
|
url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
|
||||||
|
url_to_resolve_index.append(i)
|
||||||
|
url = None # remove the result if the HTTP Bing redirect raise an exception
|
||||||
|
|
||||||
# append result
|
# append result
|
||||||
results.append({'url': url, 'title': title, 'content': content})
|
results.append({'url': url, 'title': title, 'content': content})
|
||||||
|
|
||||||
|
# resolve all Bing redirections in parallel
|
||||||
|
request_list = [
|
||||||
|
Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
|
||||||
|
]
|
||||||
|
response_list = multi_requests(request_list)
|
||||||
|
for i, redirect_response in enumerate(response_list):
|
||||||
|
if not isinstance(redirect_response, Exception):
|
||||||
|
results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
|
||||||
|
|
||||||
|
# get number_of_results
|
||||||
try:
|
try:
|
||||||
result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
|
result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
|
||||||
if "-" in result_len_container:
|
if "-" in result_len_container:
|
||||||
|
|
|
@ -8,7 +8,8 @@ import concurrent.futures
|
||||||
from queue import SimpleQueue
|
from queue import SimpleQueue
|
||||||
from types import MethodType
|
from types import MethodType
|
||||||
from timeit import default_timer
|
from timeit import default_timer
|
||||||
from typing import Iterable, Tuple
|
from typing import Iterable, NamedTuple, Tuple, List, Dict, Union
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
import anyio
|
import anyio
|
||||||
|
@ -48,9 +49,23 @@ def get_context_network():
|
||||||
return THREADLOCAL.__dict__.get('network') or get_network()
|
return THREADLOCAL.__dict__.get('network') or get_network()
|
||||||
|
|
||||||
|
|
||||||
def request(method, url, **kwargs):
|
@contextmanager
|
||||||
"""same as requests/requests/api.py request(...)"""
|
def _record_http_time():
|
||||||
|
# pylint: disable=too-many-branches
|
||||||
time_before_request = default_timer()
|
time_before_request = default_timer()
|
||||||
|
start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
|
||||||
|
try:
|
||||||
|
yield start_time
|
||||||
|
finally:
|
||||||
|
# update total_time.
|
||||||
|
# See get_time_for_thread() and reset_time_for_thread()
|
||||||
|
if hasattr(THREADLOCAL, 'total_time'):
|
||||||
|
time_after_request = default_timer()
|
||||||
|
THREADLOCAL.total_time += time_after_request - time_before_request
|
||||||
|
|
||||||
|
|
||||||
|
def _get_timeout(start_time, kwargs):
|
||||||
|
# pylint: disable=too-many-branches
|
||||||
|
|
||||||
# timeout (httpx)
|
# timeout (httpx)
|
||||||
if 'timeout' in kwargs:
|
if 'timeout' in kwargs:
|
||||||
|
@ -65,45 +80,84 @@ def request(method, url, **kwargs):
|
||||||
|
|
||||||
# adjust actual timeout
|
# adjust actual timeout
|
||||||
timeout += 0.2 # overhead
|
timeout += 0.2 # overhead
|
||||||
start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
|
|
||||||
if start_time:
|
if start_time:
|
||||||
timeout -= default_timer() - start_time
|
timeout -= default_timer() - start_time
|
||||||
|
|
||||||
# raise_for_error
|
return timeout
|
||||||
check_for_httperror = True
|
|
||||||
if 'raise_for_httperror' in kwargs:
|
|
||||||
check_for_httperror = kwargs['raise_for_httperror']
|
|
||||||
del kwargs['raise_for_httperror']
|
|
||||||
|
|
||||||
# requests compatibility
|
|
||||||
if isinstance(url, bytes):
|
|
||||||
url = url.decode()
|
|
||||||
|
|
||||||
# network
|
def request(method, url, **kwargs):
|
||||||
network = get_context_network()
|
"""same as requests/requests/api.py request(...)"""
|
||||||
|
with _record_http_time() as start_time:
|
||||||
|
network = get_context_network()
|
||||||
|
timeout = _get_timeout(start_time, kwargs)
|
||||||
|
future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
|
||||||
|
try:
|
||||||
|
return future.result(timeout)
|
||||||
|
except concurrent.futures.TimeoutError as e:
|
||||||
|
raise httpx.TimeoutException('Timeout', request=None) from e
|
||||||
|
|
||||||
# do request
|
|
||||||
future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
|
|
||||||
try:
|
|
||||||
response = future.result(timeout)
|
|
||||||
except concurrent.futures.TimeoutError as e:
|
|
||||||
raise httpx.TimeoutException('Timeout', request=None) from e
|
|
||||||
|
|
||||||
# requests compatibility
|
def multi_requests(request_list: List["Request"]) -> List[Union[httpx.Response, Exception]]:
|
||||||
# see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
|
"""send multiple HTTP requests in parallel. Wait for all requests to finish."""
|
||||||
response.ok = not response.is_error
|
with _record_http_time() as start_time:
|
||||||
|
# send the requests
|
||||||
|
network = get_context_network()
|
||||||
|
loop = get_loop()
|
||||||
|
future_list = []
|
||||||
|
for request_desc in request_list:
|
||||||
|
timeout = _get_timeout(start_time, request_desc.kwargs)
|
||||||
|
future = asyncio.run_coroutine_threadsafe(
|
||||||
|
network.request(request_desc.method, request_desc.url, **request_desc.kwargs), loop
|
||||||
|
)
|
||||||
|
future_list.append((future, timeout))
|
||||||
|
|
||||||
# update total_time.
|
# read the responses
|
||||||
# See get_time_for_thread() and reset_time_for_thread()
|
responses = []
|
||||||
if hasattr(THREADLOCAL, 'total_time'):
|
for future, timeout in future_list:
|
||||||
time_after_request = default_timer()
|
try:
|
||||||
THREADLOCAL.total_time += time_after_request - time_before_request
|
responses.append(future.result(timeout))
|
||||||
|
except concurrent.futures.TimeoutError:
|
||||||
|
responses.append(httpx.TimeoutException('Timeout', request=None))
|
||||||
|
except Exception as e: # pylint: disable=broad-except
|
||||||
|
responses.append(e)
|
||||||
|
return responses
|
||||||
|
|
||||||
# raise an exception
|
|
||||||
if check_for_httperror:
|
|
||||||
raise_for_httperror(response)
|
|
||||||
|
|
||||||
return response
|
class Request(NamedTuple):
|
||||||
|
"""Request description for the multi_requests function"""
|
||||||
|
|
||||||
|
method: str
|
||||||
|
url: str
|
||||||
|
kwargs: Dict[str, str] = {}
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get(url, **kwargs):
|
||||||
|
return Request('GET', url, kwargs)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def options(url, **kwargs):
|
||||||
|
return Request('OPTIONS', url, kwargs)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def head(url, **kwargs):
|
||||||
|
return Request('HEAD', url, kwargs)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def post(url, **kwargs):
|
||||||
|
return Request('POST', url, kwargs)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def put(url, **kwargs):
|
||||||
|
return Request('PUT', url, kwargs)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def patch(url, **kwargs):
|
||||||
|
return Request('PATCH', url, kwargs)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def delete(url, **kwargs):
|
||||||
|
return Request('DELETE', url, kwargs)
|
||||||
|
|
||||||
|
|
||||||
def get(url, **kwargs):
|
def get(url, **kwargs):
|
||||||
|
|
|
@ -13,6 +13,7 @@ import httpx
|
||||||
|
|
||||||
from searx import logger, searx_debug
|
from searx import logger, searx_debug
|
||||||
from .client import new_client, get_loop, AsyncHTTPTransportNoHttp
|
from .client import new_client, get_loop, AsyncHTTPTransportNoHttp
|
||||||
|
from .raise_for_httperror import raise_for_httperror
|
||||||
|
|
||||||
|
|
||||||
logger = logger.getChild('network')
|
logger = logger.getChild('network')
|
||||||
|
@ -226,6 +227,27 @@ class Network:
|
||||||
kwargs['follow_redirects'] = kwargs.pop('allow_redirects')
|
kwargs['follow_redirects'] = kwargs.pop('allow_redirects')
|
||||||
return kwargs_clients
|
return kwargs_clients
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def extract_do_raise_for_httperror(kwargs):
|
||||||
|
do_raise_for_httperror = True
|
||||||
|
if 'raise_for_httperror' in kwargs:
|
||||||
|
do_raise_for_httperror = kwargs['raise_for_httperror']
|
||||||
|
del kwargs['raise_for_httperror']
|
||||||
|
return do_raise_for_httperror
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def patch_response(response, do_raise_for_httperror):
|
||||||
|
if isinstance(response, httpx.Response):
|
||||||
|
# requests compatibility (response is not streamed)
|
||||||
|
# see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
|
||||||
|
response.ok = not response.is_error
|
||||||
|
|
||||||
|
# raise an exception
|
||||||
|
if do_raise_for_httperror:
|
||||||
|
raise_for_httperror(response)
|
||||||
|
|
||||||
|
return response
|
||||||
|
|
||||||
def is_valid_response(self, response):
|
def is_valid_response(self, response):
|
||||||
# pylint: disable=too-many-boolean-expressions
|
# pylint: disable=too-many-boolean-expressions
|
||||||
if (
|
if (
|
||||||
|
@ -239,6 +261,7 @@ class Network:
|
||||||
async def call_client(self, stream, method, url, **kwargs):
|
async def call_client(self, stream, method, url, **kwargs):
|
||||||
retries = self.retries
|
retries = self.retries
|
||||||
was_disconnected = False
|
was_disconnected = False
|
||||||
|
do_raise_for_httperror = Network.extract_do_raise_for_httperror(kwargs)
|
||||||
kwargs_clients = Network.extract_kwargs_clients(kwargs)
|
kwargs_clients = Network.extract_kwargs_clients(kwargs)
|
||||||
while retries >= 0: # pragma: no cover
|
while retries >= 0: # pragma: no cover
|
||||||
client = await self.get_client(**kwargs_clients)
|
client = await self.get_client(**kwargs_clients)
|
||||||
|
@ -248,7 +271,7 @@ class Network:
|
||||||
else:
|
else:
|
||||||
response = await client.request(method, url, **kwargs)
|
response = await client.request(method, url, **kwargs)
|
||||||
if self.is_valid_response(response) or retries <= 0:
|
if self.is_valid_response(response) or retries <= 0:
|
||||||
return response
|
return Network.patch_response(response, do_raise_for_httperror)
|
||||||
except httpx.RemoteProtocolError as e:
|
except httpx.RemoteProtocolError as e:
|
||||||
if not was_disconnected:
|
if not was_disconnected:
|
||||||
# the server has closed the connection:
|
# the server has closed the connection:
|
||||||
|
|
|
@ -141,28 +141,28 @@ class TestNetworkRequestRetries(SearxTestCase):
|
||||||
async def test_retries_ok(self):
|
async def test_retries_ok(self):
|
||||||
with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
|
with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
|
||||||
network = Network(enable_http=True, retries=1, retry_on_http_error=403)
|
network = Network(enable_http=True, retries=1, retry_on_http_error=403)
|
||||||
response = await network.request('GET', 'https://example.com/')
|
response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
|
||||||
self.assertEqual(response.text, TestNetworkRequestRetries.TEXT)
|
self.assertEqual(response.text, TestNetworkRequestRetries.TEXT)
|
||||||
await network.aclose()
|
await network.aclose()
|
||||||
|
|
||||||
async def test_retries_fail_int(self):
|
async def test_retries_fail_int(self):
|
||||||
with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
|
with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
|
||||||
network = Network(enable_http=True, retries=0, retry_on_http_error=403)
|
network = Network(enable_http=True, retries=0, retry_on_http_error=403)
|
||||||
response = await network.request('GET', 'https://example.com/')
|
response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
|
||||||
self.assertEqual(response.status_code, 403)
|
self.assertEqual(response.status_code, 403)
|
||||||
await network.aclose()
|
await network.aclose()
|
||||||
|
|
||||||
async def test_retries_fail_list(self):
|
async def test_retries_fail_list(self):
|
||||||
with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
|
with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
|
||||||
network = Network(enable_http=True, retries=0, retry_on_http_error=[403, 429])
|
network = Network(enable_http=True, retries=0, retry_on_http_error=[403, 429])
|
||||||
response = await network.request('GET', 'https://example.com/')
|
response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
|
||||||
self.assertEqual(response.status_code, 403)
|
self.assertEqual(response.status_code, 403)
|
||||||
await network.aclose()
|
await network.aclose()
|
||||||
|
|
||||||
async def test_retries_fail_bool(self):
|
async def test_retries_fail_bool(self):
|
||||||
with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
|
with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
|
||||||
network = Network(enable_http=True, retries=0, retry_on_http_error=True)
|
network = Network(enable_http=True, retries=0, retry_on_http_error=True)
|
||||||
response = await network.request('GET', 'https://example.com/')
|
response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
|
||||||
self.assertEqual(response.status_code, 403)
|
self.assertEqual(response.status_code, 403)
|
||||||
await network.aclose()
|
await network.aclose()
|
||||||
|
|
||||||
|
@ -178,7 +178,7 @@ class TestNetworkRequestRetries(SearxTestCase):
|
||||||
|
|
||||||
with patch.object(httpx.AsyncClient, 'request', new=get_response):
|
with patch.object(httpx.AsyncClient, 'request', new=get_response):
|
||||||
network = Network(enable_http=True, retries=2)
|
network = Network(enable_http=True, retries=2)
|
||||||
response = await network.request('GET', 'https://example.com/')
|
response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
|
||||||
self.assertEqual(response.status_code, 200)
|
self.assertEqual(response.status_code, 200)
|
||||||
self.assertEqual(response.text, TestNetworkRequestRetries.TEXT)
|
self.assertEqual(response.text, TestNetworkRequestRetries.TEXT)
|
||||||
await network.aclose()
|
await network.aclose()
|
||||||
|
@ -190,7 +190,7 @@ class TestNetworkRequestRetries(SearxTestCase):
|
||||||
with patch.object(httpx.AsyncClient, 'request', new=get_response):
|
with patch.object(httpx.AsyncClient, 'request', new=get_response):
|
||||||
network = Network(enable_http=True, retries=0)
|
network = Network(enable_http=True, retries=0)
|
||||||
with self.assertRaises(httpx.RequestError):
|
with self.assertRaises(httpx.RequestError):
|
||||||
await network.request('GET', 'https://example.com/')
|
await network.request('GET', 'https://example.com/', raise_for_httperror=False)
|
||||||
await network.aclose()
|
await network.aclose()
|
||||||
|
|
||||||
|
|
||||||
|
@ -237,6 +237,6 @@ class TestNetworkStreamRetries(SearxTestCase):
|
||||||
|
|
||||||
with patch.object(httpx.AsyncClient, 'stream', new=stream):
|
with patch.object(httpx.AsyncClient, 'stream', new=stream):
|
||||||
network = Network(enable_http=True, retries=0, retry_on_http_error=403)
|
network = Network(enable_http=True, retries=0, retry_on_http_error=403)
|
||||||
response = await network.stream('GET', 'https://example.com/')
|
response = await network.stream('GET', 'https://example.com/', raise_for_httperror=False)
|
||||||
self.assertEqual(response.status_code, 403)
|
self.assertEqual(response.status_code, 403)
|
||||||
await network.aclose()
|
await network.aclose()
|
||||||
|
|
Loading…
Reference in New Issue