mirror of https://github.com/searxng/searxng

bing.py: resolve bing.com/ck/a redirections

add a new function searx.network.multi_requests to send multiple HTTP requests at once
parent 2864a67ce9
commit a1e8af0796

4 changed files with 151 additions and 55 deletions
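A minimal usage sketch of the new function (the URLs are hypothetical placeholders; this assumes it runs inside a searx process where the network event loop is up):

    from searx.network import multi_requests, Request

    # describe the requests first, without sending them
    request_list = [
        Request.get('https://example.org/a'),
        Request.post('https://example.org/b', data={'q': 'test'}),
    ]

    # send them in parallel and wait until all of them have finished;
    # each item is either an httpx.Response or the Exception raised while sending
    for response in multi_requests(request_list):
        if isinstance(response, Exception):
            print('request failed:', response)
        else:
            print(response.status_code, str(response.url))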
@@ -8,7 +8,8 @@
 import re
 from urllib.parse import urlencode, urlparse, parse_qs
 from lxml import html
-from searx.utils import eval_xpath, extract_text, match_language
+from searx.utils import eval_xpath, extract_text, eval_xpath_list, match_language
+from searx.network import multi_requests, Request
 
 about = {
     "website": 'https://www.bing.com',
@@ -79,30 +80,48 @@ def response(resp):
 
     dom = html.fromstring(resp.text)
 
     for result in eval_xpath(dom, '//div[@class="sa_cc"]'):
 
+        # IMO //div[@class="sa_cc"] no longer matches
+        logger.debug('found //div[@class="sa_cc"] --> %s', result)
+
         link = eval_xpath(result, './/h3/a')[0]
         url = link.attrib.get('href')
         title = extract_text(link)
         content = extract_text(eval_xpath(result, './/p'))
 
         # append result
         results.append({'url': url, 'title': title, 'content': content})
 
-    # parse results again if nothing is found yet
-    for result in eval_xpath(dom, '//li[@class="b_algo"]'):
-
+    url_to_resolve = []
+    url_to_resolve_index = []
+    for i, result in enumerate(eval_xpath_list(dom, '//li[@class="b_algo"]')):
+
         link = eval_xpath(result, './/h2/a')[0]
         url = link.attrib.get('href')
         title = extract_text(link)
         content = extract_text(eval_xpath(result, './/p'))
 
+        # get the real URL either using the URL shown to user or following the Bing URL
+        if url.startswith('https://www.bing.com/ck/a?'):
+            url_cite = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite'))
+            # Bing can shorten the URL either at the end or in the middle of the string
+            if (
+                url_cite.startswith('https://')
+                and '…' not in url_cite
+                and '...' not in url_cite
+                and '›' not in url_cite
+            ):
+                # no need for an additional HTTP request
+                url = url_cite
+            else:
+                # resolve the URL with an additional HTTP request
+                url_to_resolve.append(url.replace('&ntb=1', '&ntb=F'))
+                url_to_resolve_index.append(i)
+                url = None  # remove the result if the HTTP Bing redirect raises an exception
+
         # append result
         results.append({'url': url, 'title': title, 'content': content})
 
+    # resolve all Bing redirections in parallel
+    request_list = [
+        Request.get(u, allow_redirects=False, headers=resp.search_params['headers']) for u in url_to_resolve
+    ]
+    response_list = multi_requests(request_list)
+    for i, redirect_response in enumerate(response_list):
+        if not isinstance(redirect_response, Exception):
+            results[url_to_resolve_index[i]]['url'] = redirect_response.headers['location']
+
     # get number_of_results
     try:
         result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
         if "-" in result_len_container:
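The resolution step above works because bing.com/ck/a answers with an HTTP redirect: with redirects disabled, the real result URL arrives in the Location header, so one extra round trip per shortened URL is enough. A standalone sketch of that single hop (plain httpx; a hypothetical helper, not part of the commit):

    import httpx

    def resolve_redirect(url: str) -> str:
        # fetch without following the redirect
        response = httpx.get(url, follow_redirects=False)
        # a 3xx answer carries the target in the Location header
        if response.is_redirect:
            return response.headers['location']
        return url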
@@ -8,7 +8,8 @@ import concurrent.futures
 from queue import SimpleQueue
 from types import MethodType
 from timeit import default_timer
-from typing import Iterable, Tuple
+from typing import Iterable, NamedTuple, Tuple, List, Dict, Union
+from contextlib import contextmanager
 
 import httpx
 import anyio
@@ -48,9 +49,23 @@ def get_context_network():
     return THREADLOCAL.__dict__.get('network') or get_network()
 
 
-def request(method, url, **kwargs):
-    """same as requests/requests/api.py request(...)"""
+@contextmanager
+def _record_http_time():
+    # pylint: disable=too-many-branches
     time_before_request = default_timer()
+    start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
+    try:
+        yield start_time
+    finally:
+        # update total_time.
+        # See get_time_for_thread() and reset_time_for_thread()
+        if hasattr(THREADLOCAL, 'total_time'):
+            time_after_request = default_timer()
+            THREADLOCAL.total_time += time_after_request - time_before_request
+
+
+def _get_timeout(start_time, kwargs):
+    # pylint: disable=too-many-branches
 
     # timeout (httpx)
     if 'timeout' in kwargs:
@@ -65,45 +80,84 @@ def request(method, url, **kwargs):
 
     # adjust actual timeout
     timeout += 0.2  # overhead
-    start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
     if start_time:
         timeout -= default_timer() - start_time
 
-    # raise_for_error
-    check_for_httperror = True
-    if 'raise_for_httperror' in kwargs:
-        check_for_httperror = kwargs['raise_for_httperror']
-        del kwargs['raise_for_httperror']
+    return timeout
 
-    # requests compatibility
-    if isinstance(url, bytes):
-        url = url.decode()
 
-    # network
-    network = get_context_network()
+def request(method, url, **kwargs):
+    """same as requests/requests/api.py request(...)"""
+    with _record_http_time() as start_time:
+        network = get_context_network()
+        timeout = _get_timeout(start_time, kwargs)
+        future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
+        try:
+            return future.result(timeout)
+        except concurrent.futures.TimeoutError as e:
+            raise httpx.TimeoutException('Timeout', request=None) from e
 
-    # do request
-    future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
-    try:
-        response = future.result(timeout)
-    except concurrent.futures.TimeoutError as e:
-        raise httpx.TimeoutException('Timeout', request=None) from e
 
-    # requests compatibility
-    # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
-    response.ok = not response.is_error
+def multi_requests(request_list: List["Request"]) -> List[Union[httpx.Response, Exception]]:
+    """Send multiple HTTP requests in parallel. Wait for all requests to finish."""
+    with _record_http_time() as start_time:
+        # send the requests
+        network = get_context_network()
+        loop = get_loop()
+        future_list = []
+        for request_desc in request_list:
+            timeout = _get_timeout(start_time, request_desc.kwargs)
+            future = asyncio.run_coroutine_threadsafe(
+                network.request(request_desc.method, request_desc.url, **request_desc.kwargs), loop
+            )
+            future_list.append((future, timeout))
 
-    # update total_time.
-    # See get_time_for_thread() and reset_time_for_thread()
-    if hasattr(THREADLOCAL, 'total_time'):
-        time_after_request = default_timer()
-        THREADLOCAL.total_time += time_after_request - time_before_request
+        # read the responses
+        responses = []
+        for future, timeout in future_list:
+            try:
+                responses.append(future.result(timeout))
+            except concurrent.futures.TimeoutError:
+                responses.append(httpx.TimeoutException('Timeout', request=None))
+            except Exception as e:  # pylint: disable=broad-except
+                responses.append(e)
+        return responses
 
-    # raise an exception
-    if check_for_httperror:
-        raise_for_httperror(response)
 
-    return response
+class Request(NamedTuple):
+    """Request description for the multi_requests function"""
+
+    method: str
+    url: str
+    kwargs: Dict[str, str] = {}
+
+    @staticmethod
+    def get(url, **kwargs):
+        return Request('GET', url, kwargs)
+
+    @staticmethod
+    def options(url, **kwargs):
+        return Request('OPTIONS', url, kwargs)
+
+    @staticmethod
+    def head(url, **kwargs):
+        return Request('HEAD', url, kwargs)
+
+    @staticmethod
+    def post(url, **kwargs):
+        return Request('POST', url, kwargs)
+
+    @staticmethod
+    def put(url, **kwargs):
+        return Request('PUT', url, kwargs)
+
+    @staticmethod
+    def patch(url, **kwargs):
+        return Request('PATCH', url, kwargs)
+
+    @staticmethod
+    def delete(url, **kwargs):
+        return Request('DELETE', url, kwargs)
+
 
 def get(url, **kwargs):
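The refactor splits the old monolithic request() into _record_http_time() and _get_timeout() so that multi_requests() can reuse both. A self-contained sketch of the timing pattern (threading.local standing in for searx's THREADLOCAL; hypothetical names, not the commit's code):

    import threading
    from contextlib import contextmanager
    from timeit import default_timer

    THREADLOCAL = threading.local()

    @contextmanager
    def record_http_time():
        time_before = default_timer()
        try:
            yield
        finally:
            # runs exactly once, even when the body raises,
            # so the per-thread total stays consistent
            if hasattr(THREADLOCAL, 'total_time'):
                THREADLOCAL.total_time += default_timer() - time_before

    THREADLOCAL.total_time = 0.0
    with record_http_time():
        pass  # the HTTP call(s) would happen here
    print(THREADLOCAL.total_time)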
@@ -13,6 +13,7 @@ import httpx
 
 from searx import logger, searx_debug
 from .client import new_client, get_loop, AsyncHTTPTransportNoHttp
+from .raise_for_httperror import raise_for_httperror
 
 
 logger = logger.getChild('network')
@@ -226,6 +227,27 @@ class Network:
             kwargs['follow_redirects'] = kwargs.pop('allow_redirects')
         return kwargs_clients
 
+    @staticmethod
+    def extract_do_raise_for_httperror(kwargs):
+        do_raise_for_httperror = True
+        if 'raise_for_httperror' in kwargs:
+            do_raise_for_httperror = kwargs['raise_for_httperror']
+            del kwargs['raise_for_httperror']
+        return do_raise_for_httperror
+
+    @staticmethod
+    def patch_response(response, do_raise_for_httperror):
+        if isinstance(response, httpx.Response):
+            # requests compatibility (response is not streamed)
+            # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
+            response.ok = not response.is_error
+
+            # raise an exception
+            if do_raise_for_httperror:
+                raise_for_httperror(response)
+
+        return response
+
     def is_valid_response(self, response):
         # pylint: disable=too-many-boolean-expressions
         if (
@@ -239,6 +261,7 @@ class Network:
     async def call_client(self, stream, method, url, **kwargs):
         retries = self.retries
         was_disconnected = False
+        do_raise_for_httperror = Network.extract_do_raise_for_httperror(kwargs)
         kwargs_clients = Network.extract_kwargs_clients(kwargs)
         while retries >= 0:  # pragma: no cover
             client = await self.get_client(**kwargs_clients)
@@ -248,7 +271,7 @@ class Network:
                 else:
                     response = await client.request(method, url, **kwargs)
                 if self.is_valid_response(response) or retries <= 0:
-                    return response
+                    return Network.patch_response(response, do_raise_for_httperror)
             except httpx.RemoteProtocolError as e:
                 if not was_disconnected:
                     # the server has closed the connection:
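With raise_for_httperror handled inside call_client() via patch_response(), an HTTP error can only raise after the retry loop has settled on a final response; the tests below therefore pass raise_for_httperror=False explicitly so they can keep inspecting raw status codes. A hypothetical standalone illustration of that ordering (plain httpx, not searxng code):

    import httpx

    def fetch_with_retries(client: httpx.Client, url: str, retries: int = 1,
                           raise_for_httperror: bool = True) -> httpx.Response:
        # retry while the server answers with an error status
        response = client.get(url)
        while response.is_error and retries > 0:
            retries -= 1
            response = client.get(url)
        # requests-style convenience flag, as Network.patch_response sets it
        response.ok = not response.is_error
        # raise only for the final response, so a retryable 4xx/5xx
        # cannot escape as an exception while retries remain
        if raise_for_httperror:
            response.raise_for_status()
        return response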
@@ -141,28 +141,28 @@ class TestNetworkRequestRetries(SearxTestCase):
     async def test_retries_ok(self):
         with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
             network = Network(enable_http=True, retries=1, retry_on_http_error=403)
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.text, TestNetworkRequestRetries.TEXT)
             await network.aclose()
 
     async def test_retries_fail_int(self):
         with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
             network = Network(enable_http=True, retries=0, retry_on_http_error=403)
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 403)
             await network.aclose()
 
     async def test_retries_fail_list(self):
         with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
             network = Network(enable_http=True, retries=0, retry_on_http_error=[403, 429])
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 403)
             await network.aclose()
 
     async def test_retries_fail_bool(self):
         with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()):
             network = Network(enable_http=True, retries=0, retry_on_http_error=True)
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 403)
             await network.aclose()
 
@@ -178,7 +178,7 @@ class TestNetworkRequestRetries(SearxTestCase):
 
         with patch.object(httpx.AsyncClient, 'request', new=get_response):
             network = Network(enable_http=True, retries=2)
-            response = await network.request('GET', 'https://example.com/')
+            response = await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 200)
             self.assertEqual(response.text, TestNetworkRequestRetries.TEXT)
             await network.aclose()
@@ -190,7 +190,7 @@ class TestNetworkRequestRetries(SearxTestCase):
         with patch.object(httpx.AsyncClient, 'request', new=get_response):
             network = Network(enable_http=True, retries=0)
             with self.assertRaises(httpx.RequestError):
-                await network.request('GET', 'https://example.com/')
+                await network.request('GET', 'https://example.com/', raise_for_httperror=False)
             await network.aclose()
 
 
@@ -237,6 +237,6 @@ class TestNetworkStreamRetries(SearxTestCase):
 
         with patch.object(httpx.AsyncClient, 'stream', new=stream):
             network = Network(enable_http=True, retries=0, retry_on_http_error=403)
-            response = await network.stream('GET', 'https://example.com/')
+            response = await network.stream('GET', 'https://example.com/', raise_for_httperror=False)
             self.assertEqual(response.status_code, 403)
             await network.aclose()
Author: Alexandre Flament