mirror of
				https://github.com/searxng/searxng
				synced 2024-01-01 19:24:07 +01:00 
			
		
		
		
	[fix] engine: duckduckgo - CAPTCHA detection
The previous implementation could not distinguish a CAPTCHA response from an ordinary result list: a CAPTCHA page was treated as a result list containing no items. DDG does not block IPs; instead, it places a CAPTCHA wall in front of any request it considers dubious. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
		
							parent
							
								
									88caa1d7db
								
							
						
					
					
						commit
						050451347b
					
				
					 2 changed files with 18 additions and 5 deletions
				
			
		|  | @ -25,6 +25,7 @@ from searx.network import get  # see https://github.com/searxng/searxng/issues/7 | ||||||
| from searx import redisdb | from searx import redisdb | ||||||
| from searx.enginelib.traits import EngineTraits | from searx.enginelib.traits import EngineTraits | ||||||
| from searx.utils import extr | from searx.utils import extr | ||||||
|  | from searx.exceptions import SearxEngineCaptchaException | ||||||
| 
 | 
 | ||||||
| if TYPE_CHECKING: | if TYPE_CHECKING: | ||||||
|     import logging |     import logging | ||||||
|  | @ -292,6 +293,15 @@ def request(query, params): | ||||||
|     return params |     return params | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def detect_ddg_captcha(dom): | ||||||
|  |     """In case of CAPTCHA ddg open its own *not a Robot* dialog and is | ||||||
|  |     not redirected to CAPTCHA page. | ||||||
|  |     """ | ||||||
|  |     if eval_xpath(dom, "//form[@id='challenge-form']"): | ||||||
|  |         # set suspend time to zero is OK --> ddg does not block the IP | ||||||
|  |         raise SearxEngineCaptchaException(suspended_time=0) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def response(resp): | def response(resp): | ||||||
| 
 | 
 | ||||||
|     if resp.status_code == 303: |     if resp.status_code == 303: | ||||||
|  | @ -299,6 +309,7 @@ def response(resp): | ||||||
| 
 | 
 | ||||||
|     results = [] |     results = [] | ||||||
|     doc = lxml.html.fromstring(resp.text) |     doc = lxml.html.fromstring(resp.text) | ||||||
|  |     detect_ddg_captcha(doc) | ||||||
| 
 | 
 | ||||||
|     result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') |     result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,6 +1,7 @@ | ||||||
| # SPDX-License-Identifier: AGPL-3.0-or-later | # SPDX-License-Identifier: AGPL-3.0-or-later | ||||||
| """Exception types raised by SearXNG modules. | """Exception types raised by SearXNG modules. | ||||||
| """ | """ | ||||||
|  | from __future__ import annotations | ||||||
| 
 | 
 | ||||||
| from typing import Optional, Union | from typing import Optional, Union | ||||||
| 
 | 
 | ||||||
|  | @ -61,7 +62,7 @@ class SearxEngineAccessDeniedException(SearxEngineResponseException): | ||||||
|     """This settings contains the default suspended time (default 86400 sec / 1 |     """This settings contains the default suspended time (default 86400 sec / 1 | ||||||
|     day).""" |     day).""" | ||||||
| 
 | 
 | ||||||
|     def __init__(self, suspended_time: int = None, message: str = 'Access denied'): |     def __init__(self, suspended_time: int | None = None, message: str = 'Access denied'): | ||||||
|         """Generic exception to raise when an engine denies access to the results. |         """Generic exception to raise when an engine denies access to the results. | ||||||
| 
 | 
 | ||||||
|         :param suspended_time: How long the engine is going to be suspended in |         :param suspended_time: How long the engine is going to be suspended in | ||||||
|  | @ -70,12 +71,13 @@ class SearxEngineAccessDeniedException(SearxEngineResponseException): | ||||||
|         :param message: Internal message.  Defaults to ``Access denied`` |         :param message: Internal message.  Defaults to ``Access denied`` | ||||||
|         :type message: str |         :type message: str | ||||||
|         """ |         """ | ||||||
|         suspended_time = suspended_time or self._get_default_suspended_time() |         if suspended_time is None: | ||||||
|  |             suspended_time = self._get_default_suspended_time() | ||||||
|         super().__init__(message + ', suspended_time=' + str(suspended_time)) |         super().__init__(message + ', suspended_time=' + str(suspended_time)) | ||||||
|         self.suspended_time = suspended_time |         self.suspended_time = suspended_time | ||||||
|         self.message = message |         self.message = message | ||||||
| 
 | 
 | ||||||
|     def _get_default_suspended_time(self): |     def _get_default_suspended_time(self) -> int: | ||||||
|         from searx import get_setting  # pylint: disable=C0415 |         from searx import get_setting  # pylint: disable=C0415 | ||||||
| 
 | 
 | ||||||
|         return get_setting(self.SUSPEND_TIME_SETTING) |         return get_setting(self.SUSPEND_TIME_SETTING) | ||||||
|  | @ -88,7 +90,7 @@ class SearxEngineCaptchaException(SearxEngineAccessDeniedException): | ||||||
|     """This settings contains the default suspended time (default 86400 sec / 1 |     """This settings contains the default suspended time (default 86400 sec / 1 | ||||||
|     day).""" |     day).""" | ||||||
| 
 | 
 | ||||||
|     def __init__(self, suspended_time=None, message='CAPTCHA'): |     def __init__(self, suspended_time: int | None = None, message='CAPTCHA'): | ||||||
|         super().__init__(message=message, suspended_time=suspended_time) |         super().__init__(message=message, suspended_time=suspended_time) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -102,7 +104,7 @@ class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException): | ||||||
|     """This settings contains the default suspended time (default 3660 sec / 1 |     """This settings contains the default suspended time (default 3660 sec / 1 | ||||||
|     hour).""" |     hour).""" | ||||||
| 
 | 
 | ||||||
|     def __init__(self, suspended_time=None, message='Too many request'): |     def __init__(self, suspended_time: int | None = None, message='Too many request'): | ||||||
|         super().__init__(message=message, suspended_time=suspended_time) |         super().__init__(message=message, suspended_time=suspended_time) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		
		Reference in a new issue
	
	 Markus Heiser
						Markus Heiser