[enh] add raise_for_httperror

check HTTP response:
* detect some common CAPTCHA challenges (no solving). In this case the engine is suspended for a long time.
* otherwise raise HTTPError as before

the check is now done in poolrequests.py (it was previously done in search.py).

update qwant, wikipedia, wikidata to use raise_for_httperror instead of raise_for_status
This commit is contained in:
Alexandre Flament 2020-12-09 21:23:20 +01:00
parent 033f39bff7
commit d703119d3a
11 changed files with 179 additions and 56 deletions

View File

@ -134,9 +134,9 @@ The function ``def request(query, params):`` always returns the ``params``
variable. Inside searx, the following parameters can be used to specify a search
request:
================== =========== ==========================================================================
=================== =========== ==========================================================================
argument type information
================== =========== ==========================================================================
=================== =========== ==========================================================================
url string requested url
method string HTTP request method
headers set HTTP header information
@ -145,8 +145,8 @@ cookies set HTTP cookies
verify boolean Performing SSL-Validity check
max_redirects int maximum redirects, hard limit
soft_max_redirects int maximum redirects, soft limit. Record an error but don't stop the engine
raise_for_status bool True by default: raise an exception if the HTTP code of response is >= 300
================== =========== ==========================================================================
raise_for_httperror bool True by default: raise an exception if the HTTP code of response is >= 400
=================== =========== ==========================================================================
example code

View File

@ -281,7 +281,11 @@ def initialize_engines(engine_list):
load_engines(engine_list)
def engine_init(engine_name, init_fn):
try:
init_fn(get_engine_from_settings(engine_name))
except Exception:
logger.exception('%s engine: Fail to initialize', engine_name)
else:
logger.debug('%s engine: Initialized', engine_name)
for engine_name, engine in engines.items():

View File

@ -14,6 +14,8 @@ from datetime import datetime
from json import loads
from urllib.parse import urlencode
from searx.utils import html_to_text, match_language
from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException
from searx.raise_for_httperror import raise_for_httperror
# engine dependent config
@ -24,8 +26,7 @@ supported_languages_url = 'https://qwant.com/region'
category_to_keyword = {'general': 'web',
'images': 'images',
'news': 'news',
'social media': 'social'}
'news': 'news'}
# search-url
url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4'
@ -51,6 +52,7 @@ def request(query, params):
params['url'] += '&locale=' + language.replace('-', '_').lower()
params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
params['raise_for_httperror'] = False
return params
@ -58,8 +60,20 @@ def request(query, params):
def response(resp):
results = []
# According to https://www.qwant.com/js/app.js
if resp.status_code == 429:
raise SearxEngineCaptchaException()
# raise for other errors
raise_for_httperror(resp)
# load JSON result
search_results = loads(resp.text)
# check for an API error
if search_results.get('status') != 'success':
raise SearxEngineAPIException('API error ' + str(search_results.get('error', '')))
# return empty array if there are no results
if 'data' not in search_results:
return []
@ -90,15 +104,6 @@ def response(resp):
'thumbnail_src': thumbnail_src,
'img_src': img_src})
elif category_to_keyword.get(categories[0], '') == 'social':
published_date = datetime.fromtimestamp(result['date'], None)
img_src = result.get('img', None)
results.append({'url': res_url,
'title': title,
'publishedDate': published_date,
'content': content,
'img_src': img_src})
elif category_to_keyword.get(categories[0], '') == 'news':
published_date = datetime.fromtimestamp(result['date'], None)
media = result.get('media', [])

View File

@ -161,9 +161,6 @@ def request(query, params):
def response(resp):
results = []
if resp.status_code != 200:
logger.debug('SPARQL endpoint error %s', resp.content.decode())
resp.raise_for_status()
jsonresponse = loads(resp.content.decode())
language = resp.search_params['language'].lower()

View File

@ -14,6 +14,7 @@ from urllib.parse import quote
from json import loads
from lxml.html import fromstring
from searx.utils import match_language, searx_useragent
from searx.raise_for_httperror import raise_for_httperror
# search-url
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
@ -37,7 +38,7 @@ def request(query, params):
language=url_lang(params['language']))
params['headers']['User-Agent'] = searx_useragent()
params['raise_for_status'] = False
params['raise_for_httperror'] = False
params['soft_max_redirects'] = 2
return params
@ -47,6 +48,7 @@ def request(query, params):
def response(resp):
if resp.status_code == 404:
return []
raise_for_httperror(resp)
results = []
api_result = loads(resp.text)

View File

@ -64,8 +64,33 @@ class SearxEngineAPIException(SearxEngineResponseException):
"""The website has returned an application error"""
class SearxEngineCaptchaException(SearxEngineResponseException):
"""The website has returned a CAPTCHA"""
class SearxEngineAccessDeniedException(SearxEngineResponseException):
    """The website is blocking the access.

    The constructor stores its two arguments on the instance:

    * ``suspended_time``: how long the engine should stay suspended
      (the default, ``24 * 3600``, corresponds to one day in seconds).
    * ``message``: short description of the reason.

    The text handed to the parent exception is
    ``message + ', suspended_time=' + str(suspended_time)``.
    """

    def __init__(self, suspended_time=24 * 3600, message='Access denied'):
        description = message + ', suspended_time=' + str(suspended_time)
        super().__init__(description)
        self.message = message
        self.suspended_time = suspended_time
class SearxEngineCaptchaException(SearxEngineAccessDeniedException):
    """The website has returned a CAPTCHA.

    Unless another ``suspended_time`` is given, searx stops sending requests
    to the engine for 1 day (``24 * 3600``).
    """

    def __init__(self, suspended_time=24 * 3600, message='CAPTCHA'):
        # Only the default message differs from the parent class.
        super().__init__(suspended_time=suspended_time, message=message)
class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException):
    """The website has returned a "Too Many Requests" status code.

    Unless another ``suspended_time`` is given, searx stops sending requests
    to the engine for 1 hour (``3600``).
    """

    def __init__(self, suspended_time=3600, message='Too many request'):
        # Only the default duration and message differ from the parent class.
        super().__init__(suspended_time=suspended_time, message=message)
class SearxEngineXPathException(SearxEngineResponseException):

View File

@ -4,7 +4,8 @@ import logging
from json import JSONDecodeError
from urllib.parse import urlparse
from requests.exceptions import RequestException
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
from searx.exceptions import (SearxXPathSyntaxException, SearxEngineXPathException, SearxEngineAPIException,
SearxEngineAccessDeniedException)
from searx import logger
@ -100,6 +101,10 @@ def get_messages(exc, filename) -> typing.Tuple:
return (exc.xpath_str, exc.message)
if isinstance(exc, SearxEngineXPathException):
return (exc.xpath_str, exc.message)
if isinstance(exc, SearxEngineAPIException):
return (str(exc.args[0]), )
if isinstance(exc, SearxEngineAccessDeniedException):
return (exc.message, )
return ()

View File

@ -7,6 +7,7 @@ import requests
from searx import settings
from searx import logger
from searx.raise_for_httperror import raise_for_httperror
logger = logger.getChild('poolrequests')
@ -156,6 +157,12 @@ def request(method, url, **kwargs):
if timeout is not None:
kwargs['timeout'] = timeout
# raise_for_httperror
check_for_httperror = True
if 'raise_for_httperror' in kwargs:
check_for_httperror = kwargs['raise_for_httperror']
del kwargs['raise_for_httperror']
# do request
response = session.request(method=method, url=url, **kwargs)
@ -176,6 +183,10 @@ def request(method, url, **kwargs):
if hasattr(threadLocal, 'total_time'):
threadLocal.total_time += time_after_request - time_before_request
# raise an exception
if check_for_httperror:
raise_for_httperror(response)
return response

View File

@ -0,0 +1,66 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Raise an exception when an HTTP response is an error.
"""
from searx.exceptions import (SearxEngineCaptchaException, SearxEngineTooManyRequestsException,
SearxEngineAccessDeniedException)
def is_cloudflare_challenge(resp):
    """Tell whether ``resp`` carries the markers of a Cloudflare challenge page.

    Args:
        resp: response exposing ``status_code`` and ``text``.

    Returns:
        bool: True when
        * the status is 403 and the body contains ``__cf_chl_captcha_tk__=``, or
        * the status is 429 or 503 and the body contains either
          ``__cf_chl_jschl_tk__=`` or all three challenge-platform markers.
    """
    status = resp.status_code
    if status == 403:
        return '__cf_chl_captcha_tk__=' in resp.text
    if status not in (429, 503):
        # the body is not inspected for any other status code
        return False

    body = resp.text
    if '__cf_chl_jschl_tk__=' in body:
        return True
    platform_markers = (
        '/cdn-cgi/challenge-platform/',
        'orchestrate/jsch/v1',
        'window._cf_chl_enter(',
    )
    return all(marker in body for marker in platform_markers)
def is_cloudflare_firewall(resp):
    """Tell whether ``resp`` is a 403 page showing Cloudflare error code 1020.

    Args:
        resp: response exposing ``status_code`` and ``text``.

    Returns:
        bool: True when the status is 403 and the body contains the
        ``cf-error-code`` span with the value 1020.
    """
    if resp.status_code != 403:
        return False
    return '<span class="cf-error-code">1020</span>' in resp.text
def raise_for_cloudflare_captcha(resp):
    """Raise when a response served by Cloudflare is a challenge or a firewall block.

    Nothing is checked unless the ``Server`` response header starts with
    ``cloudflare``.

    Args:
        resp: response exposing ``headers``, ``status_code`` and ``text``.

    Raises:
        SearxEngineCaptchaException: ``is_cloudflare_challenge(resp)`` is true;
            the suspension time is 15 days.
        SearxEngineAccessDeniedException: ``is_cloudflare_firewall(resp)`` is true;
            the suspension time is 1 day.
    """
    if not resp.headers.get('Server', '').startswith('cloudflare'):
        return

    if is_cloudflare_challenge(resp):
        # https://support.cloudflare.com/hc/en-us/articles/200170136-Understanding-Cloudflare-Challenge-Passage-Captcha-
        # suspend for 15 days
        raise SearxEngineCaptchaException(message='Cloudflare CAPTCHA', suspended_time=3600 * 24 * 15)

    if is_cloudflare_firewall(resp):
        # suspend for 1 day
        raise SearxEngineAccessDeniedException(message='Cloudflare Firewall', suspended_time=3600 * 24)
def raise_for_recaptcha(resp):
    """Raise when a 503 response embeds a Google reCAPTCHA URL.

    Args:
        resp: response exposing ``status_code`` and ``text``.

    Raises:
        SearxEngineCaptchaException: the status is 503 and the body contains
            ``"https://www.google.com/recaptcha/``; the suspension time is 7 days.
    """
    if resp.status_code != 503:
        return
    if '"https://www.google.com/recaptcha/' in resp.text:
        raise SearxEngineCaptchaException(message='ReCAPTCHA', suspended_time=3600 * 24 * 7)
def raise_for_captcha(resp):
    """Run every CAPTCHA / blocking detector on ``resp``.

    The Cloudflare check runs first, then the reCAPTCHA check; the first
    detector that matches raises its exception.

    Args:
        resp: response passed unchanged to each detector.
    """
    for detector in (raise_for_cloudflare_captcha, raise_for_recaptcha):
        detector(resp)
def raise_for_httperror(resp):
    """Raise an exception when the HTTP response is an error.

    Responses without a status code, or with a status code below 400, are
    left alone.

    Args:
        resp (requests.Response): Response to check

    Raises:
        requests.HTTPError: raised by resp.raise_for_status()
        searx.exceptions.SearxEngineAccessDeniedException: raised when the HTTP status code is 402 or 403.
        searx.exceptions.SearxEngineTooManyRequestsException: raised when the HTTP status code is 429.
        searx.exceptions.SearxEngineCaptchaException: raised when a CAPTCHA challenge is detected.
    """
    status = resp.status_code
    if not status or status < 400:
        return

    # CAPTCHA detection comes first: a challenge page also uses 403 / 429 / 503
    raise_for_captcha(resp)

    if status in (402, 403):
        raise SearxEngineAccessDeniedException(message='HTTP error ' + str(status),
                                               suspended_time=3600 * 24)
    if status == 429:
        raise SearxEngineTooManyRequestsException()

    # any other error status: let requests build the HTTPError
    resp.raise_for_status()

View File

@ -32,7 +32,8 @@ from searx.utils import gen_useragent
from searx.results import ResultContainer
from searx import logger
from searx.plugins import plugins
from searx.exceptions import SearxEngineCaptchaException
from searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException,
SearxEngineTooManyRequestsException,)
from searx.metrology.error_recorder import record_exception, record_error
@ -131,6 +132,9 @@ def send_http_request(engine, request_params):
# soft_max_redirects
soft_max_redirects = request_params.get('soft_max_redirects', max_redirects or 0)
# raise_for_httperror
request_args['raise_for_httperror'] = request_params.get('raise_for_httperror', False)
# specific type of request (GET or POST)
if request_params['method'] == 'GET':
req = requests_lib.get
@ -142,10 +146,6 @@ def send_http_request(engine, request_params):
# send the request
response = req(request_params['url'], **request_args)
# check HTTP status
if request_params.get('raise_for_status'):
response.raise_for_status()
# check soft limit of the redirect count
if len(response.history) > soft_max_redirects:
# unexpected redirect : record an error
@ -191,6 +191,7 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
# suppose everything will be alright
requests_exception = False
suspended_time = None
try:
# send requests and parse the results
@ -240,6 +241,15 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
elif (issubclass(e.__class__, SearxEngineCaptchaException)):
result_container.add_unresponsive_engine(engine_name, 'CAPTCHA required')
logger.exception('engine {0} : CAPTCHA')
suspended_time = e.suspended_time # pylint: disable=no-member
elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)):
result_container.add_unresponsive_engine(engine_name, 'too many requests')
logger.exception('engine {0} : Too many requests')
suspended_time = e.suspended_time # pylint: disable=no-member
elif (issubclass(e.__class__, SearxEngineAccessDeniedException)):
result_container.add_unresponsive_engine(engine_name, 'blocked')
logger.exception('engine {0} : Searx is blocked')
suspended_time = e.suspended_time # pylint: disable=no-member
else:
result_container.add_unresponsive_engine(engine_name, 'unexpected crash')
# others errors
@ -248,16 +258,18 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
if getattr(threading.current_thread(), '_timeout', False):
record_error(engine_name, 'Timeout')
# suspend or not the engine if there are HTTP errors
# suspend the engine if there is an HTTP error
# or suspended_time is defined
with threading.RLock():
if requests_exception:
if requests_exception or suspended_time:
# update continuous_errors / suspend_end_time
engine.continuous_errors += 1
engine.suspend_end_time = time() + min(settings['search']['max_ban_time_on_fail'],
if suspended_time is None:
suspended_time = min(settings['search']['max_ban_time_on_fail'],
engine.continuous_errors * settings['search']['ban_time_on_fail'])
engine.suspend_end_time = time() + suspended_time
else:
# no HTTP error (perhaps an engine error)
# anyway, reset the suspend variables
# reset the suspend variables
engine.continuous_errors = 0
engine.suspend_end_time = 0
@ -342,7 +354,7 @@ def default_request_params():
'cookies': {},
'verify': True,
'auth': None,
'raise_for_status': True
'raise_for_httperror': True
}

View File

@ -647,11 +647,6 @@ engines:
shortcut : qwn
categories : news
- name : qwant social
engine : qwant
shortcut : qws
categories : social media
# - name: library
# engine: recoll
# shortcut: lib
@ -817,12 +812,13 @@ engines:
# Or you can use the html non-stable engine, activated by default
engine : youtube_noapi
- name : yggtorrent
engine : yggtorrent
shortcut : ygg
url: https://www2.yggtorrent.si/
disabled : True
timeout : 4.0
# tmp suspended: Cloudflare CAPTCHA
#- name : yggtorrent
# engine : yggtorrent
# shortcut : ygg
# url: https://www2.yggtorrent.si/
# disabled : True
# timeout : 4.0
- name : dailymotion
engine : dailymotion