diff --git a/docs/src/searx.engines.google.rst b/docs/src/searx.engines.google.rst
index 2d10b5eea..cefe67359 100644
--- a/docs/src/searx.engines.google.rst
+++ b/docs/src/searx.engines.google.rst
@@ -30,6 +30,14 @@ Google WEB
 .. automodule:: searx.engines.google
   :members:
 
+.. _google custom search engine:
+
+Google Custom Search
+====================
+
+.. automodule:: searx.engines.google_cs
+  :members:
+
 .. _google images engine:
 
 Google Images
diff --git a/searx/engines/google_cs.py b/searx/engines/google_cs.py
new file mode 100644
index 000000000..8666336d0
--- /dev/null
+++ b/searx/engines/google_cs.py
@@ -0,0 +1,282 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""
+Google Custom Search API engine
+
+This engine uses Google's paid search API, which requires an API key and is not subject to CAPTCHA.
+The search API has a free tier of 100 queries/day and an initial cap of 10k searches/day, which can be raised
+by submitting a request.  The search uses a different algorithm than the one Google.com provides.
+
+Setting up
+----------
+
+1. Create a `Google Cloud project <https://console.cloud.google.com/projectcreate>`_
+2. *(optional)* Attach a billing account to the project to enable search quota above the free tier
+3. Enable the `Custom Search API <https://console.cloud.google.com/apis/library/customsearch.googleapis.com>`_
+4. Create an `API key <https://console.cloud.google.com/apis/credentials>`_
+5. *(optional)* Limit the API key to :guilabel:`Custom Search API` and the public IP address of the Searx server
+6. Create a `custom search engine <https://programmablesearchengine.google.com/>`_.
+
+   * Enable :guilabel:`Image search`
+   * Enable :guilabel:`Search the entire web`
+   * Other options are not required, including paid element API key
+
+7. Add the information to :file:`settings.yml`
+
+   .. code-block:: yaml
+
+      engines:
+        - name: google custom search
+          engine: google_cs
+          shortcut: gocs
+          api_key: Enter API key from step 4
+          cx: Enter search engine ID from step 6
+
+8. *(optional)* Protect the engine with :doc:`/admin/engines/private-engines` to prevent costly mistakes
+
+"""
+from urllib.parse import urlencode
+
+from searx.engines.google import get_lang_info
+from searx.exceptions import SearxEngineAPIException, SearxEngineTooManyRequestsException
+from searx.network import raise_for_httperror
+
+about = {
+    "website": 'https://www.google.com',
+    "wikidata_id": 'Q9366',
+    "official_api_documentation": 'https://developers.google.com/custom-search/v1/overview',
+    "use_official_api": True,
+    "require_api_key": True,
+    "results": 'JSON',
+}
+
+# engine dependent config
+categories = ['general', 'web', 'images']
+paging = True
+time_range_support = True
+safesearch = True
+send_accept_language_header = True
+
+# search-url
+base_url = "https://customsearch.googleapis.com/customsearch/v1?{query}"
+api_key = None
+cx = None
+number_of_results = 10  # 1 - 10
+
+# highest result index the API serves for a single query
+MAX_SEARCH_RESULT = 100
+
+# error reasons reported by the API when a quota or rate limit is exhausted
+QUOTA_ERROR_REASONS = frozenset(
+    ('quotaExceeded', 'dailyLimitExceeded', 'rateLimitExceeded', 'userRateLimitExceeded')
+)
+
+# values of ``dateRestrict``: the API reference spells the format ``d[number]``, the
+# brackets only mark the placeholder and are not part of the value
+time_range_map = {
+    'day': 'd1',
+    'week': 'w1',
+    'month': 'm1',
+    'year': 'y1',
+}
+
+# https://developers.google.com/custom-search/docs/json_api_reference#international-values
+# NOTE(review): ISO 639-1 has ``am`` for Amharic (``sm`` is Samoan) -- confirm ``sm`` against the table above
+supported_languages = {
+    "af": {"Name": "Afrikaans"},
+    "sq": {"Name": "Albanian"},
+    "sm": {"Name": "Amharic"},
+    "ar": {"Name": "Arabic"},
+    "az": {"Name": "Azerbaijani"},
+    "eu": {"Name": "Basque"},
+    "be": {"Name": "Belarusian"},
+    "bn": {"Name": "Bengali"},
+    "bh": {"Name": "Bihari"},
+    "bs": {"Name": "Bosnian"},
+    "bg": {"Name": "Bulgarian"},
+    "ca": {"Name": "Catalan"},
+    "zh-CN": {"Name": "Chinese (Simplified)"},
+    "zh-TW": {"Name": "Chinese (Traditional)"},
+    "hr": {"Name": "Croatian"},
+    "cs": {"Name": "Czech"},
+    "da": {"Name": "Danish"},
+    "nl": {"Name": "Dutch"},
+    "en": {"Name": "English"},
+    "eo": {"Name": "Esperanto"},
+    "et": {"Name": "Estonian"},
+    "fo": {"Name": "Faroese"},
+    "fi": {"Name": "Finnish"},
+    "fr": {"Name": "French"},
+    "fy": {"Name": "Frisian"},
+    "gl": {"Name": "Galician"},
+    "ka": {"Name": "Georgian"},
+    "de": {"Name": "German"},
+    "el": {"Name": "Greek"},
+    "gu": {"Name": "Gujarati"},
+    "iw": {"Name": "Hebrew"},
+    "hi": {"Name": "Hindi"},
+    "hu": {"Name": "Hungarian"},
+    "is": {"Name": "Icelandic"},
+    "id": {"Name": "Indonesian"},
+    "ia": {"Name": "Interlingua"},
+    "ga": {"Name": "Irish"},
+    "it": {"Name": "Italian"},
+    "ja": {"Name": "Japanese"},
+    "jw": {"Name": "Javanese"},
+    "kn": {"Name": "Kannada"},
+    "ko": {"Name": "Korean"},
+    "la": {"Name": "Latin"},
+    "lv": {"Name": "Latvian"},
+    "lt": {"Name": "Lithuanian"},
+    "mk": {"Name": "Macedonian"},
+    "ms": {"Name": "Malay"},
+    "ml": {"Name": "Malayam"},
+    "mt": {"Name": "Maltese"},
+    "mr": {"Name": "Marathi"},
+    "ne": {"Name": "Nepali"},
+    "no": {"Name": "Norwegian"},
+    "nn": {"Name": "Norwegian (Nynorsk)"},
+    "oc": {"Name": "Occitan"},
+    "fa": {"Name": "Persian"},
+    "pl": {"Name": "Polish"},
+    "pt-BR": {"Name": "Portuguese (Brazil)"},
+    "pt-PT": {"Name": "Portuguese (Portugal)"},
+    "pa": {"Name": "Punjabi"},
+    "ro": {"Name": "Romanian"},
+    "ru": {"Name": "Russian"},
+    "gd": {"Name": "Scots Gaelic"},
+    "sr": {"Name": "Serbian"},
+    "si": {"Name": "Sinhalese"},
+    "sk": {"Name": "Slovak"},
+    "sl": {"Name": "Slovenian"},
+    "es": {"Name": "Spanish"},
+    "su": {"Name": "Sudanese"},
+    "sw": {"Name": "Swahili"},
+    "sv": {"Name": "Swedish"},
+    "tl": {"Name": "Tagalog"},
+    "ta": {"Name": "Tamil"},
+    "te": {"Name": "Telugu"},
+    "th": {"Name": "Thai"},
+    "ti": {"Name": "Tigrinya"},
+    "tr": {"Name": "Turkish"},
+    "uk": {"Name": "Ukrainian"},
+    "ur": {"Name": "Urdu"},
+    "uz": {"Name": "Uzbek"},
+    "vi": {"Name": "Vietnamese"},
+    "cy": {"Name": "Welsh"},
+    "xh": {"Name": "Xhosa"},
+    "zu": {"Name": "Zulu"},
+}
+
+
+def request(query, params):
+    """Assemble the Custom Search API URL for ``query`` and store it in ``params['url']``.
+
+    :param query: search terms entered by the user
+    :param params: request parameters (``pageno``, ``safesearch``, ``category``, ``time_range`` and the
+        language settings read by ``get_lang_info``); updated in place
+    :raises PageTooLargeException: if the requested page starts beyond ``MAX_SEARCH_RESULT``
+    :return: the updated ``params``
+    """
+    # ``pageno`` and the API's ``start`` index are both 1-based: page 1 starts at result 1
+    start = (params['pageno'] - 1) * number_of_results + 1
+
+    if start > MAX_SEARCH_RESULT:
+        raise PageTooLargeException
+
+    args = {
+        'key': api_key,
+        'cx': cx,
+        'q': query,
+        'safe': 'active' if params['safesearch'] > 0 else 'off',
+        # never ask for results beyond MAX_SEARCH_RESULT on the last page
+        'num': min(number_of_results, MAX_SEARCH_RESULT - start + 1),
+        'start': start,
+    }
+
+    if params['category'] == 'images':
+        args['searchType'] = 'image'
+
+    if params.get('time_range') in time_range_map:
+        args['dateRestrict'] = time_range_map[params['time_range']]
+
+    lang_info = get_lang_info(params, supported_languages, {}, True)
+    args['gl'] = lang_info['country'].lower()
+    args['hl'] = lang_info['params']['hl']
+    if 'lr' in lang_info['params']:
+        args['lr'] = lang_info['params']['lr']
+
+    params['url'] = base_url.format(query=urlencode(args))
+    # HTTP errors are checked in response() so that quota errors can be reported as such
+    params['raise_for_httperror'] = False
+    return params
+
+
+def response(resp):
+    """Convert the API's JSON answer into a list of Searx results.
+
+    :param resp: HTTP response to the request built by ``request``
+    :raises SearxEngineTooManyRequestsException: if the API reports an exhausted quota
+    :return: a ``number_of_results`` entry, an optional ``correction`` entry and one entry per result item
+    """
+    if resp.status_code in (403, 429):
+        _raise_for_quota_error(resp)
+
+    # request() disabled the automatic check, so every other HTTP error is raised here
+    raise_for_httperror(resp)
+
+    result = resp.json()
+    metadata = [
+        {'number_of_results': min(MAX_SEARCH_RESULT, int(result['searchInformation']['totalResults'], 10))},
+    ]
+    search_type = result['queries']['request'][0].get('searchType', '')
+
+    if 'spelling' in result:
+        metadata.append({'correction': result['spelling']['correctedQuery']})
+
+    return metadata + [_convert_result(search, search_type) for search in result.get('items', [])]
+
+
+def _raise_for_quota_error(resp):
+    """Raise ``SearxEngineTooManyRequestsException`` if ``resp`` reports an exhausted quota.
+
+    The API wraps the error details in an ``error`` object holding a ``message`` and a list of ``errors``,
+    each with a ``reason``.  Bodies that are not JSON or do not have this shape are ignored, the caller then
+    falls back to the generic HTTP error handling.
+    """
+    try:
+        error = resp.json()['error']
+        reasons = {item.get('reason') for item in error.get('errors', [])}
+    except (ValueError, KeyError, AttributeError, TypeError):
+        return
+
+    if reasons & QUOTA_ERROR_REASONS:
+        raise SearxEngineTooManyRequestsException(message=error.get('message') or 'quota exceeded')
+
+
+def _convert_result(search, search_type=''):
+    """Convert `result JSON <https://developers.google.com/custom-search/v1/reference/rest/v1/Search#Result>`_
+    to Searx result"""
+    out = {
+        "url": search['link'],
+        "title": search['title'],
+        "content": search.get('snippet', ''),
+    }
+
+    if search_type == 'image' and 'image' in search:
+        image = search['image']
+        out['template'] = 'images.html'
+        out['img_src'] = search['link']
+        out['thumbnail_src'] = image['thumbnailLink']
+        out['img_format'] = f"{image['width']} x {image['height']} {search['fileFormat']}"
+        # the result points to the page showing the image, ``link`` is used as the image source
+        out['url'] = image['contextLink']
+
+    return out
+
+
+class PageTooLargeException(SearxEngineAPIException):
+    """Requested page size is over Google's maximum limit"""
+
+    def __init__(self):
+        super().__init__('Page size too large')
diff --git a/searx/settings.yml b/searx/settings.yml
index f21d7f05a..ad7f21ea6 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -707,6 +707,14 @@ engines:
     # additional_tests:
     #   android: *test_android
 
+  # - name: google custom search
+  #   engine: google_cs
+  #   shortcut: gocs
+  #   # API key and custom search engine ID are required, see
+  #   # https://docs.searxng.org/src/searx.engines.google.html#google-custom-search
+  #   api_key:
+  #   cx:
+
   # - name: google italian
   #   engine: google
   #   shortcut: goit