From f63ffbb22bd2dab5921e79a3ac934885828f35f0 Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Sat, 16 Oct 2021 21:56:08 +0200
Subject: [PATCH] [fix] engine - yahoo: rewrite and fix issues

Languages are supported by mapping the language to a domain.  If the domain is
not found in :py:obj:`lang2domain`, the URL ``<lang>.search.yahoo.com`` is used.

Also fixes the issue reported at https://github.com/searx/searx/issues/3020

Signed-off-by: Markus Heiser
---
 docs/src/searx.engines.yahoo.rst  |   8 ++
 searx/data/engines_languages.json |   4 +-
 searx/engines/yahoo.py            | 201 +++++++++++++++---------------
 3 files changed, 111 insertions(+), 102 deletions(-)
 create mode 100644 docs/src/searx.engines.yahoo.rst

diff --git a/docs/src/searx.engines.yahoo.rst b/docs/src/searx.engines.yahoo.rst
new file mode 100644
index 000000000..df08550ca
--- /dev/null
+++ b/docs/src/searx.engines.yahoo.rst
@@ -0,0 +1,8 @@
+.. _yahoo engine:
+
+============
+Yahoo Engine
+============
+
+.. automodule:: searx.engines.yahoo
+  :members:
diff --git a/searx/data/engines_languages.json b/searx/data/engines_languages.json
index a4b0a8a5d..2e57a8f9e 100644
--- a/searx/data/engines_languages.json
+++ b/searx/data/engines_languages.json
@@ -28418,7 +28418,7 @@
         "sv",
         "th",
         "tr",
-        "zh-CHS",
-        "zh-CHT"
+        "zh_chs",
+        "zh_cht"
     ]
 }
\ No newline at end of file
diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py
index b02af518d..09d06aff0 100644
--- a/searx/engines/yahoo.py
+++ b/searx/engines/yahoo.py
@@ -1,12 +1,24 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 # lint: pylint
-"""Yahoo (Web)
+"""Yahoo Search (Web)
+
+Languages are supported by mapping the language to a domain.  If the domain is
+not found in :py:obj:`lang2domain`, the URL ``<lang>.search.yahoo.com`` is used.
 
 """
 
-from urllib.parse import unquote, urlencode
+from urllib.parse import (
+    unquote,
+    urlencode,
+)
 from lxml import html
-from searx.utils import extract_text, extract_url, match_language, eval_xpath
+
+from searx.utils import (
+    eval_xpath_getindex,
+    eval_xpath_list,
+    extract_text,
+    match_language,
+)
 
 # about
 about = {
@@ -22,35 +34,78 @@ about = {
 categories = ['general']
 paging = True
 time_range_support = True
-
-# search-url
-base_url = 'https://search.yahoo.com/'
-search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
-search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time'
-
-supported_languages_url = 'https://search.yahoo.com/web/advanced'
-
-# specific xpath variables
-results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
-url_xpath = './/h3/a/@href'
-title_xpath = './/h3/a'
-content_xpath = './/div[contains(@class, "compText")]'
-suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a"
+supported_languages_url = 'https://search.yahoo.com/preferences/languages'
+"""Supported languages are read from Yahoo preference page."""
 
 time_range_dict = {
-    'day': ['1d', 'd'],
-    'week': ['1w', 'w'],
-    'month': ['1m', 'm']
+    'day': ('1d', 'd'),
+    'week': ('1w', 'w'),
+    'month': ('1m', 'm'),
 }
 
 language_aliases = {
-    'zh-CN': 'zh-CHS',
-    'zh-TW': 'zh-CHT',
-    'zh-HK': 'zh-CHT'
+    'zh-HK': 'zh_chs',
+    'zh-CN': 'zh_chs',  # dead since 2015 / routed to hk.search.yahoo.com
+    'zh-TW': 'zh_cht',
 }
 
-# remove yahoo-specific tracking-url
+lang2domain = {
+    'zh_chs' : 'hk.search.yahoo.com',
+    'zh_cht' : 'tw.search.yahoo.com',
+    'en'     : 'search.yahoo.com',
+
+    'bg'     : 'search.yahoo.com',
+    'cs'     : 'search.yahoo.com',
+    'da'     : 'search.yahoo.com',
+    'el'     : 'search.yahoo.com',
+    'et'     : 'search.yahoo.com',
+    'he'     : 'search.yahoo.com',
+    'hr'     : 'search.yahoo.com',
+    'ja'     : 'search.yahoo.com',
+    'ko'     : 'search.yahoo.com',
+    'sk'     : 'search.yahoo.com',
+    'sl'     : 'search.yahoo.com',
+
+}
+"""Map language to domain"""
+
+def _get_language(params):
+
+    lang = language_aliases.get(params['language'])
+    if lang is None:
+        lang = match_language(
+            params['language'], supported_languages, language_aliases
+        )
+    lang = lang.split('-')[0]
+    logger.debug("params['language']: %s --> %s", params['language'], lang)
+    return lang
+
+def request(query, params):
+    """build request"""
+    offset = (params['pageno'] - 1) * 7 + 1
+    lang = _get_language(params)
+    age, btf = time_range_dict.get(
+        params['time_range'], ('', ''))
+
+    args = urlencode({
+        'p'     : query,
+        'ei'    : 'UTF-8',
+        'fl'    : 1,
+        'vl'    : 'lang_' + lang,
+        'btf'   : btf,
+        'fr2'   : 'time',
+        'age'   : age,
+        'b'     : offset,
+        'xargs' : 0
+    })
+
+    domain = lang2domain.get(lang, '%s.search.yahoo.com' % lang)
+    params['url'] = 'https://%s/search?%s' % (domain, args)
+    return params
+
 def parse_url(url_string):
+    """remove yahoo-specific tracking-url"""
+
     endings = ['/RS', '/RK']
     endpositions = []
     start = url_string.find('http', url_string.find('/RU=') + 1)
@@ -66,73 +121,30 @@ def parse_url(url_string):
     end = min(endpositions)
     return unquote(url_string[start:end])
 
-def _get_url(query, offset, language, time_range):
-    if time_range in time_range_dict:
-        return base_url + search_url_with_time.format(
-            offset = offset,
-            query = urlencode({'p': query}),
-            lang = language,
-            age = time_range_dict[time_range][0],
-            btf = time_range_dict[time_range][1]
-        )
-    return base_url + search_url.format(
-        offset=offset,
-        query=urlencode({'p': query}),
-        lang=language
-    )
-
-def _get_language(params):
-    if params['language'] == 'all':
-        return 'en'
-
-    language = match_language(params['language'], supported_languages, language_aliases)
-    if language not in language_aliases.values():
-        language = language.split('-')[0]
-    language = language.replace('-', '_').lower()
-
-    return language
-
-
-# do search-request
-def request(query, params):
-    if params['time_range'] and params['time_range'] not in time_range_dict:
-        return params
-
-    offset = (params['pageno'] - 1) * 10 + 1
-    language = _get_language(params)
-
-    params['url'] = _get_url(query, offset, language, params['time_range'])
-
-    return params
-
-
-# get response from search-request
 def response(resp):
-    results = []
+    """parse response"""
 
+    results = []
     dom = html.fromstring(resp.text)
 
-    try:
-        results_num = int(
-            eval_xpath(
-                dom,
-                '//div[@class="compPagination"]/span[last()]/text()'
-            )[0].split()[0].replace(',', '')
-        )
-        results.append({'number_of_results': results_num})
-    except:  # pylint: disable=bare-except
-        pass
-
     # parse results
-    for result in eval_xpath(dom, results_xpath):
-        try:
-            url = parse_url(extract_url(eval_xpath(result, url_xpath), search_url))
-            title = extract_text(eval_xpath(result, title_xpath)[0])
-
-        except:  # pylint: disable=bare-except
+    for result in eval_xpath_list(dom, '//div[contains(@class,"algo-sr")]'):
+        url = eval_xpath_getindex(result, './/h3/a/@href', 0, default=None)
+        if url is None:
             continue
+        url = parse_url(url)
 
-        content = extract_text(eval_xpath(result, content_xpath)[0])
+        title = eval_xpath_getindex(result, './/h3/a', 0, default=None)
+        if title is None:
+            continue
+        offset = len(extract_text(title.xpath('span')))
+        title = extract_text(title)[offset:]
+
+        content = eval_xpath_getindex(
+            result, './/div[contains(@class, "compText")]', 0, default=''
+        )
+        if content:
+            content = extract_text(content)
 
         # append result
         results.append({
@@ -141,17 +153,10 @@ def response(resp):
             'content': content
         })
 
-    # if no suggestion found, return results
-    suggestions = eval_xpath(dom, suggestion_xpath)
-    if not suggestions:
-        return results
-
-    # parse suggestion
-    for suggestion in suggestions:
+    for suggestion in eval_xpath_list(dom, '//div[contains(@class, "AlsoTry")]'):
         # append suggestion
        results.append({'suggestion': extract_text(suggestion)})
 
-    # return results
     return results
 
 
@@ -159,13 +164,9 @@ def response(resp):
 def _fetch_supported_languages(resp):
     supported_languages = []
     dom = html.fromstring(resp.text)
-    options = eval_xpath(dom, '//div[@id="yschlang"]/span/label/input')
-    for option in options:
-        code_parts = eval_xpath(option, './@value')[0][5:].split('_')
-        if len(code_parts) == 2:
-            code = code_parts[0] + '-' + code_parts[1].upper()
-        else:
-            code = code_parts[0]
-        supported_languages.append(code)
+    offset = len('lang_')
+
+    for val in eval_xpath_list(dom, '//div[contains(@class, "lang-item")]/input/@value'):
+        supported_languages.append( val[offset:] )
 
     return supported_languages
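
A minimal sketch (not part of the patch) of the language-to-domain lookup the
commit message describes, assuming only the lang2domain mapping shown above;
pick_domain() is a hypothetical helper and the language codes are examples, but
the lookup mirrors the lang2domain.get() fallback used in the new request():

    # shortened copy of the lang2domain mapping from the patch
    lang2domain = {
        'zh_chs': 'hk.search.yahoo.com',
        'zh_cht': 'tw.search.yahoo.com',
        'en': 'search.yahoo.com',
    }

    def pick_domain(lang):
        # languages without an explicit entry fall back to the
        # "<lang>.search.yahoo.com" pattern, exactly as request() does
        return lang2domain.get(lang, '%s.search.yahoo.com' % lang)

    assert pick_domain('zh_cht') == 'tw.search.yahoo.com'  # mapped domain
    assert pick_domain('fr') == 'fr.search.yahoo.com'      # fallback domain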