Merge pull request #1071 from return42/fix-lang-dailymotion

[fix] dailymotion engine: filter by language & country
This commit is contained in:
Alexandre Flament 2022-04-16 11:54:49 +02:00 committed by GitHub
commit c474616642
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 202 additions and 23668 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,12 +1,17 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
""" """Dailymotion (Videos)
Dailymotion (Videos)
""" """
from json import loads from typing import Set
from datetime import datetime from datetime import datetime, timedelta
from urllib.parse import urlencode from urllib.parse import urlencode
from searx.utils import match_language, html_to_text import time
import babel
from searx.exceptions import SearxEngineAPIException
from searx.network import raise_for_httperror
from searx.utils import html_to_text
# about # about
about = { about = {
@ -21,23 +26,89 @@ about = {
# engine dependent config # engine dependent config
categories = ['videos'] categories = ['videos']
paging = True paging = True
number_of_results = 10
time_range_support = True
time_delta_dict = {
"day": timedelta(days=1),
"week": timedelta(days=7),
"month": timedelta(days=31),
"year": timedelta(days=365),
}
safesearch = True
safesearch_params = {2: '&is_created_for_kids=true', 1: '&is_created_for_kids=true', 0: ''}
# search-url # search-url
# see http://www.dailymotion.com/doc/api/obj-video.html # - https://developers.dailymotion.com/tools/
search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,description,duration,url,thumbnail_360_url,id&sort=relevance&limit=5&page={pageno}&{query}' # noqa # - https://www.dailymotion.com/doc/api/obj-video.html
supported_languages_url = 'https://api.dailymotion.com/languages'
result_fields = [
'allow_embed',
'description',
'title',
'created_time',
'duration',
'url',
'thumbnail_360_url',
'id',
]
search_url = (
'https://api.dailymotion.com/videos?'
'fields={fields}&password_protected={password_protected}&private={private}&sort={sort}&limit={limit}'
).format(
fields=','.join(result_fields),
password_protected= 'false',
private='false',
sort='relevance',
limit=number_of_results,
)
iframe_src = "https://www.dailymotion.com/embed/video/{video_id}"
# The request query filters by 'languages' & 'country', therefore instead of
# fetching only languages we need to fetch locales.
supported_languages_url = 'https://api.dailymotion.com/locales'
supported_languages_iso639: Set[str] = set()
def init(_engine_settings):
global supported_languages_iso639
supported_languages_iso639 = set([language.split('_')[0] for language in supported_languages])
# do search-request
def request(query, params): def request(query, params):
if params['language'] == 'all':
locale = 'en-US'
else:
locale = match_language(params['language'], supported_languages)
params['url'] = search_url.format( if not query:
query=urlencode({'search': query, 'localization': locale}), pageno=params['pageno'] return False
)
language = params['language']
if language == 'all':
language = 'en-US'
locale = babel.Locale.parse(language, sep='-')
language_iso639 = locale.language
if locale.language not in supported_languages_iso639:
language_iso639 = 'en'
query_args = {
'search': query,
'languages': language_iso639,
'page': params['pageno'],
}
if locale.territory:
localization = locale.language + '_' + locale.territory
if localization in supported_languages:
query_args['country'] = locale.territory
time_delta = time_delta_dict.get(params["time_range"])
if time_delta:
created_after = datetime.now() - time_delta
query_args['created_after'] = datetime.timestamp(created_after)
query_str = urlencode(query_args)
params['url'] = search_url + '&' + query_str + safesearch_params.get(params['safesearch'], '')
params['raise_for_httperror'] = False
return params return params
@ -46,34 +117,51 @@ def request(query, params):
def response(resp): def response(resp):
results = [] results = []
search_res = loads(resp.text) search_res = resp.json()
# return empty array if there are no results # check for an API error
if 'list' not in search_res: if 'error' in search_res:
return [] raise SearxEngineAPIException(search_res['error'].get('message'))
raise_for_httperror(resp)
# parse results # parse results
for res in search_res['list']: for res in search_res.get('list', []):
title = res['title'] title = res['title']
url = res['url'] url = res['url']
content = html_to_text(res['description']) content = html_to_text(res['description'])
thumbnail = res['thumbnail_360_url'] if len(content) > 300:
content = content[:300] + '...'
publishedDate = datetime.fromtimestamp(res['created_time'], None) publishedDate = datetime.fromtimestamp(res['created_time'], None)
# http to https length = time.gmtime(res.get('duration'))
if length.tm_hour:
length = time.strftime("%H:%M:%S", length)
else:
length = time.strftime("%M:%S", length)
thumbnail = res['thumbnail_360_url']
thumbnail = thumbnail.replace("http://", "https://") thumbnail = thumbnail.replace("http://", "https://")
results.append( item = {
{
'template': 'videos.html', 'template': 'videos.html',
'url': url, 'url': url,
'title': title, 'title': title,
'content': content, 'content': content,
'publishedDate': publishedDate, 'publishedDate': publishedDate,
'iframe_src': "https://www.dailymotion.com/embed/video/" + res['id'], 'length': length,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
} }
)
# HINT: no mater what the value is, without API token videos can't shown
# embedded
if res['allow_embed']:
item['iframe_src'] = iframe_src.format(video_id=res['id'])
results.append(item)
# return results # return results
return results return results
@ -81,18 +169,8 @@ def response(resp):
# get supported languages from their site # get supported languages from their site
def _fetch_supported_languages(resp): def _fetch_supported_languages(resp):
supported_languages = {} response_json = resp.json()
return [
response_json = loads(resp.text) item['locale']
for item in response_json['list']
for language in response_json['list']: ]
supported_languages[language['code']] = {}
name = language['native_name']
if name:
supported_languages[language['code']]['name'] = name
english_name = language['name']
if english_name:
supported_languages[language['code']]['english_name'] = english_name
return supported_languages

View File

@ -2,9 +2,7 @@
# list of language codes # list of language codes
# this file is generated automatically by utils/fetch_languages.py # this file is generated automatically by utils/fetch_languages.py
language_codes = ( language_codes = (
('af-ZA', 'Afrikaans', 'Suid-Afrika', 'Afrikaans', '\U0001f1ff\U0001f1e6'),
('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'), ('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'),
('be-BY', 'Беларуская', 'Беларусь', 'Belarusian', '\U0001f1e7\U0001f1fe'),
('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'), ('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'),
('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'), ('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'),
('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'), ('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'),
@ -28,20 +26,15 @@ language_codes = (
('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'), ('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'),
('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'), ('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'),
('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'), ('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'),
('fa-IR', 'فارسی', 'ایران', 'Persian', '\U0001f1ee\U0001f1f7'),
('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'), ('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'),
('fil-PH', 'Filipino', 'Pilipinas', 'Filipino', '\U0001f1f5\U0001f1ed'),
('fr', 'Français', '', 'French', '\U0001f310'), ('fr', 'Français', '', 'French', '\U0001f310'),
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'), ('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'), ('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'), ('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'),
('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'), ('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'),
('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'), ('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'),
('hi-IN', 'हिन्दी', 'भारत', 'Hindi', '\U0001f1ee\U0001f1f3'),
('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'), ('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'),
('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'), ('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'),
('id-ID', 'Indonesia', 'Indonesia', 'Indonesian', '\U0001f1ee\U0001f1e9'),
('is-IS', 'Íslenska', 'Ísland', 'Icelandic', '\U0001f1ee\U0001f1f8'),
('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'), ('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'),
('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'), ('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'),
('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'), ('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'),
@ -63,13 +56,10 @@ language_codes = (
('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'), ('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'),
('sk-SK', 'Slovenčina', 'Slovensko', 'Slovak', '\U0001f1f8\U0001f1f0'), ('sk-SK', 'Slovenčina', 'Slovensko', 'Slovak', '\U0001f1f8\U0001f1f0'),
('sl-SI', 'Slovenščina', 'Slovenija', 'Slovenian', '\U0001f1f8\U0001f1ee'), ('sl-SI', 'Slovenščina', 'Slovenija', 'Slovenian', '\U0001f1f8\U0001f1ee'),
('sr-RS', 'Српски', 'Србија', 'Serbian', '\U0001f1f7\U0001f1f8'),
('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'), ('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'),
('sw-TZ', 'Kiswahili', 'Tanzania', 'Swahili', '\U0001f1f9\U0001f1ff'),
('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'), ('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'),
('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'), ('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'),
('uk-UA', 'Українська', 'Україна', 'Ukrainian', '\U0001f1fa\U0001f1e6'), ('uk-UA', 'Українська', 'Україна', 'Ukrainian', '\U0001f1fa\U0001f1e6'),
('vi-VN', 'Tiếng Việt', 'Việt Nam', 'Vietnamese', '\U0001f1fb\U0001f1f3'),
('zh', '中文', '', 'Chinese', '\U0001f310'), ('zh', '中文', '', 'Chinese', '\U0001f310'),
('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'), ('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'),
('zh-HK', '中文', '中國香港特別行政區', 'Chinese', '\U0001f1ed\U0001f1f0'), ('zh-HK', '中文', '中國香港特別行政區', 'Chinese', '\U0001f1ed\U0001f1f0'),

View File

@ -88,6 +88,8 @@ class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://b
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
self.tags.append(tag) self.tags.append(tag)
if tag == 'br':
self.result.append(' ')
def handle_endtag(self, tag): def handle_endtag(self, tag):
if not self.tags: if not self.tags:
@ -142,7 +144,7 @@ def html_to_text(html_str: str) -> str:
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>') >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
'Example' 'Example'
""" """
html_str = html_str.replace('\n', ' ') html_str = html_str.replace('\n', ' ').replace('\r', ' ')
html_str = ' '.join(html_str.split()) html_str = ' '.join(html_str.split())
s = _HTMLTextExtractor() s = _HTMLTextExtractor()
try: try:

View File

@ -129,8 +129,8 @@ class TestLanguageParser(SearxTestCase):
query = RawTextQuery(':hu-H', []) query = RawTextQuery(':hu-H', [])
self.assertEqual(query.autocomplete_list, [":hu-hu"]) self.assertEqual(query.autocomplete_list, [":hu-hu"])
query = RawTextQuery(':v', []) query = RawTextQuery(':zh-', [])
self.assertEqual(query.autocomplete_list, [':vi', ':tiếng việt', ':việt_nam']) self.assertEqual(query.autocomplete_list, [':zh-cn', ':zh-hk', ':zh-tw'])
class TestTimeoutParser(SearxTestCase): class TestTimeoutParser(SearxTestCase):