[fix] dailymotion engine: filter by language & country

- fix the issue of fetching more the 7000 *languages*
- improve the request function and filter by language & country
- implement time_range_support & safesearch
- add more fields to the response from dailymotion (allow_embed, length)
- better clean up of HTML tags in the 'content' field.

This is more or less a complete rework based on the '/videos' API from [1].
This patch cleans up the language list in SearXNG that has been polluted by the
ISO-639-3 2 and 3 letter codes from dailymotion languages which have never been
used.

[1] https://developers.dailymotion.com/tools/

Closes: https://github.com/searxng/searxng/issues/1065
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2022-04-08 11:17:45 +02:00
parent 27f8fa6fe0
commit 3bb62823ec
4 changed files with 188 additions and 23667 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,12 +1,17 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
""" """Dailymotion (Videos)
Dailymotion (Videos)
""" """
from json import loads from typing import Set
from datetime import datetime from datetime import datetime, timedelta
from urllib.parse import urlencode from urllib.parse import urlencode
from searx.utils import match_language, html_to_text import time
import babel
from searx.exceptions import SearxEngineAPIException
from searx.network import raise_for_httperror
from searx.utils import html_to_text
# about # about
about = { about = {
@ -21,23 +26,78 @@ about = {
# engine dependent config # engine dependent config
categories = ['videos'] categories = ['videos']
paging = True paging = True
number_of_results = 10
time_range_support = True
time_delta_dict = {
"day": timedelta(days=1),
"week": timedelta(days=7),
"month": timedelta(days=31),
"year": timedelta(days=365),
}
safesearch = True
safesearch_params = {2: '&is_created_for_kids=true', 1: '&is_created_for_kids=true', 0: ''}
# search-url # search-url
# see http://www.dailymotion.com/doc/api/obj-video.html # - https://developers.dailymotion.com/tools/
search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,description,duration,url,thumbnail_360_url,id&sort=relevance&limit=5&page={pageno}&{query}' # noqa # - https://www.dailymotion.com/doc/api/obj-video.html
supported_languages_url = 'https://api.dailymotion.com/languages'
result_fields = [
'allow_embed',
'description',
'title',
'created_time',
'duration',
'url',
'thumbnail_360_url',
'id',
]
search_url = (
'https://api.dailymotion.com/videos?'
'fields={fields}&password_protected={password_protected}&private={private}&sort={sort}&limit={limit}'
).format(
fields=','.join(result_fields),
password_protected= 'false',
private='false',
sort='relevance',
limit=number_of_results,
)
iframe_src = "https://www.dailymotion.com/embed/video/{video_id}"
# The request query filters by 'languages' & 'country', therefore instead of
# fetching only languages we need to fetch locales.
supported_languages_url = 'https://api.dailymotion.com/locales'
# do search-request
def request(query, params): def request(query, params):
if params['language'] == 'all':
locale = 'en-US'
else:
locale = match_language(params['language'], supported_languages)
params['url'] = search_url.format( if not query:
query=urlencode({'search': query, 'localization': locale}), pageno=params['pageno'] return False
)
language = params['language']
if language == 'all':
language = 'en-US'
locale = babel.Locale.parse(language, sep='-')
query_args = {
'search': query,
'languages': locale.language,
'page': params['pageno'],
}
if locale.territory:
localization = locale.language + '_' + locale.territory
if localization in supported_languages:
query_args['country'] = locale.territory
time_delta = time_delta_dict.get(params["time_range"])
if time_delta:
created_after = datetime.now() - time_delta
query_args['created_after'] = datetime.timestamp(created_after)
query_str = urlencode(query_args)
params['url'] = search_url + '&' + query_str + safesearch_params.get(params['safesearch'], '')
params['raise_for_httperror'] = False
return params return params
@ -46,34 +106,51 @@ def request(query, params):
def response(resp): def response(resp):
results = [] results = []
search_res = loads(resp.text) search_res = resp.json()
# return empty array if there are no results # check for an API error
if 'list' not in search_res: if 'error' in search_res:
return [] raise SearxEngineAPIException(search_res['error'].get('message'))
raise_for_httperror(resp)
# parse results # parse results
for res in search_res['list']: for res in search_res.get('list', []):
title = res['title'] title = res['title']
url = res['url'] url = res['url']
content = html_to_text(res['description']) content = html_to_text(res['description'])
thumbnail = res['thumbnail_360_url'] if len(content) > 300:
content = content[:300] + '...'
publishedDate = datetime.fromtimestamp(res['created_time'], None) publishedDate = datetime.fromtimestamp(res['created_time'], None)
# http to https length = time.gmtime(res.get('duration'))
if length.tm_hour:
length = time.strftime("%H:%M:%S", length)
else:
length = time.strftime("%M:%S", length)
thumbnail = res['thumbnail_360_url']
thumbnail = thumbnail.replace("http://", "https://") thumbnail = thumbnail.replace("http://", "https://")
results.append( item = {
{
'template': 'videos.html', 'template': 'videos.html',
'url': url, 'url': url,
'title': title, 'title': title,
'content': content, 'content': content,
'publishedDate': publishedDate, 'publishedDate': publishedDate,
'iframe_src': "https://www.dailymotion.com/embed/video/" + res['id'], 'length': length,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
} }
)
# HINT: no mater what the value is, without API token videos can't shown
# embedded
if res['allow_embed']:
item['iframe_src'] = iframe_src.format(video_id=res['id'])
results.append(item)
# return results # return results
return results return results
@ -81,18 +158,8 @@ def response(resp):
# get supported languages from their site # get supported languages from their site
def _fetch_supported_languages(resp): def _fetch_supported_languages(resp):
supported_languages = {} response_json = resp.json()
return [
response_json = loads(resp.text) item['locale']
for item in response_json['list']
for language in response_json['list']: ]
supported_languages[language['code']] = {}
name = language['native_name']
if name:
supported_languages[language['code']]['name'] = name
english_name = language['name']
if english_name:
supported_languages[language['code']]['english_name'] = english_name
return supported_languages

View File

@ -2,9 +2,7 @@
# list of language codes # list of language codes
# this file is generated automatically by utils/fetch_languages.py # this file is generated automatically by utils/fetch_languages.py
language_codes = ( language_codes = (
('af-ZA', 'Afrikaans', 'Suid-Afrika', 'Afrikaans', '\U0001f1ff\U0001f1e6'),
('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'), ('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'),
('be-BY', 'Беларуская', 'Беларусь', 'Belarusian', '\U0001f1e7\U0001f1fe'),
('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'), ('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'),
('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'), ('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'),
('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'), ('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'),
@ -28,20 +26,15 @@ language_codes = (
('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'), ('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'),
('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'), ('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'),
('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'), ('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'),
('fa-IR', 'فارسی', 'ایران', 'Persian', '\U0001f1ee\U0001f1f7'),
('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'), ('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'),
('fil-PH', 'Filipino', 'Pilipinas', 'Filipino', '\U0001f1f5\U0001f1ed'),
('fr', 'Français', '', 'French', '\U0001f310'), ('fr', 'Français', '', 'French', '\U0001f310'),
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'), ('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'), ('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'), ('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'),
('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'), ('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'),
('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'), ('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'),
('hi-IN', 'हिन्दी', 'भारत', 'Hindi', '\U0001f1ee\U0001f1f3'),
('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'), ('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'),
('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'), ('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'),
('id-ID', 'Indonesia', 'Indonesia', 'Indonesian', '\U0001f1ee\U0001f1e9'),
('is-IS', 'Íslenska', 'Ísland', 'Icelandic', '\U0001f1ee\U0001f1f8'),
('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'), ('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'),
('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'), ('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'),
('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'), ('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'),
@ -63,13 +56,10 @@ language_codes = (
('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'), ('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'),
('sk-SK', 'Slovenčina', 'Slovensko', 'Slovak', '\U0001f1f8\U0001f1f0'), ('sk-SK', 'Slovenčina', 'Slovensko', 'Slovak', '\U0001f1f8\U0001f1f0'),
('sl-SI', 'Slovenščina', 'Slovenija', 'Slovenian', '\U0001f1f8\U0001f1ee'), ('sl-SI', 'Slovenščina', 'Slovenija', 'Slovenian', '\U0001f1f8\U0001f1ee'),
('sr-RS', 'Српски', 'Србија', 'Serbian', '\U0001f1f7\U0001f1f8'),
('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'), ('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'),
('sw-TZ', 'Kiswahili', 'Tanzania', 'Swahili', '\U0001f1f9\U0001f1ff'),
('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'), ('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'),
('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'), ('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'),
('uk-UA', 'Українська', 'Україна', 'Ukrainian', '\U0001f1fa\U0001f1e6'), ('uk-UA', 'Українська', 'Україна', 'Ukrainian', '\U0001f1fa\U0001f1e6'),
('vi-VN', 'Tiếng Việt', 'Việt Nam', 'Vietnamese', '\U0001f1fb\U0001f1f3'),
('zh', '中文', '', 'Chinese', '\U0001f310'), ('zh', '中文', '', 'Chinese', '\U0001f310'),
('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'), ('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'),
('zh-HK', '中文', '中國香港特別行政區', 'Chinese', '\U0001f1ed\U0001f1f0'), ('zh-HK', '中文', '中國香港特別行政區', 'Chinese', '\U0001f1ed\U0001f1f0'),

View File

@ -129,8 +129,8 @@ class TestLanguageParser(SearxTestCase):
query = RawTextQuery(':hu-H', []) query = RawTextQuery(':hu-H', [])
self.assertEqual(query.autocomplete_list, [":hu-hu"]) self.assertEqual(query.autocomplete_list, [":hu-hu"])
query = RawTextQuery(':v', []) query = RawTextQuery(':zh-', [])
self.assertEqual(query.autocomplete_list, [':vi', ':tiếng việt', ':việt_nam']) self.assertEqual(query.autocomplete_list, [':zh-cn', ':zh-hk', ':zh-tw'])
class TestTimeoutParser(SearxTestCase): class TestTimeoutParser(SearxTestCase):