mirror of https://github.com/searxng/searxng.git
[fix] dailymotion engine: filter by language & country
- fix the issue of fetching more than 7000 *languages* - improve the request function and filter by language & country - implement time_range_support & safesearch - add more fields to the response from dailymotion (allow_embed, length) - better clean up of HTML tags in the 'content' field. This is more or less a complete rework based on the '/videos' API from [1]. This patch cleans up the language list in SearXNG that has been polluted by the ISO-639-3 2- and 3-letter codes from dailymotion languages, which have never been used. [1] https://developers.dailymotion.com/tools/ Closes: https://github.com/searxng/searxng/issues/1065 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
27f8fa6fe0
commit
3bb62823ec
File diff suppressed because it is too large
Load Diff
|
@ -1,12 +1,17 @@
|
||||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
"""
|
"""Dailymotion (Videos)
|
||||||
Dailymotion (Videos)
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from json import loads
|
from typing import Set
|
||||||
from datetime import datetime
|
from datetime import datetime, timedelta
|
||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode
|
||||||
from searx.utils import match_language, html_to_text
|
import time
|
||||||
|
import babel
|
||||||
|
|
||||||
|
from searx.exceptions import SearxEngineAPIException
|
||||||
|
from searx.network import raise_for_httperror
|
||||||
|
from searx.utils import html_to_text
|
||||||
|
|
||||||
# about
|
# about
|
||||||
about = {
|
about = {
|
||||||
|
@ -21,23 +26,78 @@ about = {
|
||||||
# engine dependent config
|
# engine dependent config
|
||||||
categories = ['videos']
|
categories = ['videos']
|
||||||
paging = True
|
paging = True
|
||||||
|
number_of_results = 10
|
||||||
|
|
||||||
|
time_range_support = True
|
||||||
|
time_delta_dict = {
|
||||||
|
"day": timedelta(days=1),
|
||||||
|
"week": timedelta(days=7),
|
||||||
|
"month": timedelta(days=31),
|
||||||
|
"year": timedelta(days=365),
|
||||||
|
}
|
||||||
|
|
||||||
|
safesearch = True
|
||||||
|
safesearch_params = {2: '&is_created_for_kids=true', 1: '&is_created_for_kids=true', 0: ''}
|
||||||
|
|
||||||
# search-url
|
# search-url
|
||||||
# see http://www.dailymotion.com/doc/api/obj-video.html
|
# - https://developers.dailymotion.com/tools/
|
||||||
search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,description,duration,url,thumbnail_360_url,id&sort=relevance&limit=5&page={pageno}&{query}' # noqa
|
# - https://www.dailymotion.com/doc/api/obj-video.html
|
||||||
supported_languages_url = 'https://api.dailymotion.com/languages'
|
|
||||||
|
|
||||||
|
result_fields = [
|
||||||
|
'allow_embed',
|
||||||
|
'description',
|
||||||
|
'title',
|
||||||
|
'created_time',
|
||||||
|
'duration',
|
||||||
|
'url',
|
||||||
|
'thumbnail_360_url',
|
||||||
|
'id',
|
||||||
|
]
|
||||||
|
search_url = (
|
||||||
|
'https://api.dailymotion.com/videos?'
|
||||||
|
'fields={fields}&password_protected={password_protected}&private={private}&sort={sort}&limit={limit}'
|
||||||
|
).format(
|
||||||
|
fields=','.join(result_fields),
|
||||||
|
password_protected= 'false',
|
||||||
|
private='false',
|
||||||
|
sort='relevance',
|
||||||
|
limit=number_of_results,
|
||||||
|
)
|
||||||
|
iframe_src = "https://www.dailymotion.com/embed/video/{video_id}"
|
||||||
|
|
||||||
|
# The request query filters by 'languages' & 'country', therefore instead of
|
||||||
|
# fetching only languages we need to fetch locales.
|
||||||
|
supported_languages_url = 'https://api.dailymotion.com/locales'
|
||||||
|
|
||||||
# do search-request
|
|
||||||
def request(query, params):
|
def request(query, params):
|
||||||
if params['language'] == 'all':
|
|
||||||
locale = 'en-US'
|
|
||||||
else:
|
|
||||||
locale = match_language(params['language'], supported_languages)
|
|
||||||
|
|
||||||
params['url'] = search_url.format(
|
if not query:
|
||||||
query=urlencode({'search': query, 'localization': locale}), pageno=params['pageno']
|
return False
|
||||||
)
|
|
||||||
|
language = params['language']
|
||||||
|
if language == 'all':
|
||||||
|
language = 'en-US'
|
||||||
|
locale = babel.Locale.parse(language, sep='-')
|
||||||
|
|
||||||
|
query_args = {
|
||||||
|
'search': query,
|
||||||
|
'languages': locale.language,
|
||||||
|
'page': params['pageno'],
|
||||||
|
}
|
||||||
|
|
||||||
|
if locale.territory:
|
||||||
|
localization = locale.language + '_' + locale.territory
|
||||||
|
if localization in supported_languages:
|
||||||
|
query_args['country'] = locale.territory
|
||||||
|
|
||||||
|
time_delta = time_delta_dict.get(params["time_range"])
|
||||||
|
if time_delta:
|
||||||
|
created_after = datetime.now() - time_delta
|
||||||
|
query_args['created_after'] = datetime.timestamp(created_after)
|
||||||
|
|
||||||
|
query_str = urlencode(query_args)
|
||||||
|
params['url'] = search_url + '&' + query_str + safesearch_params.get(params['safesearch'], '')
|
||||||
|
params['raise_for_httperror'] = False
|
||||||
|
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
@ -46,34 +106,51 @@ def request(query, params):
|
||||||
def response(resp):
|
def response(resp):
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
search_res = loads(resp.text)
|
search_res = resp.json()
|
||||||
|
|
||||||
# return empty array if there are no results
|
# check for an API error
|
||||||
if 'list' not in search_res:
|
if 'error' in search_res:
|
||||||
return []
|
raise SearxEngineAPIException(search_res['error'].get('message'))
|
||||||
|
|
||||||
|
raise_for_httperror(resp)
|
||||||
|
|
||||||
# parse results
|
# parse results
|
||||||
for res in search_res['list']:
|
for res in search_res.get('list', []):
|
||||||
|
|
||||||
title = res['title']
|
title = res['title']
|
||||||
url = res['url']
|
url = res['url']
|
||||||
|
|
||||||
content = html_to_text(res['description'])
|
content = html_to_text(res['description'])
|
||||||
thumbnail = res['thumbnail_360_url']
|
if len(content) > 300:
|
||||||
|
content = content[:300] + '...'
|
||||||
|
|
||||||
publishedDate = datetime.fromtimestamp(res['created_time'], None)
|
publishedDate = datetime.fromtimestamp(res['created_time'], None)
|
||||||
|
|
||||||
# http to https
|
length = time.gmtime(res.get('duration'))
|
||||||
|
if length.tm_hour:
|
||||||
|
length = time.strftime("%H:%M:%S", length)
|
||||||
|
else:
|
||||||
|
length = time.strftime("%M:%S", length)
|
||||||
|
|
||||||
|
thumbnail = res['thumbnail_360_url']
|
||||||
thumbnail = thumbnail.replace("http://", "https://")
|
thumbnail = thumbnail.replace("http://", "https://")
|
||||||
|
|
||||||
results.append(
|
item = {
|
||||||
{
|
|
||||||
'template': 'videos.html',
|
'template': 'videos.html',
|
||||||
'url': url,
|
'url': url,
|
||||||
'title': title,
|
'title': title,
|
||||||
'content': content,
|
'content': content,
|
||||||
'publishedDate': publishedDate,
|
'publishedDate': publishedDate,
|
||||||
'iframe_src': "https://www.dailymotion.com/embed/video/" + res['id'],
|
'length': length,
|
||||||
'thumbnail': thumbnail,
|
'thumbnail': thumbnail,
|
||||||
}
|
}
|
||||||
)
|
|
||||||
|
# HINT: no matter what the value is, without an API token videos can't be shown
|
||||||
|
# embedded
|
||||||
|
if res['allow_embed']:
|
||||||
|
item['iframe_src'] = iframe_src.format(video_id=res['id'])
|
||||||
|
|
||||||
|
results.append(item)
|
||||||
|
|
||||||
# return results
|
# return results
|
||||||
return results
|
return results
|
||||||
|
@ -81,18 +158,8 @@ def response(resp):
|
||||||
|
|
||||||
# get supported languages from their site
|
# get supported languages from their site
|
||||||
def _fetch_supported_languages(resp):
|
def _fetch_supported_languages(resp):
|
||||||
supported_languages = {}
|
response_json = resp.json()
|
||||||
|
return [
|
||||||
response_json = loads(resp.text)
|
item['locale']
|
||||||
|
for item in response_json['list']
|
||||||
for language in response_json['list']:
|
]
|
||||||
supported_languages[language['code']] = {}
|
|
||||||
|
|
||||||
name = language['native_name']
|
|
||||||
if name:
|
|
||||||
supported_languages[language['code']]['name'] = name
|
|
||||||
english_name = language['name']
|
|
||||||
if english_name:
|
|
||||||
supported_languages[language['code']]['english_name'] = english_name
|
|
||||||
|
|
||||||
return supported_languages
|
|
||||||
|
|
|
@ -2,9 +2,7 @@
|
||||||
# list of language codes
|
# list of language codes
|
||||||
# this file is generated automatically by utils/fetch_languages.py
|
# this file is generated automatically by utils/fetch_languages.py
|
||||||
language_codes = (
|
language_codes = (
|
||||||
('af-ZA', 'Afrikaans', 'Suid-Afrika', 'Afrikaans', '\U0001f1ff\U0001f1e6'),
|
|
||||||
('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'),
|
('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'),
|
||||||
('be-BY', 'Беларуская', 'Беларусь', 'Belarusian', '\U0001f1e7\U0001f1fe'),
|
|
||||||
('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'),
|
('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'),
|
||||||
('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'),
|
('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'),
|
||||||
('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'),
|
('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'),
|
||||||
|
@ -28,20 +26,15 @@ language_codes = (
|
||||||
('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'),
|
('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'),
|
||||||
('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'),
|
('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'),
|
||||||
('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'),
|
('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'),
|
||||||
('fa-IR', 'فارسی', 'ایران', 'Persian', '\U0001f1ee\U0001f1f7'),
|
|
||||||
('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'),
|
('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'),
|
||||||
('fil-PH', 'Filipino', 'Pilipinas', 'Filipino', '\U0001f1f5\U0001f1ed'),
|
|
||||||
('fr', 'Français', '', 'French', '\U0001f310'),
|
('fr', 'Français', '', 'French', '\U0001f310'),
|
||||||
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
|
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
|
||||||
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
|
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
|
||||||
('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'),
|
('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'),
|
||||||
('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'),
|
('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'),
|
||||||
('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'),
|
('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'),
|
||||||
('hi-IN', 'हिन्दी', 'भारत', 'Hindi', '\U0001f1ee\U0001f1f3'),
|
|
||||||
('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'),
|
('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'),
|
||||||
('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'),
|
('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'),
|
||||||
('id-ID', 'Indonesia', 'Indonesia', 'Indonesian', '\U0001f1ee\U0001f1e9'),
|
|
||||||
('is-IS', 'Íslenska', 'Ísland', 'Icelandic', '\U0001f1ee\U0001f1f8'),
|
|
||||||
('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'),
|
('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'),
|
||||||
('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'),
|
('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'),
|
||||||
('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'),
|
('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'),
|
||||||
|
@ -63,13 +56,10 @@ language_codes = (
|
||||||
('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'),
|
('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'),
|
||||||
('sk-SK', 'Slovenčina', 'Slovensko', 'Slovak', '\U0001f1f8\U0001f1f0'),
|
('sk-SK', 'Slovenčina', 'Slovensko', 'Slovak', '\U0001f1f8\U0001f1f0'),
|
||||||
('sl-SI', 'Slovenščina', 'Slovenija', 'Slovenian', '\U0001f1f8\U0001f1ee'),
|
('sl-SI', 'Slovenščina', 'Slovenija', 'Slovenian', '\U0001f1f8\U0001f1ee'),
|
||||||
('sr-RS', 'Српски', 'Србија', 'Serbian', '\U0001f1f7\U0001f1f8'),
|
|
||||||
('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'),
|
('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'),
|
||||||
('sw-TZ', 'Kiswahili', 'Tanzania', 'Swahili', '\U0001f1f9\U0001f1ff'),
|
|
||||||
('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'),
|
('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'),
|
||||||
('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'),
|
('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'),
|
||||||
('uk-UA', 'Українська', 'Україна', 'Ukrainian', '\U0001f1fa\U0001f1e6'),
|
('uk-UA', 'Українська', 'Україна', 'Ukrainian', '\U0001f1fa\U0001f1e6'),
|
||||||
('vi-VN', 'Tiếng Việt', 'Việt Nam', 'Vietnamese', '\U0001f1fb\U0001f1f3'),
|
|
||||||
('zh', '中文', '', 'Chinese', '\U0001f310'),
|
('zh', '中文', '', 'Chinese', '\U0001f310'),
|
||||||
('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'),
|
('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'),
|
||||||
('zh-HK', '中文', '中國香港特別行政區', 'Chinese', '\U0001f1ed\U0001f1f0'),
|
('zh-HK', '中文', '中國香港特別行政區', 'Chinese', '\U0001f1ed\U0001f1f0'),
|
||||||
|
|
|
@ -129,8 +129,8 @@ class TestLanguageParser(SearxTestCase):
|
||||||
query = RawTextQuery(':hu-H', [])
|
query = RawTextQuery(':hu-H', [])
|
||||||
self.assertEqual(query.autocomplete_list, [":hu-hu"])
|
self.assertEqual(query.autocomplete_list, [":hu-hu"])
|
||||||
|
|
||||||
query = RawTextQuery(':v', [])
|
query = RawTextQuery(':zh-', [])
|
||||||
self.assertEqual(query.autocomplete_list, [':vi', ':tiếng việt', ':việt_nam'])
|
self.assertEqual(query.autocomplete_list, [':zh-cn', ':zh-hk', ':zh-tw'])
|
||||||
|
|
||||||
|
|
||||||
class TestTimeoutParser(SearxTestCase):
|
class TestTimeoutParser(SearxTestCase):
|
||||||
|
|
Loading…
Reference in New Issue