Merge pull request #1071 from return42/fix-lang-dailymotion

[fix] dailymotion engine: filter by language & country
This commit is contained in:
Alexandre Flament 2022-04-16 11:54:49 +02:00 committed by GitHub
commit c474616642
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 202 additions and 23668 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,12 +1,17 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Dailymotion (Videos)
"""Dailymotion (Videos)
"""
from json import loads
from datetime import datetime
from typing import Set
from datetime import datetime, timedelta
from urllib.parse import urlencode
from searx.utils import match_language, html_to_text
import time
import babel
from searx.exceptions import SearxEngineAPIException
from searx.network import raise_for_httperror
from searx.utils import html_to_text
# about
about = {
@ -21,23 +26,89 @@ about = {
# engine dependent config
categories = ['videos']
paging = True
number_of_results = 10
time_range_support = True
time_delta_dict = {
"day": timedelta(days=1),
"week": timedelta(days=7),
"month": timedelta(days=31),
"year": timedelta(days=365),
}
safesearch = True
safesearch_params = {2: '&is_created_for_kids=true', 1: '&is_created_for_kids=true', 0: ''}
# search-url
# see http://www.dailymotion.com/doc/api/obj-video.html
search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,description,duration,url,thumbnail_360_url,id&sort=relevance&limit=5&page={pageno}&{query}' # noqa
supported_languages_url = 'https://api.dailymotion.com/languages'
# - https://developers.dailymotion.com/tools/
# - https://www.dailymotion.com/doc/api/obj-video.html
result_fields = [
'allow_embed',
'description',
'title',
'created_time',
'duration',
'url',
'thumbnail_360_url',
'id',
]
search_url = (
'https://api.dailymotion.com/videos?'
'fields={fields}&password_protected={password_protected}&private={private}&sort={sort}&limit={limit}'
).format(
fields=','.join(result_fields),
password_protected= 'false',
private='false',
sort='relevance',
limit=number_of_results,
)
iframe_src = "https://www.dailymotion.com/embed/video/{video_id}"
# The request query filters by 'languages' & 'country', therefore instead of
# fetching only languages we need to fetch locales.
supported_languages_url = 'https://api.dailymotion.com/locales'
supported_languages_iso639: Set[str] = set()
def init(_engine_settings):
global supported_languages_iso639
supported_languages_iso639 = set([language.split('_')[0] for language in supported_languages])
# do search-request
def request(query, params):
if params['language'] == 'all':
locale = 'en-US'
else:
locale = match_language(params['language'], supported_languages)
params['url'] = search_url.format(
query=urlencode({'search': query, 'localization': locale}), pageno=params['pageno']
)
if not query:
return False
language = params['language']
if language == 'all':
language = 'en-US'
locale = babel.Locale.parse(language, sep='-')
language_iso639 = locale.language
if locale.language not in supported_languages_iso639:
language_iso639 = 'en'
query_args = {
'search': query,
'languages': language_iso639,
'page': params['pageno'],
}
if locale.territory:
localization = locale.language + '_' + locale.territory
if localization in supported_languages:
query_args['country'] = locale.territory
time_delta = time_delta_dict.get(params["time_range"])
if time_delta:
created_after = datetime.now() - time_delta
query_args['created_after'] = datetime.timestamp(created_after)
query_str = urlencode(query_args)
params['url'] = search_url + '&' + query_str + safesearch_params.get(params['safesearch'], '')
params['raise_for_httperror'] = False
return params
@ -46,34 +117,51 @@ def request(query, params):
def response(resp):
results = []
search_res = loads(resp.text)
search_res = resp.json()
# return empty array if there are no results
if 'list' not in search_res:
return []
# check for an API error
if 'error' in search_res:
raise SearxEngineAPIException(search_res['error'].get('message'))
raise_for_httperror(resp)
# parse results
for res in search_res['list']:
for res in search_res.get('list', []):
title = res['title']
url = res['url']
content = html_to_text(res['description'])
thumbnail = res['thumbnail_360_url']
if len(content) > 300:
content = content[:300] + '...'
publishedDate = datetime.fromtimestamp(res['created_time'], None)
# http to https
length = time.gmtime(res.get('duration'))
if length.tm_hour:
length = time.strftime("%H:%M:%S", length)
else:
length = time.strftime("%M:%S", length)
thumbnail = res['thumbnail_360_url']
thumbnail = thumbnail.replace("http://", "https://")
results.append(
{
'template': 'videos.html',
'url': url,
'title': title,
'content': content,
'publishedDate': publishedDate,
'iframe_src': "https://www.dailymotion.com/embed/video/" + res['id'],
'thumbnail': thumbnail,
}
)
item = {
'template': 'videos.html',
'url': url,
'title': title,
'content': content,
'publishedDate': publishedDate,
'length': length,
'thumbnail': thumbnail,
}
# HINT: no mater what the value is, without API token videos can't shown
# embedded
if res['allow_embed']:
item['iframe_src'] = iframe_src.format(video_id=res['id'])
results.append(item)
# return results
return results
@ -81,18 +169,8 @@ def response(resp):
# get supported languages from their site
def _fetch_supported_languages(resp):
supported_languages = {}
response_json = loads(resp.text)
for language in response_json['list']:
supported_languages[language['code']] = {}
name = language['native_name']
if name:
supported_languages[language['code']]['name'] = name
english_name = language['name']
if english_name:
supported_languages[language['code']]['english_name'] = english_name
return supported_languages
response_json = resp.json()
return [
item['locale']
for item in response_json['list']
]

View File

@ -2,9 +2,7 @@
# list of language codes
# this file is generated automatically by utils/fetch_languages.py
language_codes = (
('af-ZA', 'Afrikaans', 'Suid-Afrika', 'Afrikaans', '\U0001f1ff\U0001f1e6'),
('ar-EG', 'العربية', 'مصر', 'Arabic', '\U0001f1ea\U0001f1ec'),
('be-BY', 'Беларуская', 'Беларусь', 'Belarusian', '\U0001f1e7\U0001f1fe'),
('bg-BG', 'Български', 'България', 'Bulgarian', '\U0001f1e7\U0001f1ec'),
('ca-ES', 'Català', 'Espanya', 'Catalan', '\U0001f1ea\U0001f1f8'),
('cs-CZ', 'Čeština', 'Česko', 'Czech', '\U0001f1e8\U0001f1ff'),
@ -28,20 +26,15 @@ language_codes = (
('es-ES', 'Español', 'España', 'Spanish', '\U0001f1ea\U0001f1f8'),
('es-MX', 'Español', 'México', 'Spanish', '\U0001f1f2\U0001f1fd'),
('et-EE', 'Eesti', 'Eesti', 'Estonian', '\U0001f1ea\U0001f1ea'),
('fa-IR', 'فارسی', 'ایران', 'Persian', '\U0001f1ee\U0001f1f7'),
('fi-FI', 'Suomi', 'Suomi', 'Finnish', '\U0001f1eb\U0001f1ee'),
('fil-PH', 'Filipino', 'Pilipinas', 'Filipino', '\U0001f1f5\U0001f1ed'),
('fr', 'Français', '', 'French', '\U0001f310'),
('fr-BE', 'Français', 'Belgique', 'French', '\U0001f1e7\U0001f1ea'),
('fr-CA', 'Français', 'Canada', 'French', '\U0001f1e8\U0001f1e6'),
('fr-CH', 'Français', 'Suisse', 'French', '\U0001f1e8\U0001f1ed'),
('fr-FR', 'Français', 'France', 'French', '\U0001f1eb\U0001f1f7'),
('he-IL', 'עברית', 'ישראל', 'Hebrew', '\U0001f1ee\U0001f1f1'),
('hi-IN', 'हिन्दी', 'भारत', 'Hindi', '\U0001f1ee\U0001f1f3'),
('hr-HR', 'Hrvatski', 'Hrvatska', 'Croatian', '\U0001f1ed\U0001f1f7'),
('hu-HU', 'Magyar', 'Magyarország', 'Hungarian', '\U0001f1ed\U0001f1fa'),
('id-ID', 'Indonesia', 'Indonesia', 'Indonesian', '\U0001f1ee\U0001f1e9'),
('is-IS', 'Íslenska', 'Ísland', 'Icelandic', '\U0001f1ee\U0001f1f8'),
('it-IT', 'Italiano', 'Italia', 'Italian', '\U0001f1ee\U0001f1f9'),
('ja-JP', '日本語', '日本', 'Japanese', '\U0001f1ef\U0001f1f5'),
('ko-KR', '한국어', '대한민국', 'Korean', '\U0001f1f0\U0001f1f7'),
@ -63,13 +56,10 @@ language_codes = (
('ru-RU', 'Русский', 'Россия', 'Russian', '\U0001f1f7\U0001f1fa'),
('sk-SK', 'Slovenčina', 'Slovensko', 'Slovak', '\U0001f1f8\U0001f1f0'),
('sl-SI', 'Slovenščina', 'Slovenija', 'Slovenian', '\U0001f1f8\U0001f1ee'),
('sr-RS', 'Српски', 'Србија', 'Serbian', '\U0001f1f7\U0001f1f8'),
('sv-SE', 'Svenska', 'Sverige', 'Swedish', '\U0001f1f8\U0001f1ea'),
('sw-TZ', 'Kiswahili', 'Tanzania', 'Swahili', '\U0001f1f9\U0001f1ff'),
('th-TH', 'ไทย', 'ไทย', 'Thai', '\U0001f1f9\U0001f1ed'),
('tr-TR', 'Türkçe', 'Türkiye', 'Turkish', '\U0001f1f9\U0001f1f7'),
('uk-UA', 'Українська', 'Україна', 'Ukrainian', '\U0001f1fa\U0001f1e6'),
('vi-VN', 'Tiếng Việt', 'Việt Nam', 'Vietnamese', '\U0001f1fb\U0001f1f3'),
('zh', '中文', '', 'Chinese', '\U0001f310'),
('zh-CN', '中文', '中国', 'Chinese', '\U0001f1e8\U0001f1f3'),
('zh-HK', '中文', '中國香港特別行政區', 'Chinese', '\U0001f1ed\U0001f1f0'),

View File

@ -88,6 +88,8 @@ class _HTMLTextExtractor(HTMLParser): # pylint: disable=W0223 # (see https://b
def handle_starttag(self, tag, attrs):
self.tags.append(tag)
if tag == 'br':
self.result.append(' ')
def handle_endtag(self, tag):
if not self.tags:
@ -142,7 +144,7 @@ def html_to_text(html_str: str) -> str:
>>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
'Example'
"""
html_str = html_str.replace('\n', ' ')
html_str = html_str.replace('\n', ' ').replace('\r', ' ')
html_str = ' '.join(html_str.split())
s = _HTMLTextExtractor()
try:

View File

@ -129,8 +129,8 @@ class TestLanguageParser(SearxTestCase):
query = RawTextQuery(':hu-H', [])
self.assertEqual(query.autocomplete_list, [":hu-hu"])
query = RawTextQuery(':v', [])
self.assertEqual(query.autocomplete_list, [':vi', ':tiếng việt', ':việt_nam'])
query = RawTextQuery(':zh-', [])
self.assertEqual(query.autocomplete_list, [':zh-cn', ':zh-hk', ':zh-tw'])
class TestTimeoutParser(SearxTestCase):