Merge pull request #878 from tiekoetter/fix-wikidata

Fix wikidata info box images
This commit is contained in:
Markus Heiser 2022-02-07 10:43:36 +01:00 committed by GitHub
commit ae8e3f3543
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 59 additions and 12 deletions

View File

@ -1,10 +1,11 @@
# SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Wikidata
""" """
Wikidata # pylint: disable=missing-class-docstring
"""
from hashlib import md5
from urllib.parse import urlencode from urllib.parse import urlencode, unquote
from json import loads from json import loads
from dateutil.parser import isoparse from dateutil.parser import isoparse
@ -185,7 +186,51 @@ def response(resp):
return results return results
# URL prefix of the redirecting "Special:FilePath" endpoint on Wikimedia Commons.
_IMG_SRC_DEFAULT_URL_PREFIX = "https://commons.wikimedia.org/wiki/Special:FilePath/"
# URL prefix under which thumbnails are served directly.
_IMG_SRC_NEW_URL_PREFIX = "https://upload.wikimedia.org/wikipedia/commons/thumb/"


def get_thumbnail(img_src):
    """Get Thumbnail image from wikimedia commons

    Images from commons.wikimedia.org are (HTTP) redirected to
    upload.wikimedia.org.  The redirected URL can be calculated by this
    function.

    - https://stackoverflow.com/a/33691240

    :param img_src: image URL or ``None``.  Only URLs whose first
        whitespace-delimited token contains the ``Special:FilePath`` prefix
        are rewritten.
    :returns: the computed ``upload.wikimedia.org`` thumbnail URL, or
        ``img_src`` unchanged when it is empty/``None``, is not a
        ``Special:FilePath`` URL, or carries no usable file name / numeric
        size in its query string (the original URL still works through the
        HTTP redirect, so falling back to it is always safe).
    """
    logger.debug('get_thumbnail(): %s', img_src)

    if not img_src:
        return img_src

    tokens = img_src.split()
    if not tokens or _IMG_SRC_DEFAULT_URL_PREFIX not in tokens[0]:
        return img_src

    # "<file name>?<query>" -- partition() never raises when "?" is missing.
    file_part, _, query = img_src.replace(_IMG_SRC_DEFAULT_URL_PREFIX, "").partition("?")

    # The hash below is computed over the decoded file name with underscores
    # in place of (encoded) spaces.
    file_name = unquote(file_part.replace("%20", "_"))
    if not file_name.strip():
        return img_src

    # The requested size is the value of the first "key=value" pair of the
    # query string; it may or may not be followed by further "&..." pairs.
    eq_pos = query.find("=")
    if eq_pos < 0:
        return img_src
    amp_pos = query.find("&", eq_pos + 1)
    size = query[eq_pos + 1 :] if amp_pos < 0 else query[eq_pos + 1 : amp_pos]
    if not size.isdigit():
        # the thumbnail path needs a pixel width ("<N>px-<name>")
        return img_src

    # SVG files are rendered to PNG: the thumbnail gets an extra ".png".
    thumb_name = file_name + ".png" if file_name.lower().endswith(".svg") else file_name

    # Directory layout: <first hex digit>/<first two hex digits>/ of the
    # md5 hex digest of the file name.
    digest = md5(file_name.encode("utf-8")).hexdigest()
    img_src = f"{_IMG_SRC_NEW_URL_PREFIX}{digest[0]}/{digest[:2]}/{file_name}/{size}px-{thumb_name}"
    logger.debug('get_thumbnail() redirected: %s', img_src)

    return img_src
def get_results(attribute_result, attributes, language): def get_results(attribute_result, attributes, language):
# pylint: disable=too-many-branches
results = [] results = []
infobox_title = attribute_result.get('itemLabel') infobox_title = attribute_result.get('itemLabel')
infobox_id = attribute_result['item'] infobox_id = attribute_result['item']
@ -194,7 +239,7 @@ def get_results(attribute_result, attributes, language):
infobox_attributes = [] infobox_attributes = []
infobox_content = attribute_result.get('itemDescription', []) infobox_content = attribute_result.get('itemDescription', [])
img_src = None img_src = None
img_src_priority = 100 img_src_priority = 0
for attribute in attributes: for attribute in attributes:
value = attribute.get_str(attribute_result, language) value = attribute.get_str(attribute_result, language)
@ -220,8 +265,8 @@ def get_results(attribute_result, attributes, language):
# this attribute is an image. # this attribute is an image.
# replace the current image only if the priority is lower # replace the current image only if the priority is higher
# (the infobox contains only one image). # (the infobox contains only one image).
if attribute.priority < img_src_priority: if attribute.priority > img_src_priority:
img_src = value img_src = get_thumbnail(value)
img_src_priority = attribute.priority img_src_priority = attribute.priority
elif attribute_type == WDGeoAttribute: elif attribute_type == WDGeoAttribute:
# geocoordinate link # geocoordinate link
@ -278,6 +323,7 @@ def get_query(query, language):
def get_attributes(language): def get_attributes(language):
# pylint: disable=too-many-statements
attributes = [] attributes = []
def add_value(name): def add_value(name):
@ -418,7 +464,7 @@ def get_attributes(language):
class WDAttribute: class WDAttribute:
# pylint: disable=no-self-use
__slots__ = ('name',) __slots__ = ('name',)
def __init__(self, name): def __init__(self, name):
@ -439,7 +485,7 @@ class WDAttribute:
def get_group_by(self): def get_group_by(self):
return "" return ""
def get_str(self, result, language): def get_str(self, result, language): # pylint: disable=unused-argument
return result.get(self.name + 's') return result.get(self.name + 's')
def __repr__(self): def __repr__(self):
@ -580,6 +626,7 @@ class WDImageAttribute(WDURLAttribute):
class WDDateAttribute(WDAttribute): class WDDateAttribute(WDAttribute):
# pylint: disable=no-self-use
def get_select(self): def get_select(self):
return '?{name} ?{name}timePrecision ?{name}timeZone ?{name}timeCalendar'.replace('{name}', self.name) return '?{name} ?{name}timePrecision ?{name}timeZone ?{name}timeCalendar'.replace('{name}', self.name)
@ -600,7 +647,7 @@ class WDDateAttribute(WDAttribute):
def get_group_by(self): def get_group_by(self):
return self.get_select() return self.get_select()
def format_8(self, value, locale): def format_8(self, value, locale): # pylint: disable=unused-argument
# precision: less than a year # precision: less than a year
return value return value
@ -673,7 +720,7 @@ class WDDateAttribute(WDAttribute):
else: else:
value = t[0] value = t[0]
return format_method(value, language) return format_method(value, language)
except Exception: except Exception: # pylint: disable=broad-except
return value return value
return value return value
@ -687,7 +734,7 @@ def debug_explain_wikidata_query(query, method='GET'):
return http_response.content return http_response.content
def init(engine_settings=None): def init(engine_settings=None): # pylint: disable=unused-argument
# WIKIDATA_PROPERTIES : add unit symbols # WIKIDATA_PROPERTIES : add unit symbols
WIKIDATA_PROPERTIES.update(WIKIDATA_UNITS) WIKIDATA_PROPERTIES.update(WIKIDATA_UNITS)