[mod] declare TypedDict of weather forecast types.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
Markus Heiser 2022-09-19 18:26:52 +02:00
parent 627dfc2af6
commit e7ff672f3a
4 changed files with 388 additions and 0 deletions

432
searx/results/__init__.py Normal file
View file

@ -0,0 +1,432 @@
import re
from collections import defaultdict
from operator import itemgetter
from threading import RLock
from typing import List, NamedTuple, Set
from urllib.parse import urlparse, unquote
from searx import logger
from searx.engines import engines
from searx.metrics import histogram_observe, counter_add, count_error
# Characters that do not count towards the "meaningful" length of a content
# string: common punctuation, slash, backslash, space, parentheses, hyphen and
# underscore.  The hyphen is escaped on purpose: an unescaped ``)-_`` inside a
# character class is a *range* from ")" to "_" which also matches digits,
# upper-case letters and several other symbols.
CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()\-_]', re.M | re.U)

# One or more consecutive spaces, tabs or newlines.
WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)
def result_content_len(content):
    """Return the meaningful length of the content of a result.

    The length is the number of characters left once every character matched
    by ``CONTENT_LEN_IGNORED_CHARS_REGEX`` has been removed.  Anything that is
    not a ``str`` has a length of 0.
    """
    if not isinstance(content, str):
        return 0
    stripped = CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content)
    return len(stripped)
def compare_urls(url_a, url_b):
    """Loosely compare two parsed URLs.

    The scheme is not compared, a leading ``www.`` of the host is ignored, one
    trailing ``/`` of the path is ignored and the paths are compared after
    percent-decoding.  Query and fragment have to be identical:

    - "www.example.com" and "example.com" are equal.
    - "www.example.com/path/" and "www.example.com/path" are equal.
    - "https://www.example.com/" and "http://www.example.com/" are equal.

    Args:
        url_a (ParseResult): first URL
        url_b (ParseResult): second URL

    Returns:
        bool: True if url_a and url_b are considered equal
    """

    def _host(netloc):
        # drop exactly one leading "www."
        return netloc[4:] if netloc.startswith('www.') else netloc

    def _path(path):
        # drop exactly one trailing "/"
        return path[:-1] if path.endswith('/') else path

    if _host(url_a.netloc) != _host(url_b.netloc):
        return False
    if url_a.query != url_b.query:
        return False
    if url_a.fragment != url_b.fragment:
        return False

    return unquote(_path(url_a.path)) == unquote(_path(url_b.path))
def merge_two_infoboxes(infobox1, infobox2):
    """Merge ``infobox2`` into ``infobox1``; ``infobox1`` is modified in place.

    - ``engine``: taken from ``infobox2`` if its engine has the higher weight
      (an engine without a ``weight`` attribute counts as weight 1).
    - ``engines``: union of both sets.
    - ``urls``: URLs of ``infobox2`` are appended unless ``infobox1`` already
      has an URL with the same ``entity`` or an equal URL (see
      :py:func:`compare_urls`).
    - ``img_src``: taken from ``infobox2`` if ``infobox1`` has none or if the
      engine of ``infobox2`` has the higher weight.
    - ``attributes``: attributes of ``infobox2`` are appended unless their
      ``label`` or ``entity`` is already known from ``infobox1``.
    - ``content``: the content with the greater meaningful length wins (see
      :py:func:`result_content_len`).

    Args:
        infobox1 (dict): infobox that receives the merged data
        infobox2 (dict): infobox merged into ``infobox1``
    """
    weight1 = getattr(engines[infobox1['engine']], 'weight', 1)
    weight2 = getattr(engines[infobox2['engine']], 'weight', 1)
    infobox2_preferred = weight2 > weight1

    if infobox2_preferred:
        infobox1['engine'] = infobox2['engine']

    infobox1['engines'] |= infobox2['engines']

    if 'urls' in infobox2:
        urls1 = infobox1.get('urls')
        if urls1 is None:
            urls1 = []

        for url2 in infobox2['urls']:
            parsed_url2 = urlparse(url2.get('url', ''))
            entity_url2 = url2.get('entity')
            # urls1 grows while looping, so an URL of infobox2 is also
            # compared with the URLs of infobox2 that were appended before
            already_known = any(
                (entity_url2 is not None and url1.get('entity') == entity_url2)
                or compare_urls(urlparse(url1.get('url', '')), parsed_url2)
                for url1 in urls1
            )
            if not already_known:
                urls1.append(url2)

        infobox1['urls'] = urls1

    if 'img_src' in infobox2:
        if infobox1.get('img_src') is None or infobox2_preferred:
            infobox1['img_src'] = infobox2['img_src']

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes')
        if attributes1 is None:
            infobox1['attributes'] = attributes1 = []

        # Labels and entities already present in infobox1.  A missing label or
        # entity (None) must not be recorded: otherwise one attribute without
        # an 'entity' in infobox1 would reject every attribute of infobox2
        # that has no 'entity' either.
        known_keys = set()
        for attribute in attributes1:
            for key in (attribute.get('label'), attribute.get('entity')):
                if key is not None:
                    known_keys.add(key)

        for attribute in infobox2['attributes']:
            if attribute.get('label') in known_keys or attribute.get('entity') in known_keys:
                continue
            attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content')
        content2 = infobox2['content']
        if content1 is None or result_content_len(content2) > result_content_len(content1):
            infobox1['content'] = content2
def result_score(result):
    """Compute the score of a merged result.

    The weight of the result is the product of the ``weight`` attributes of
    all engines listed in ``result['engines']`` (engines without a ``weight``
    attribute are skipped).  Each position in ``result['positions']`` then
    contributes ``(number of positions * weight) / position`` to the score.
    """
    weight = 1.0
    for engine_name in result['engines']:
        engine = engines[engine_name]
        if hasattr(engine, 'weight'):
            weight *= float(engine.weight)

    positions = result['positions']
    occurrences = len(positions)
    return sum((occurrences * weight) / position for position in positions)
class Timing(NamedTuple):
    """Timing record of one engine (see ``ResultContainer.add_timing``)."""

    # name of the engine
    engine: str
    # value passed as ``engine_time`` to ``ResultContainer.add_timing``
    total: float
    # value passed as ``page_load_time`` to ``ResultContainer.add_timing``
    load: float
class UnresponsiveEngine(NamedTuple):
    """Record of an engine that did not respond properly (see
    ``ResultContainer.add_unresponsive_engine``)."""

    # name of the engine
    engine: str
    # kind of error reported for the engine
    error_type: str
    # suspended flag passed to ``ResultContainer.add_unresponsive_engine``
    suspended: bool
class ResultContainer:
    """Container that collects the results of the engines.

    Results are added per engine by :py:obj:`ResultContainer.extend`.  Standard
    results (results with an ``url``) are normalized and merged with duplicates
    found by other engines.  :py:obj:`ResultContainer.close` scores, sorts and
    groups the merged results; after that, :py:obj:`ResultContainer.extend`
    ignores further results.
    """

    __slots__ = (
        '_merged_results',
        'infoboxes',
        'suggestions',
        'answers',
        'corrections',
        '_number_of_results',
        '_closed',
        'paging',
        'unresponsive_engines',
        'timings',
        'redirect_url',
        'engine_data',
        'on_result',
        '_lock',
    )

    def __init__(self):
        super().__init__()
        self._merged_results = []
        self.infoboxes = []
        self.suggestions = set()
        self.answers = {}
        self.corrections = set()
        self._number_of_results = []
        self.engine_data = defaultdict(dict)
        self._closed = False
        self.paging = False
        self.unresponsive_engines: Set[UnresponsiveEngine] = set()
        self.timings: List[Timing] = []
        self.redirect_url = None
        # callback that can veto a result; by default every result is accepted
        self.on_result = lambda _: True
        self._lock = RLock()

    def extend(self, engine_name, results):
        """Add the ``results`` of the engine ``engine_name`` to the container.

        Does nothing if the container is already closed.  Each result (a dict)
        gets ``result['engine'] = engine_name`` and is dispatched by the first
        key found: ``suggestion``, ``answer``, ``correction``, ``infobox``,
        ``number_of_results``, ``engine_data`` or ``url``; everything else is
        stored as a result without URL.  A result is dropped if the
        ``on_result`` callback returns a false value.
        """
        if self._closed:
            return

        standard_result_count = 0
        error_msgs = set()
        for result in list(results):
            result['engine'] = engine_name
            if 'suggestion' in result and self.on_result(result):
                self.suggestions.add(result['suggestion'])
            elif 'answer' in result and self.on_result(result):
                self.answers[result['answer']] = result
            elif 'correction' in result and self.on_result(result):
                self.corrections.add(result['correction'])
            elif 'infobox' in result and self.on_result(result):
                self._merge_infobox(result)
            elif 'number_of_results' in result and self.on_result(result):
                self._number_of_results.append(result['number_of_results'])
            elif 'engine_data' in result and self.on_result(result):
                self.engine_data[engine_name][result['key']] = result['engine_data']
            elif 'url' in result:
                # standard result (url, title, content)
                if not self._is_valid_url_result(result, error_msgs):
                    continue
                # normalize the result
                self._normalize_url_result(result)
                # on_result is searx.search.SearchWithPlugins._on_result
                # which calls the plugins
                if not self.on_result(result):
                    continue
                self.__merge_url_result(result, standard_result_count + 1)
                standard_result_count += 1
            elif self.on_result(result):
                self.__merge_result_no_url(result, standard_result_count + 1)
                standard_result_count += 1

        for msg in error_msgs:
            count_error(engine_name, 'some results are invalids: ' + msg, secondary=True)

        if engine_name in engines:
            histogram_observe(standard_result_count, 'engine', engine_name, 'result', 'count')

        if not self.paging and standard_result_count > 0 and engine_name in engines and engines[engine_name].paging:
            self.paging = True

    def _merge_infobox(self, infobox):
        """Merge ``infobox`` into every known infobox with an equal ``id`` (the
        ids are compared as URLs) or append it if there is no such infobox."""
        add_infobox = True
        infobox_id = infobox.get('id', None)
        infobox['engines'] = {infobox['engine']}
        if infobox_id is not None:
            parsed_url_infobox_id = urlparse(infobox_id)
            with self._lock:
                for existing in self.infoboxes:
                    if compare_urls(urlparse(existing.get('id', '')), parsed_url_infobox_id):
                        merge_two_infoboxes(existing, infobox)
                        add_infobox = False

        if add_infobox:
            self.infoboxes.append(infobox)

    def _is_valid_url_result(self, result, error_msgs):
        """Return True if ``url``, ``title`` and ``content`` are strings (where
        present).  For the first invalid field a message is added to the set
        ``error_msgs`` and False is returned."""
        if 'url' in result and not isinstance(result['url'], str):
            logger.debug('result: invalid URL: %s', str(result))
            error_msgs.add('invalid URL')
            return False

        if 'title' in result and not isinstance(result['title'], str):
            logger.debug('result: invalid title: %s', str(result))
            error_msgs.add('invalid title')
            return False

        if 'content' in result and not isinstance(result['content'], str):
            logger.debug('result: invalid content: %s', str(result))
            error_msgs.add('invalid content')
            return False

        return True

    def _normalize_url_result(self, result):
        """Normalize a standard result in place (returns nothing).

        - add ``parsed_url`` and default to the ``http`` scheme if the URL has none
        - remove ``content`` if it is equal to ``title``
        - set the ``template`` to ``default.html`` if there is none
        - collapse runs of whitespace in ``content`` into one space
        """
        result['parsed_url'] = urlparse(result['url'])

        # if the result has no scheme, use http as default
        if not result['parsed_url'].scheme:
            result['parsed_url'] = result['parsed_url']._replace(scheme="http")
            result['url'] = result['parsed_url'].geturl()

        # avoid duplicate content between the content and title fields; pop()
        # instead of del since both fields may be missing (None == None)
        if result.get('content') == result.get('title'):
            result.pop('content', None)

        # make sure there is a template
        if 'template' not in result:
            result['template'] = 'default.html'

        # strip multiple spaces and carriage returns from content
        if result.get('content'):
            result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])

    def __merge_url_result(self, result, position):
        """Merge a standard result into its duplicate or, if there is none,
        append it to the merged results with ``position`` as first position."""
        result['engines'] = {result['engine']}
        with self._lock:
            duplicated = self.__find_duplicated_http_result(result)
            if duplicated:
                self.__merge_duplicated_http_result(duplicated, result, position)
                return

            # if there is no duplicate found, append result
            result['positions'] = [position]
            self._merged_results.append(result)

    def __find_duplicated_http_result(self, result):
        """Return the merged result that is a duplicate of ``result`` or None."""
        result_template = result.get('template')
        for merged_result in self._merged_results:
            if 'parsed_url' not in merged_result:
                continue
            if not compare_urls(result['parsed_url'], merged_result['parsed_url']):
                continue
            if result_template != merged_result.get('template'):
                continue
            if result_template != 'images.html':
                # not an image, same template, same url: it's a duplicate
                return merged_result
            # it's an image: it is a duplicate only if the img_src is the same, too
            if result.get('img_src', '') == merged_result.get('img_src', ''):
                return merged_result
        return None

    def __merge_duplicated_http_result(self, duplicated, result, position):
        """Merge ``result`` into the already known result ``duplicated``."""
        # using content with more text
        if result_content_len(result.get('content', '')) > result_content_len(duplicated.get('content', '')):
            duplicated['content'] = result['content']

        # merge all result's parameters not found in duplicate
        for key, value in result.items():
            if not duplicated.get(key):
                duplicated[key] = value

        # add the new position
        duplicated['positions'].append(position)

        # add engine to list of result-engines
        duplicated['engines'].add(result['engine'])

        # using https if possible
        if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
            duplicated['url'] = result['parsed_url'].geturl()
            duplicated['parsed_url'] = result['parsed_url']

    def __merge_result_no_url(self, result, position):
        """Append a result without URL; such results are never de-duplicated."""
        result['engines'] = {result['engine']}
        result['positions'] = [position]
        with self._lock:
            self._merged_results.append(result)

    def close(self):
        """Close the container: score, sort and group the merged results.

        Pass 1 computes the score of each result (:py:func:`result_score`) and
        adds it to the ``score`` counter of each of its engines.  Pass 2 walks
        the results by descending score and groups them by category, template
        and "has an image": a result is inserted right behind the latest group
        of its kind as long as this group accepts more results (8 after the
        one that opened the group) and its insert position is less than 20
        entries away from the end of the list; otherwise the result is
        appended and opens a new group.
        """
        self._closed = True

        for result in self._merged_results:
            score = result_score(result)
            result['score'] = score
            for result_engine in result['engines']:
                counter_add(score, 'engine', result_engine, 'score')

        results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

        # pass 2 : group results by category and template
        gresults = []
        category_positions = {}

        for res in results:
            # FIXME : handle more than one category per engine
            engine = engines[res['engine']]
            res['category'] = engine.categories[0] if len(engine.categories) > 0 else ''

            # FIXME : handle more than one category per engine
            category = (
                res['category']
                + ':'
                + res.get('template', '')
                + ':'
                + ('img_src' if 'img_src' in res or 'thumbnail' in res else '')
            )

            current = category_positions.get(category)

            # group with previous results using the same category
            # if the group can accept more result and is not too far
            # from the current position
            if current is not None and (current['count'] > 0) and (len(gresults) - current['index'] < 20):
                # group with the previous results using
                # the same category with this one
                index = current['index']
                gresults.insert(index, res)

                # update every index after the current one
                # (including the current one)
                for group in category_positions.values():
                    if group['index'] >= index:
                        group['index'] += 1

                # update this category
                current['count'] -= 1

            else:
                # no group to join: append the result and open a new group
                gresults.append(res)
                category_positions[category] = {'index': len(gresults), 'count': 8}

        # update _merged_results
        self._merged_results = gresults

    def get_ordered_results(self):
        """Return the merged results; closes the container if not yet done."""
        if not self._closed:
            self.close()
        return self._merged_results

    def results_length(self):
        """Return the number of merged results."""
        return len(self._merged_results)

    def results_number(self):
        """Return the average of the ``number_of_results`` values reported by
        the engines, or 0 if nothing was reported (or the sum is 0)."""
        resultnum_sum = sum(self._number_of_results)
        if not resultnum_sum or not self._number_of_results:
            return 0
        return resultnum_sum / len(self._number_of_results)

    def add_unresponsive_engine(self, engine_name: str, error_type: str, suspended: bool = False):
        """Register an unresponsive engine; ignored if the engine has
        ``display_error_messages`` disabled."""
        if engines[engine_name].display_error_messages:
            self.unresponsive_engines.add(UnresponsiveEngine(engine_name, error_type, suspended))

    def add_timing(self, engine_name: str, engine_time: float, page_load_time: float):
        """Record the timing of an engine."""
        self.timings.append(Timing(engine_name, total=engine_time, load=page_load_time))

    def get_timings(self):
        """Return the list of recorded timings."""
        return self.timings

379
searx/results/weather.py Normal file
View file

@ -0,0 +1,379 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Model of weather and forecast data. For more information about Climate and
Forecast (CF) visit:
- `NetCDF Climate and Forecast (CF) Metadata Conventions`_
- `Units of measure (CF)`_
- `CF Standard Name Table`_
.. _NetCDF Climate and Forecast (CF) Metadata Conventions:
https://cfconventions.org/cf-conventions/cf-conventions.html
.. _Units of measure (CF):
https://github.com/SciTools/cf-units
.. _CF Standard Name Table:
https://cfconventions.org/Data/cf-standard-names/29/build/cf-standard-name-table.html
.. sidebar:: met.no API
Example of a get_complete_ request to get weather data of Paris: `api.met.no
(lat:48.86 lon:2.35)`_
The weather data types :py:obj:`WeatherInstantType`, :py:obj:`WeatherPeriodType`
and :py:obj:`WeatherSummaryType` are based on the types of the WeatherAPI_ from
the `Norwegian Meteorological Institute`_ (aka met.no).
.. _Norwegian Meteorological Institute: https://www.met.no/en
.. _WeatherAPI: https://api.met.no/doc/
.. _`api.met.no (lat:48.86 lon:2.35)`: https://api.met.no/weatherapi/locationforecast/2.0/complete?lat=48.86&lon=2.35
.. _get_complete: https://api.met.no/weatherapi/locationforecast/2.0#!/data/get_complete
.. admonition:: data model is still under construction
This weather data model is in a very early stage. Declaring the inheritance of
weather data types and assembling a result type for a result item of a
weather-engine needs far more experience. We should not finish the model
before we have more than one weather engine.
In the meantime the weather return item is described in :ref:`engine weather
media types`.
TypedDict declarations
----------------------
Declared data types for `dict` types returned by weather engines:
.. inheritance-diagram:: WeatherSummaryType
:caption: Weather type that sums weather condition for a specific time period.
.. inheritance-diagram:: WeatherInstantType
:caption: Type of weather data valid for a specific point in time.
.. inheritance-diagram:: WeatherPeriodType
:caption: Type of weather data valid for a specific time period.
.. inheritance-diagram:: TemperatureType
:caption: Type of thermodynamic temperature.
Class definitions
------------------
.. inheritance-diagram:: WeatherSummary
:caption: class of objects built from :py:class:`WeatherSummaryType`
"""
# pylint: disable=too-few-public-methods
from typing import Union
from typing_extensions import TypedDict, NotRequired
class TemperatureType(TypedDict):
    """Thermodynamic temperature, given in one or more units.  All keys are
    optional and a value of ``None`` is equivalent to *unset*.
    """

    # pylint: disable=invalid-name

    K: NotRequired[Union[float, None]]
    """Temperature in Kelvin (unit K)"""

    C: NotRequired[Union[float, None]]
    """Temperature in degree Celsius (unit °C)

    Unit (and scale) of temperature with the same magnitude as the Kelvin and a
    zero-point offset of 273.15.
    """

    F: NotRequired[Union[float, None]]
    """Temperature in degree Fahrenheit (unit °F)

    Unit of thermodynamic temperature with the same magnitude as the degree
    Rankine (°R) and a zero-point offset of 459.67.
    """
class WeatherSummaryType(TypedDict):
    """Type of the data a :py:obj:`WeatherSummary` is built from."""

    symbol_code: str
    """Code of the weather symbol.  A `list of symbols`_ is available from
    `Yr weather symbols`_.

    .. _Yr weather symbols: https://nrkno.github.io/yr-weather-symbols/
    .. _list of symbols: https://api.met.no/weatherapi/weathericon/2.0/documentation#List_of_symbols
    """
class WeatherSummary:
    """An identifier that sums up the weather condition for *this* time period."""

    # https://api.met.no/weatherapi/weathericon/2.0/legends
    #
    # NOTE(review): the doubled "s" in the keys "lightssleetshowersandthunder"
    # and "lightssnowshowersandthunder" presumably mirrors the symbol codes of
    # the upstream legend -- confirm against the URL above before "fixing" it.
    legend = {
        "clearsky": {
            "desc_en": "Clear sky",
            "variants": ["day", "night", "polartwilight"],
        },
        "cloudy": {"desc_en": "Cloudy", "variants": []},
        "fair": {
            "desc_en": "Fair",
            "variants": ["day", "night", "polartwilight"],
        },
        "fog": {"desc_en": "Fog", "variants": []},
        "heavyrain": {"desc_en": "Heavy rain", "variants": []},
        "heavyrainandthunder": {"desc_en": "Heavy rain and thunder", "variants": []},
        "heavyrainshowers": {
            "desc_en": "Heavy rain showers",
            "variants": ["day", "night", "polartwilight"],
        },
        "heavyrainshowersandthunder": {
            "desc_en": "Heavy rain showers and thunder",
            "variants": ["day", "night", "polartwilight"],
        },
        "heavysleet": {"desc_en": "Heavy sleet", "variants": []},
        "heavysleetandthunder": {"desc_en": "Heavy sleet and thunder", "variants": []},
        "heavysleetshowers": {
            "desc_en": "Heavy sleet showers",
            "variants": ["day", "night", "polartwilight"],
        },
        "heavysleetshowersandthunder": {
            "desc_en": "Heavy sleet showers and thunder",
            "variants": ["day", "night", "polartwilight"],
        },
        "heavysnow": {"desc_en": "Heavy snow", "variants": []},
        "heavysnowandthunder": {"desc_en": "Heavy snow and thunder", "variants": []},
        "heavysnowshowers": {
            "desc_en": "Heavy snow showers",
            "variants": ["day", "night", "polartwilight"],
        },
        "heavysnowshowersandthunder": {
            "desc_en": "Heavy snow showers and thunder",
            "variants": ["day", "night", "polartwilight"],
        },
        "lightrain": {"desc_en": "Light rain", "variants": []},
        "lightrainandthunder": {"desc_en": "Light rain and thunder", "variants": []},
        "lightrainshowers": {
            "desc_en": "Light rain showers",
            "variants": ["day", "night", "polartwilight"],
        },
        "lightrainshowersandthunder": {
            "desc_en": "Light rain showers and thunder",
            "variants": ["day", "night", "polartwilight"],
        },
        "lightsleet": {"desc_en": "Light sleet", "variants": []},
        "lightsleetandthunder": {"desc_en": "Light sleet and thunder", "variants": []},
        "lightsleetshowers": {
            "desc_en": "Light sleet showers",
            "variants": ["day", "night", "polartwilight"],
        },
        "lightsnow": {"desc_en": "Light snow", "variants": []},
        "lightsnowandthunder": {"desc_en": "Light snow and thunder", "variants": []},
        "lightsnowshowers": {
            "desc_en": "Light snow showers",
            "variants": ["day", "night", "polartwilight"],
        },
        "lightssleetshowersandthunder": {
            "desc_en": "Light sleet showers and thunder",
            "variants": ["day", "night", "polartwilight"],
        },
        "lightssnowshowersandthunder": {
            "desc_en": "Light snow showers and thunder",
            "variants": ["day", "night", "polartwilight"],
        },
        "partlycloudy": {
            "desc_en": "Partly cloudy",
            "variants": ["day", "night", "polartwilight"],
        },
        "rain": {"desc_en": "Rain", "variants": []},
        "rainandthunder": {"desc_en": "Rain and thunder", "variants": []},
        "rainshowers": {
            "desc_en": "Rain showers",
            "variants": ["day", "night", "polartwilight"],
        },
        "rainshowersandthunder": {
            "desc_en": "Rain showers and thunder",
            "variants": ["day", "night", "polartwilight"],
        },
        "sleet": {"desc_en": "Sleet", "variants": []},
        "sleetandthunder": {"desc_en": "Sleet and thunder", "variants": []},
        "sleetshowers": {
            "desc_en": "Sleet showers",
            "variants": ["day", "night", "polartwilight"],
        },
        "sleetshowersandthunder": {
            "desc_en": "Sleet showers and thunder",
            "variants": ["day", "night", "polartwilight"],
        },
        "snow": {"desc_en": "Snow", "variants": []},
        "snowandthunder": {"desc_en": "Snow and thunder", "variants": []},
        "snowshowers": {
            "desc_en": "Snow showers",
            "variants": ["day", "night", "polartwilight"],
        },
        "snowshowersandthunder": {
            "desc_en": "Snow showers and thunder",
            "variants": ["day", "night", "polartwilight"],
        },
    }
    """Legend of the `Yr weather symbols`_.  The *key* is the name of the icon,
    the value is a dict with the English description of the icon and the list
    of variants the icon is available in (empty if there are none).

    .. code::

        "clearsky": {
            "desc_en": "Clear sky",
            "variants": ["day", "night", "polartwilight"],
        },

    :meta hide-value:
    """

    def __init__(self, data: WeatherSummaryType):
        # the data is stored as-is, nothing is validated or copied
        self.data = data
class WeatherInstantType(TypedDict):
    """Weather parameters that are valid for a specific point in time (the
    *instant* case).  All keys are optional and a value of ``None`` is
    equivalent to *unset*.
    """

    air_pressure_at_sea_level: NotRequired[Union[float, None]]
    """Air pressure at sea level (unit: hPa)"""

    air_temperature: NotRequired[Union[TemperatureType, None]]
    """Air temperature at 2m above the ground"""

    air_temperature_percentile_10: NotRequired[Union[TemperatureType, None]]
    """10th percentile of air temperature (i.e. 90% chance it will be above this
    value)"""

    air_temperature_percentile_90: NotRequired[Union[TemperatureType, None]]
    """90th percentile of air temperature (i.e. 10% chance it will be above this
    value)"""

    cloud_area_fraction: NotRequired[Union[float, None]]
    """Amount of sky covered by clouds / total cloud cover for all heights
    (cloudiness, unit: %)"""

    cloud_area_fraction_high: NotRequired[Union[float, None]]
    """Amount of sky covered by clouds at high elevation / cloud cover higher
    than 5000m above the ground (cloudiness, unit: %)"""

    cloud_area_fraction_low: NotRequired[Union[float, None]]
    """Amount of sky covered by clouds at low elevation / cloud cover lower than
    2000m above the ground (cloudiness, unit: %)"""

    cloud_area_fraction_medium: NotRequired[Union[float, None]]
    """Amount of sky covered by clouds at medium elevation / cloud cover between
    2000 and 5000m above the ground (cloudiness, unit: %)"""

    dew_point_temperature: NotRequired[Union[TemperatureType, None]]
    """Dew point temperature at sea level / dew point temperature 2m above the
    ground."""

    fog_area_fraction: NotRequired[Union[float, None]]
    """Amount of area covered by fog / amount of surrounding area covered in fog
    (horizontal view under 1000 meters, unit: %)"""

    relative_humidity: NotRequired[Union[float, None]]
    """Amount of humidity in the air / relative humidity at 2m above the ground
    (unit: %)"""

    wind_from_direction: NotRequired[Union[float, None]]
    """Direction the wind is coming from (unit: degrees, 0° is north, 90° east,
    etc.)"""

    wind_speed: NotRequired[Union[float, None]]
    """Speed of wind / wind speed at 10m above the ground (10 min average, unit:
    m/s)"""

    wind_speed_of_gust: NotRequired[Union[float, None]]
    """Speed of wind gust / maximum gust for period at 10m above the ground.
    Gust is wind speed averaged over 3s."""

    wind_speed_percentile_10: NotRequired[Union[float, None]]
    """10th percentile of wind speed at 10m above the ground (10 min average,
    unit: m/s)"""

    wind_speed_percentile_90: NotRequired[Union[float, None]]
    """90th percentile of wind speed at 10m above the ground (10 min average,
    unit: m/s)"""
class WeatherPeriodType(TypedDict):
    """Weather parameters that are valid for a specified time period.  All keys
    are optional and a value of ``None`` is equivalent to *unset*.
    """

    air_temperature_max: NotRequired[Union[TemperatureType, None]]
    """Maximum air temperature in the period."""

    air_temperature_min: NotRequired[Union[TemperatureType, None]]
    """Minimum air temperature in the period."""

    precipitation_amount: NotRequired[Union[float, None]]
    """Best estimate of the amount of precipitation for this period (unit: mm)."""

    precipitation_amount_max: NotRequired[Union[float, None]]
    """Maximum amount of precipitation for this period (unit: mm)."""

    precipitation_amount_min: NotRequired[Union[float, None]]
    """Minimum amount of precipitation for this period (unit: mm)."""

    probability_of_precipitation: NotRequired[Union[float, None]]
    """Probability of any precipitation coming for this period / chance of
    precipitation during the period (unit: %)."""

    probability_of_thunder: NotRequired[Union[float, None]]
    """Probability of any thunder coming for this period / chance of thunder
    during the period (unit: %)."""

    ultraviolet_index_clear_sky_max: NotRequired[Union[int, None]]
    """Maximum ultraviolet index if the sky is clear / index for cloud free
    conditions; 0 (low) to 11+ (extreme)."""