From 87e7926ae96bc394427859c3688037c0d1710230 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Wed, 28 Jun 2023 09:16:49 +0200 Subject: [PATCH] [fix] engine: Anna's Archive - grep results from '.js-scroll-hidden' elements The rendering of the WEB page is very strange; except the first position all other positions of Anna's result page are enclosed in SGML comments. These comments are *uncommented* by some JS code, see query of the class '.js-scroll-hidden' in Anna's HTML template [1]. [1] https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html Signed-off-by: Markus Heiser --- searx/engines/annas_archive.py | 42 ++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/searx/engines/annas_archive.py b/searx/engines/annas_archive.py index cebc8d45c..db9bd1719 100644 --- a/searx/engines/annas_archive.py +++ b/searx/engines/annas_archive.py @@ -97,14 +97,6 @@ aa_ext: str = '' """ -# xpath queries -xpath_results: str = '//main//a[starts-with(@href,"/md5")]' -xpath_url: str = ".//@href" -xpath_title: str = ".//h3/text()[1]" -xpath_authors: str = './/div[contains(@class, "italic")]' -xpath_publisher: str = './/div[contains(@class, "text-sm")]' -xpath_file_info: str = './/div[contains(@class, "text-xs")]' - def init(engine_settings=None): # pylint: disable=unused-argument """Check of engine's settings.""" @@ -131,24 +123,34 @@ def response(resp) -> List[Dict[str, Optional[str]]]: results: List[Dict[str, Optional[str]]] = [] dom = html.fromstring(resp.text) - for item in dom.xpath(xpath_results): - result: Dict[str, Optional[str]] = {} + for item in eval_xpath_list(dom, '//main//div[contains(@class, "h-[125]")]/a'): + results.append(_get_result(item)) - result["url"] = base_url + item.xpath(xpath_url)[0] + # The rendering of the WEB page is very strange; except the first position + # all other positions of Anna's result page are enclosed in SGML comments. 
+ # These comments are *uncommented* by some JS code, see query of class + # '.js-scroll-hidden' in Anna's HTML template: + # https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html - result["title"] = extract_text(eval_xpath(item, xpath_title)) - - result["content"] = "{publisher}. {authors}. {file_info}".format( - authors=extract_text(eval_xpath(item, xpath_authors)), - publisher=extract_text(eval_xpath(item, xpath_publisher)), - file_info=extract_text(eval_xpath(item, xpath_file_info)), - ) - - results.append(result) + for item in eval_xpath_list(dom, '//main//div[contains(@class, "js-scroll-hidden")]'): + item = html.fromstring(item.xpath('./comment()')[0].text) + results.append(_get_result(item)) return results +def _get_result(item): + return { + 'template': 'paper.html', + 'url': base_url + item.xpath('./@href')[0], + 'title': extract_text(eval_xpath(item, './/h3/text()[1]')), + 'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')), + 'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))], + 'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')), + 'img_src': item.xpath('.//img/@src')[0], + } + + def fetch_traits(engine_traits: EngineTraits): """Fetch languages and other search arguments from Anna's search form.""" # pylint: disable=import-outside-toplevel