[fix] ina engine

based on a45408e8e2
This commit is contained in:
Alexandre Flament 2022-01-28 22:13:48 +01:00
parent 8f100d7046
commit 116802852d
1 changed file with 18 additions and 31 deletions

View File

@@ -3,12 +3,10 @@
INA (Videos) INA (Videos)
""" """
from json import loads
from html import unescape from html import unescape
from urllib.parse import urlencode from urllib.parse import urlencode
from lxml import html from lxml import html
from dateutil import parser from searx.utils import extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
from searx.utils import extract_text
# about # about
about = { about = {
@@ -24,25 +22,24 @@ about = {
# engine dependent config # engine dependent config
categories = ['videos'] categories = ['videos']
paging = True paging = True
page_size = 48 page_size = 12
# search-url # search-url
base_url = 'https://www.ina.fr' base_url = 'https://www.ina.fr'
search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}' search_url = base_url + '/ajax/recherche?{query}&espace=1&sort=pertinence&order=desc&offset={start}&modified=size'
# specific xpath variables # specific xpath variables
results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]' results_xpath = '//div[@id="searchHits"]/div'
url_xpath = './/a/@href' url_xpath = './/a/@href'
title_xpath = './/h3[@class="h3--title media-heading"]' title_xpath = './/div[contains(@class,"title-bloc-small")]'
thumbnail_xpath = './/img/@src' content_xpath = './/div[contains(@class,"sous-titre-fonction")]'
publishedDate_xpath = './/span[@class="broadcast"]' thumbnail_xpath = './/img/@data-src'
content_xpath = './/p[@class="media-body__summary"]' publishedDate_xpath = './/div[contains(@class,"dateAgenda")]'
# do search-request # do search-request
def request(query, params): def request(query, params):
params['url'] = search_url.format(ps=page_size, start=params['pageno'] * page_size, query=urlencode({'q': query})) params['url'] = search_url.format(start=params['pageno'] * page_size, query=urlencode({'q': query}))
return params return params
@@ -51,26 +48,17 @@ def response(resp):
results = [] results = []
# we get html in a JSON container... # we get html in a JSON container...
response = loads(resp.text) dom = html.fromstring(resp.text)
dom = html.fromstring(response)
# parse results # parse results
for result in dom.xpath(results_xpath): for result in eval_xpath_list(dom, results_xpath):
videoid = result.xpath(url_xpath)[0] url_relative = eval_xpath_getindex(result, url_xpath, 0)
url = base_url + videoid url = base_url + url_relative
title = unescape(extract_text(result.xpath(title_xpath))) title = unescape(extract_text(eval_xpath(result, title_xpath)))
try: thumbnail = extract_text(eval_xpath(result, thumbnail_xpath))
thumbnail = extract_text(result.xpath(thumbnail_xpath)[0]) content = extract_text(eval_xpath(result, publishedDate_xpath)) + extract_text(
except: eval_xpath(result, content_xpath)
thumbnail = '' )
if thumbnail and thumbnail[0] == '/':
thumbnail = base_url + thumbnail
d = extract_text(result.xpath(publishedDate_xpath)[0])
d = d.split('/')
# force ISO date to avoid wrong parsing
d = "%s-%s-%s" % (d[2], d[1], d[0])
publishedDate = parser.parse(d)
content = extract_text(result.xpath(content_xpath))
# append result # append result
results.append( results.append(
@@ -79,7 +67,6 @@ def response(resp):
'title': title, 'title': title,
'content': content, 'content': content,
'template': 'videos.html', 'template': 'videos.html',
'publishedDate': publishedDate,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
} }
) )