searxng/searx/engines/sogou.py
2024-09-28 17:34:28 +03:00

72 lines
2.3 KiB
Python

from urllib.parse import urlencode
from lxml import html
from searx.utils import extract_text
# Endpoint that request() appends the encoded query string to.
base_url = 'https://www.sogou.com/web'

# request() forwards the requested page number, so paging is enabled.
paging = True

# Engine metadata: the upstream site and the format of the scraped results.
about = dict(
    website='https://www.sogou.com/',
    results='HTML',
)
def request(query, params):
    """Fill ``params`` with the URL and headers for one Sogou web-search page.

    Args:
        query: Search terms; URL-encoded into the ``query`` parameter.
        params: Request settings dict. ``pageno`` (default 1) selects the
            result page. ``url`` is set, and ``headers`` is created or
            extended in place.

    Returns:
        The same ``params`` dict that was passed in.
    """
    query_params = {
        'query': query,
        'page': params.get('pageno', 1),
    }
    params['url'] = f'{base_url}?{urlencode(query_params)}'

    # Headers describe a request issued from www.sogou.com itself. The body is
    # parsed as HTML by response(), so HTML is what we ask for; no Content-Type
    # is sent because the URL above carries the whole query and no body is set.
    headers = {
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'Origin': 'https://www.sogou.com',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/77.0.3865.90 Safari/537.36'
        ),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Referer': 'https://www.sogou.com/',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
    }

    # Extend an existing header dict in place; a missing or None entry is
    # replaced outright instead of failing on ``None.update``.
    existing = params.get('headers')
    if existing is None:
        params['headers'] = headers
    else:
        existing.update(headers)
    return params
def response(resp):
    """Parse a Sogou result page into a list of result dicts.

    Each ``div`` whose class attribute is exactly ``vrwrap`` is treated as one
    hit. A hit is kept only when both a target URL and a non-empty title can
    be extracted; a missing snippet becomes an empty string.

    Args:
        resp: Response object exposing the page markup as ``resp.text``.

    Returns:
        A list of dicts with the string-valued keys ``url``, ``title`` and
        ``content``. Empty when the body is blank or nothing matched.
    """
    results = []

    markup = resp.text
    # lxml raises on an empty document, so treat a blank body as "no results".
    if not markup or not markup.strip():
        return results

    dom = html.fromstring(markup)
    for result in dom.xpath('//div[@class="vrwrap"]'):
        # The target URL is the data-url attribute of the first link container.
        # ``.get`` yields None when the attribute is absent instead of raising.
        link_divs = result.xpath(
            './/div[contains(@class, "r-sech") and '
            '(contains(@class, "click-better-sugg") or contains(@class, "result_list"))]'
        )
        link = link_divs[0].get('data-url') if link_divs else None

        # Prefer the dedicated title heading, fall back to any h3 link in a div.
        title_elem = result.xpath('.//h3[@class="vr-title"]/a') or result.xpath('.//div/h3/a')
        title = title_elem[0].text_content().strip() if title_elem else ''

        # The snippet is optional; use '' rather than a placeholder string.
        content_elem = result.xpath('.//div[@class="fz-mid space-txt"]')
        content = content_elem[0].text_content().strip() if content_elem else ''

        # Skip hits missing either field so no result carries a None value.
        if not link or not title:
            continue

        results.append({
            'url': link,
            'title': title,
            'content': content,
        })
    return results