From 1ceacf5fe9227cedacba13e72773a98b6a9d6267 Mon Sep 17 00:00:00 2001 From: Denis Shaposhnikov <993498+dsh2dsh@users.noreply.github.com> Date: Sat, 21 May 2022 18:33:44 +0200 Subject: [PATCH] bing: extract original url instead of url tracked by bing Bing returns URLs like ``` https://www.bing.com/ck/a?!&&p=7b6f95ee4bc34febe56210eec479fa7a84a991257e9773fda5e753ff482f9068JmltdHM9MTY1MzE1MDgzNSZpZ3VpZD0yYTZkNWQ4Yi05MDcwLTRkOGEtYWRmNi1jNWI2M2Y1NjJlOGQmaW5zaWQ9NTE1NA&ptn=3&fclid=ce60cbc5-d923-11ec-b22d-0e153102d4e8&u=a1aHR0cHM6Ly9kb2NzLnNlYXJ4bmcub3JnLw&ntb=1 ``` for tracking clicks. Looking at the HTML source, I found that Bing stores the original URL in the "cite" element. Let's use that instead. --- searx/engines/bing.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 4c037de85..7fe3991e0 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -96,7 +96,10 @@ def response(resp): for result in eval_xpath(dom, '//li[@class="b_algo"]'): link = eval_xpath(result, './/h2/a')[0] - url = link.attrib.get('href') + # url = link.attrib.get('href') + # href attr is encoded by bing and directs back to bing for tracking + # instead of original. Lets extract original URL. + url = extract_text(eval_xpath(result, './/div[@class="b_attribution"]/cite')) title = extract_text(link) content = extract_text(eval_xpath(result, './/p'))