Merge pull request #1 from asciimoo/master

Authored by Apply55gx on 2017-10-25 10:44:28 +02:00, committed by GitHub
commit d800e3fcfa
49 changed files with 754 additions and 1473 deletions

searx/engines/bing_images.py

@@ -18,7 +18,6 @@
from lxml import html
from json import loads
import re
from searx.engines.bing import _fetch_supported_languages, supported_languages_url
from searx.url_utils import urlencode
# engine dependent config
@@ -26,6 +25,8 @@ categories = ['images']
paging = True
safesearch = True
time_range_support = True
language_support = True
supported_languages_url = 'https://www.bing.com/account/general'
# search-url
base_url = 'https://www.bing.com/'
@@ -45,23 +46,41 @@ safesearch_types = {2: 'STRICT',
_quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
# get supported region code
def get_region_code(lang, lang_list=None):
    region = None
    if lang in (lang_list or supported_languages):
        region = lang
    elif lang.startswith('no'):
        region = 'nb-NO'
    else:
        # try to get a supported country code with language
        lang = lang.split('-')[0]
        for lc in (lang_list or supported_languages):
            if lang == lc.split('-')[0]:
                region = lc
                break
    if region:
        return region.lower()
    else:
        return 'en-us'
# do search-request
def request(query, params):
offset = (params['pageno'] - 1) * 10 + 1
# required for cookie
if params['language'] == 'all':
language = 'en-US'
else:
language = params['language']
search_path = search_string.format(
query=urlencode({'q': query}),
offset=offset)
language = get_region_code(params['language'])
params['cookies']['SRCHHPGUSR'] = \
'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\
'&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
params['cookies']['_EDGE_S'] = 'mkt=' + language +\
'&ui=' + language + '&F=1'
params['url'] = base_url + search_path
if params['time_range'] in time_range_dict:
@@ -106,3 +125,22 @@ def response(resp):
# return results
return results
# get supported languages from their site
def _fetch_supported_languages(resp):
    supported_languages = []
    dom = html.fromstring(resp.text)
    regions_xpath = '//div[@id="region-section-content"]' \
                    + '//ul[@class="b_vList"]/li/a/@href'
    regions = dom.xpath(regions_xpath)
    for region in regions:
        code = re.search('setmkt=[^\&]+', region).group()[7:]
        if code == 'nb-NO':
            code = 'no-NO'
        supported_languages.append(code)
    return supported_languages
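
The new get_region_code() resolves a UI language to a Bing market code in three steps: an exact match against the supported list, a special case mapping Norwegian to 'nb-NO', then a match on the bare language prefix, with 'en-us' as the final fallback. A minimal sketch of that behaviour (the explicit lang_list is hypothetical; the import path assumes this module, as the bing_videos change below does):

    from searx.engines.bing_images import get_region_code

    langs = ['en-US', 'de-DE', 'nb-NO']               # hypothetical supported list
    print(get_region_code('de-DE', lang_list=langs))  # exact match     -> 'de-de'
    print(get_region_code('no', lang_list=langs))     # Norwegian alias -> 'nb-no'
    print(get_region_code('en-GB', lang_list=langs))  # prefix match    -> 'en-us'
    print(get_region_code('xx', lang_list=langs))     # no match        -> 'en-us'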

searx/engines/bing_videos.py

@@ -12,6 +12,7 @@
from json import loads
from lxml import html
from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url, get_region_code
from searx.engines.xpath import extract_text
from searx.url_utils import urlencode
@@ -21,6 +22,7 @@ paging = True
safesearch = True
time_range_support = True
number_of_results = 10
language_support = True
search_url = 'https://www.bing.com/videos/asyncv2?{query}&async=content&'\
'first={offset}&count={number_of_results}&CW=1366&CH=25&FORM=R5VR5'
@@ -45,7 +47,8 @@ def request(query, params):
'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
# language cookie
params['cookies']['_EDGE_S'] = 'mkt=' + params['language'].lower() + '&F=1'
region = get_region_code(params['language'], lang_list=supported_languages)
params['cookies']['_EDGE_S'] = 'mkt=' + region + '&F=1'
# query and paging
params['url'] = search_url.format(query=urlencode({'q': query}),

searx/engines/blekko_images.py

@@ -1,70 +0,0 @@
"""
Blekko (Images)
@website https://blekko.com
@provide-api yes (unofficial)
@using-api yes
@results JSON
@stable yes
@parse url, title, img_src
"""
from json import loads
from searx.url_utils import urlencode
# engine dependent config
categories = ['images']
paging = True
safesearch = True
# search-url
base_url = 'https://blekko.com'
search_url = '/api/images?{query}&c={c}'
# safesearch definitions
safesearch_types = {2: '1',
1: '',
0: '0'}
# do search-request
def request(query, params):
    c = (params['pageno'] - 1) * 48
    params['url'] = base_url +\
        search_url.format(query=urlencode({'q': query}),
                          c=c)
    if params['pageno'] != 1:
        params['url'] += '&page={pageno}'.format(pageno=(params['pageno'] - 1))
    # let Blekko know we don't want profiling
    params['cookies']['tag_lesslogging'] = '1'
    # parse safesearch argument
    params['cookies']['safesearch'] = safesearch_types.get(params['safesearch'], '')
    return params

# get response from search-request
def response(resp):
    results = []
    search_results = loads(resp.text)
    # return empty array if there are no results
    if not search_results:
        return []
    for result in search_results:
        # append result
        results.append({'url': result['page_url'],
                        'title': result['title'],
                        'content': '',
                        'img_src': result['url'],
                        'template': 'images.html'})
    # return results
    return results

searx/engines/digg.py

@@ -10,6 +10,8 @@
@parse url, title, content, publishedDate, thumbnail
"""
import random
import string
from dateutil import parser
from json import loads
from lxml import html
@@ -30,12 +32,17 @@ title_xpath = './/h2//a//text()'
content_xpath = './/p//text()'
pubdate_xpath = './/time'
digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\
                    string.digits + "+_"

# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10
    params['url'] = search_url.format(position=offset,
                                      query=quote_plus(query))
    params['cookies']['frontend.auid'] = ''.join(random.choice(
        digg_cookie_chars) for _ in range(22))
    return params

searx/engines/duckduckgo.py

@@ -134,4 +134,4 @@ def _fetch_supported_languages(resp):
regions_json = loads(response_page)
supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
return supported_languages
return list(supported_languages)
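
The list() wrapper matters on Python 3, where map() returns a lazy iterator rather than a list, so the supported-languages result could otherwise only be consumed once. A quick illustration, assuming a payload shaped the way the lambda above expects ('us-en' style keys):

    regions_json = {'us-en': 0, 'de-de': 0}  # hypothetical payload
    langs = map(lambda x: x[3:] + '-' + x[:2].upper(), regions_json.keys())
    print(langs)        # Python 3: <map object ...>, not a list
    print(list(langs))  # ['en-US', 'de-DE']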

searx/engines/faroo.py

@@ -4,7 +4,7 @@
@website http://www.faroo.com
@provide-api yes (http://www.faroo.com/hp/api/api.html), require API-key
@using-api yes
@using-api no
@results JSON
@stable yes
@parse url, title, content, publishedDate, img_src
@@ -20,18 +20,16 @@ categories = ['general', 'news']
paging = True
language_support = True
number_of_results = 10
api_key = None
# search-url
url = 'http://www.faroo.com/'
search_url = url + 'api?{query}'\
'&start={offset}'\
'&length={number_of_results}'\
'&l={language}'\
'&src={categorie}'\
'&i=false'\
'&f=json'\
'&key={api_key}' # noqa
search_url = url + 'instant.json?{query}'\
'&start={offset}'\
'&length={number_of_results}'\
'&l={language}'\
'&src={categorie}'\
'&i=false'\
'&c=false'
search_category = {'general': 'web',
'news': 'news'}
@@ -57,21 +55,15 @@ def request(query, params):
number_of_results=number_of_results,
query=urlencode({'q': query}),
language=language,
categorie=categorie,
api_key=api_key)
categorie=categorie)
# using searx User-Agent
params['headers']['User-Agent'] = searx_useragent()
params['headers']['Referer'] = url
return params
# get response from search-request
def response(resp):
# HTTP-Code 401: api-key is not valid
if resp.status_code == 401:
raise Exception("API key is not valid")
# HTTP-Code 429: rate limit exceeded
if resp.status_code == 429:
raise Exception("rate limit has been exceeded!")
@@ -86,31 +78,19 @@ def response(resp):
# parse results
for result in search_res['results']:
publishedDate = None
result_json = {'url': result['url'], 'title': result['title'],
'content': result['kwic']}
if result['news']:
# timestamp (milliseconds since 1970)
publishedDate = datetime.datetime.fromtimestamp(result['date'] / 1000.0) # noqa
# append news result
results.append({'url': result['url'],
'title': result['title'],
'publishedDate': publishedDate,
'content': result['kwic']})
else:
# append general result
# TODO, publishedDate correct?
results.append({'url': result['url'],
'title': result['title'],
'content': result['kwic']})
result_json['publishedDate'] = \
datetime.datetime.fromtimestamp(result['date'] / 1000.0)
# append image result if image url is set
# TODO, show results with an image like in faroo
if result['iurl']:
results.append({'template': 'images.html',
'url': result['url'],
'title': result['title'],
'content': result['kwic'],
'img_src': result['iurl']})
result_json['template'] = 'videos.html'
result_json['thumbnail'] = result['iurl']
results.append(result_json)
# return results
return results
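
Faroo reports result dates as milliseconds since the Unix epoch, hence the division by 1000.0 before the value reaches fromtimestamp(). A small sketch with a hypothetical timestamp (the rendered time depends on the local timezone):

    import datetime

    date_ms = 1448928000000  # hypothetical value of result['date']
    print(datetime.datetime.fromtimestamp(date_ms / 1000.0))  # ~2015-12-01, timezone-dependent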

searx/engines/generalfile.py

@@ -1,62 +0,0 @@
"""
General Files (Files)
@website http://www.general-files.org
@provide-api no (nothing found)
@using-api no (because nothing found)
@results HTML (using search portal)
@stable no (HTML can change)
@parse url, title, content
@todo detect torrents?
"""
from lxml import html
# engine dependent config
categories = ['files']
paging = True
# search-url
base_url = 'http://www.general-file.com'
search_url = base_url + '/files-{letter}/{query}/{pageno}'
# specific xpath variables
result_xpath = '//table[@class="block-file"]'
title_xpath = './/h2/a//text()'
url_xpath = './/h2/a/@href'
content_xpath = './/p//text()'
# do search-request
def request(query, params):
    params['url'] = search_url.format(query=query,
                                      letter=query[0],
                                      pageno=params['pageno'])
    return params

# get response from search-request
def response(resp):
    results = []
    dom = html.fromstring(resp.text)
    # parse results
    for result in dom.xpath(result_xpath):
        url = result.xpath(url_xpath)[0]
        # skip fast download links
        if not url.startswith('/'):
            continue
        # append result
        results.append({'url': base_url + url,
                        'title': ''.join(result.xpath(title_xpath)),
                        'content': ''.join(result.xpath(content_xpath))})
    # return results
    return results

searx/engines/gigablast.py

@@ -10,6 +10,7 @@
@parse url, title, content
"""
import random
from json import loads
from time import time
from lxml.html import fromstring
@@ -32,7 +33,8 @@ search_string = 'search?{query}'\
'&qh=0'\
'&qlang={lang}'\
'&ff={safesearch}'\
'&rxikd={rxikd}' # random number - 9 digits
'&rxieu={rxieu}'\
'&rand={rxikd}' # current unix timestamp
# specific xpath variables
results_xpath = '//response//result'
@@ -59,10 +61,12 @@ def request(query, params):
else:
safesearch = 0
# rxieu is some kind of hash of the search query, but a random value is accepted at the moment
search_path = search_string.format(query=urlencode({'q': query}),
offset=offset,
number_of_results=number_of_results,
rxikd=str(time())[:9],
rxikd=int(time() * 1000),
rxieu=random.randint(1000000000, 9999999999),
lang=language,
safesearch=safesearch)
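
The two new parameters replace the old nine-digit rxikd value: rand carries the current Unix time in milliseconds and rxieu a random ten-digit number (per the comment above, presumably a query hash server-side, though a random value is accepted). A sketch of how they are produced:

    import random
    from time import time

    rxikd = int(time() * 1000)                      # current unix timestamp in ms
    rxieu = random.randint(1000000000, 9999999999)  # random ten-digit value
    print('&rand={}&rxieu={}'.format(rxikd, rxieu))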

searx/engines/google_news.py

@@ -67,8 +67,8 @@ def response(resp):
for result in dom.xpath('//div[@class="g"]|//div[@class="g _cy"]'):
try:
r = {
'url': result.xpath('.//div[@class="_cnc"]//a/@href')[0],
'title': ''.join(result.xpath('.//div[@class="_cnc"]//h3//text()')),
'url': result.xpath('.//a[@class="l _PMs"]')[0].attrib.get("href"),
'title': ''.join(result.xpath('.//a[@class="l _PMs"]//text()')),
'content': ''.join(result.xpath('.//div[@class="st"]//text()')),
}
except:

searx/engines/nyaa.py

@@ -1,7 +1,7 @@
"""
Nyaa.se (Anime Bittorrent tracker)
Nyaa.si (Anime Bittorrent tracker)
@website http://www.nyaa.se/
@website http://www.nyaa.si/
@provide-api no
@using-api no
@results HTML
@@ -12,50 +12,25 @@
from lxml import html
from searx.engines.xpath import extract_text
from searx.url_utils import urlencode
from searx.utils import get_torrent_size, int_or_zero
# engine dependent config
categories = ['files', 'images', 'videos', 'music']
paging = True
# search-url
base_url = 'http://www.nyaa.se/'
base_url = 'http://www.nyaa.si/'
search_url = base_url + '?page=search&{query}&offset={offset}'
# xpath queries
xpath_results = '//table[@class="tlist"]//tr[contains(@class, "tlistrow")]'
xpath_category = './/td[@class="tlisticon"]/a'
xpath_title = './/td[@class="tlistname"]/a'
xpath_torrent_file = './/td[@class="tlistdownload"]/a'
xpath_filesize = './/td[@class="tlistsize"]/text()'
xpath_seeds = './/td[@class="tlistsn"]/text()'
xpath_leeches = './/td[@class="tlistln"]/text()'
xpath_downloads = './/td[@class="tlistdn"]/text()'
# convert a variable to integer or return 0 if it's not a number
def int_or_zero(num):
    if isinstance(num, list):
        if len(num) < 1:
            return 0
        num = num[0]
    if num.isdigit():
        return int(num)
    return 0

# get multiplier to convert torrent size to bytes
def get_filesize_mul(suffix):
    return {
        'KB': 1024,
        'MB': 1024 ** 2,
        'GB': 1024 ** 3,
        'TB': 1024 ** 4,
        'KIB': 1024,
        'MIB': 1024 ** 2,
        'GIB': 1024 ** 3,
        'TIB': 1024 ** 4
    }[str(suffix).upper()]
xpath_results = '//table[contains(@class, "torrent-list")]//tr[not(th)]'
xpath_category = './/td[1]/a[1]'
xpath_title = './/td[2]/a[last()]'
xpath_torrent_links = './/td[3]/a'
xpath_filesize = './/td[4]/text()'
xpath_seeds = './/td[6]/text()'
xpath_leeches = './/td[7]/text()'
xpath_downloads = './/td[8]/text()'
# do search-request
@@ -72,25 +47,32 @@ def response(resp):
dom = html.fromstring(resp.text)
for result in dom.xpath(xpath_results):
# defaults
filesize = 0
magnet_link = ""
torrent_link = ""
# category in which our torrent belongs
category = result.xpath(xpath_category)[0].attrib.get('title')
try:
category = result.xpath(xpath_category)[0].attrib.get('title')
except:
pass
# torrent title
page_a = result.xpath(xpath_title)[0]
title = extract_text(page_a)
# link to the page
href = page_a.attrib.get('href')
href = base_url + page_a.attrib.get('href')
# link to the torrent file
torrent_link = result.xpath(xpath_torrent_file)[0].attrib.get('href')
# torrent size
try:
file_size, suffix = result.xpath(xpath_filesize)[0].split(' ')
file_size = int(float(file_size) * get_filesize_mul(suffix))
except:
file_size = None
for link in result.xpath(xpath_torrent_links):
url = link.attrib.get('href')
if 'magnet' in url:
# link to the magnet
magnet_link = url
else:
# link to the torrent file
torrent_link = url
# seed count
seed = int_or_zero(result.xpath(xpath_seeds))
@@ -101,6 +83,14 @@ def response(resp):
# torrent downloads count
downloads = int_or_zero(result.xpath(xpath_downloads))
# let's try to calculate the torrent size
try:
filesize_info = result.xpath(xpath_filesize)[0]
filesize, filesize_multiplier = filesize_info.split()
filesize = get_torrent_size(filesize, filesize_multiplier)
except:
pass
# content string contains all information not included into template
content = 'Category: "{category}". Downloaded {downloads} times.'
content = content.format(category=category, downloads=downloads)
@@ -110,8 +100,9 @@ def response(resp):
'content': content,
'seed': seed,
'leech': leech,
'filesize': file_size,
'filesize': filesize,
'torrentfile': torrent_link,
'magnetlink': magnet_link,
'template': 'torrent.html'})
return results
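
Both helpers now come from searx.utils: int_or_zero() keeps the semantics of the removed local version (first element of a list, 0 for anything non-numeric), and get_torrent_size() folds the old get_filesize_mul() lookup into one call returning a byte count. A rough usage sketch, assuming the 1024-based multipliers shown in the removed helper:

    from searx.utils import get_torrent_size, int_or_zero

    print(int_or_zero(['42']))            # -> 42
    print(int_or_zero([]))                # -> 0
    print(get_torrent_size('1.2', 'GB'))  # -> roughly 1.2 * 1024 ** 3 bytes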

searx/engines/qwant.py

@@ -118,7 +118,7 @@ def _fetch_supported_languages(resp):
dom = fromstring(resp.text)
options = dom.xpath('//div[@id="regions-popup"]//ul/li/a')
for option in options:
code = option.xpath('./@data-val')[0]
code = option.xpath('./@data-search-language')[0]
if code.startswith('nb-'):
code = code.replace('nb', 'no', 1)
supported_languages.append(code)

searx/engines/tokyotoshokan.py

@@ -14,8 +14,8 @@ import re
from lxml import html
from searx.engines.xpath import extract_text
from datetime import datetime
from searx.engines.nyaa import int_or_zero, get_filesize_mul
from searx.url_utils import urlencode
from searx.utils import get_torrent_size, int_or_zero
# engine dependent config
categories = ['files', 'videos', 'music']
@@ -76,8 +76,7 @@ def response(resp):
try:
# ('1.228', 'GB')
groups = size_re.match(item).groups()
multiplier = get_filesize_mul(groups[1])
params['filesize'] = int(multiplier * float(groups[0]))
params['filesize'] = get_torrent_size(groups[0], groups[1])
except:
pass
elif item.startswith('Date:'):

searx/engines/torrentz.py

@@ -1,7 +1,7 @@
"""
Torrentz.eu (BitTorrent meta-search engine)
Torrentz2.eu (BitTorrent meta-search engine)
@website https://torrentz.eu/
@website https://torrentz2.eu/
@provide-api no
@using-api no
@@ -14,24 +14,24 @@
import re
from lxml import html
from datetime import datetime
from searx.engines.nyaa import int_or_zero, get_filesize_mul
from searx.engines.xpath import extract_text
from searx.url_utils import urlencode
from searx.utils import get_torrent_size
# engine dependent config
categories = ['files', 'videos', 'music']
paging = True
# search-url
# https://torrentz.eu/search?f=EXAMPLE&p=6
base_url = 'https://torrentz.eu/'
# https://torrentz2.eu/search?f=EXAMPLE&p=6
base_url = 'https://torrentz2.eu/'
search_url = base_url + 'search?{query}'
# do search-request
def request(query, params):
page = params['pageno'] - 1
query = urlencode({'q': query, 'p': page})
query = urlencode({'f': query, 'p': page})
params['url'] = search_url.format(query=query)
return params
@@ -54,22 +54,29 @@ def response(resp):
# extract url and remove a slash in the beginning
link = links[0].attrib.get('href').lstrip('/')
seed = result.xpath('./dd/span[@class="u"]/text()')[0].replace(',', '')
leech = result.xpath('./dd/span[@class="d"]/text()')[0].replace(',', '')
seed = 0
leech = 0
try:
seed = int(result.xpath('./dd/span[4]/text()')[0].replace(',', ''))
leech = int(result.xpath('./dd/span[5]/text()')[0].replace(',', ''))
except:
pass
params = {
'url': base_url + link,
'title': title,
'seed': int_or_zero(seed),
'leech': int_or_zero(leech),
'seed': seed,
'leech': leech,
'template': 'torrent.html'
}
# let's try to calculate the torrent size
try:
size_str = result.xpath('./dd/span[@class="s"]/text()')[0]
size, suffix = size_str.split()
params['filesize'] = int(size) * get_filesize_mul(suffix)
filesize_info = result.xpath('./dd/span[3]/text()')[0]
filesize, filesize_multiplier = filesize_info.split()
filesize = get_torrent_size(filesize, filesize_multiplier)
params['filesize'] = filesize
except:
pass
@@ -80,9 +87,8 @@ def response(resp):
# extract and convert creation date
try:
date_str = result.xpath('./dd/span[@class="a"]/span')[0].attrib.get('title')
# Fri, 25 Mar 2016 16:29:01
date = datetime.strptime(date_str, '%a, %d %b %Y %H:%M:%S')
date_ts = result.xpath('./dd/span[2]')[0].attrib.get('title')
date = datetime.fromtimestamp(float(date_ts))
params['publishedDate'] = date
except:
pass
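
On torrentz2 the date cell's title attribute carries a raw Unix timestamp instead of the old 'Fri, 25 Mar 2016 16:29:01' string, so strptime() gives way to fromtimestamp(). A minimal sketch with a hypothetical attribute value:

    from datetime import datetime

    date_ts = '1458923341'  # hypothetical span[2]/@title value
    print(datetime.fromtimestamp(float(date_ts)))  # local datetime for that epoch second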