mirror of https://github.com/searxng/searxng.git
[enh] better url comparison
This commit is contained in:
parent
34941aca3f
commit
70cbc09e93
|
@ -22,6 +22,7 @@ from imp import load_source
|
||||||
import grequests
|
import grequests
|
||||||
from itertools import izip_longest, chain
|
from itertools import izip_longest, chain
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
|
from urlparse import urlparse
|
||||||
|
|
||||||
engine_dir = dirname(realpath(__file__))
|
engine_dir = dirname(realpath(__file__))
|
||||||
|
|
||||||
|
@ -87,16 +88,23 @@ def search(query, request, selected_engines):
|
||||||
results = []
|
results = []
|
||||||
# deduplication + scoring
|
# deduplication + scoring
|
||||||
for i,res in enumerate(flat_res):
|
for i,res in enumerate(flat_res):
|
||||||
|
res['parsed_url'] = urlparse(res['url'])
|
||||||
score = flat_len - i
|
score = flat_len - i
|
||||||
duplicated = False
|
duplicated = False
|
||||||
for new_res in results:
|
for new_res in results:
|
||||||
if res['url'] == new_res['url']:
|
if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
|
||||||
|
res['parsed_url'].path == new_res['parsed_url'].path:
|
||||||
duplicated = new_res
|
duplicated = new_res
|
||||||
break
|
break
|
||||||
if duplicated:
|
if duplicated:
|
||||||
if len(res.get('content', '')) > len(duplicated.get('content', '')):
|
if len(res.get('content', '')) > len(duplicated.get('content', '')):
|
||||||
duplicated['content'] = res['content']
|
duplicated['content'] = res['content']
|
||||||
duplicated['score'] += score
|
duplicated['score'] += score
|
||||||
|
if duplicated['parsed_url'].scheme == 'https':
|
||||||
|
continue
|
||||||
|
elif res['parsed_url'].scheme == 'https':
|
||||||
|
duplicated['parsed_url'].scheme == 'https'
|
||||||
|
duplicated['url'] = duplicated['parsed_url'].geturl()
|
||||||
else:
|
else:
|
||||||
res['score'] = score
|
res['score'] = score
|
||||||
results.append(res)
|
results.append(res)
|
||||||
|
|
Loading…
Reference in New Issue