This commit is contained in:
Joseph Cheung 2023-03-01 23:27:35 +08:00
parent 8f16390022
commit a81e99ecf4
2 changed files with 14675 additions and 3 deletions

14600
searx/keywords Normal file

File diff suppressed because it is too large Load diff

View file

@ -18,6 +18,7 @@ import markdown
import re
import datetime
from textrank4zh import TextRank4Keyword, TextRank4Sentence
from collections import defaultdict
from timeit import default_timer
from html import escape
@ -701,6 +702,8 @@ def search():
try:
search_query, raw_text_query, _, _ = get_search_query_from_webapp(request.preferences, request.form)
# search = Search(search_query) # without plugins
if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.filter(search_query.query):
return index_error(output_format, 'No query'), 400
try:
original_search_query = search_query.query
if "模仿" in search_query.query or "扮演" in search_query.query or "你能" in search_query.query or "请推荐" in search_query.query or "帮我" in search_query.query or "写一段" in search_query.query or "写一个" in search_query.query or "请问" in search_query.query or "请给" in search_query.query or "请你" in search_query.query or "请推荐" in search_query.query or "是谁" in search_query.query or "能帮忙" in search_query.query or "介绍一下" in search_query.query or "为什么" in search_query.query or "什么是" in search_query.query or "有什么" in search_query.query or "怎样" in search_query.query or "给我" in search_query.query or "如何" in search_query.query or "谁是" in search_query.query or "查询" in search_query.query or "告诉我" in search_query.query or "查一下" in search_query.query or "找一个" in search_query.query or "什么样" in search_query.query or "哪个" in search_query.query or "哪些" in search_query.query or "哪一个" in search_query.query or "哪一些" in search_query.query or "啥是" in search_query.query or "为啥" in search_query.query or "怎么" in search_query.query:
@ -713,7 +716,7 @@ def search():
search_type = '任务'
net_search = False
net_search_str = 'false'
elif len(original_query)>10:
elif len(original_search_query)>10:
prompt = "任务:写诗 写故事 写代码 写论文摘要 模仿推特用户 生成搜索广告 回答问题 聊天话题 搜索网页 搜索视频 搜索地图 搜索新闻 查看食谱 搜索商品 写歌词 写论文 模仿名人 翻译语言 摘要文章 讲笑话 做数学题 搜索图片 播放音乐 查看天气\n1.判断是以上任务的哪一个2.判断是否需要联网回答3.给出搜索关键词\n"
prompt = prompt + "提问:" + search_query.query + '答案用json数组例如["写诗","","详细关键词"]来表述\n答案:'
acts = ['写诗', '写故事', '写代码', '写论文摘要', '模仿推特用户', '生成搜索广告', '回答问题', '聊天话题', '搜索网页', '搜索视频', '搜索地图', '搜索新闻', '查看食谱', '搜索商品', '写歌词', '写论文', '模仿名人', '翻译语言', '摘要文章', '讲笑话', '做数学题', '搜索图片', '播放音乐', '查看天气']
@ -739,6 +742,7 @@ def search():
"logprobs": 0,
"stream": False
}
gpt_json={}
if prompt and prompt !='' :
gpt_response = requests.post(gpt_url, headers=gpt_headers, data=json.dumps(gpt_data))
gpt_json = gpt_response.json()
@ -795,9 +799,14 @@ def search():
url_proxy = {}
prompt = ""
for res in results:
results.remove(res)
if 'url' not in res: continue
if 'content' not in res: continue
if 'title' not in res: continue
if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.filter(res['title']):
if 'content' not in res: continue
if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.filter(res['content']):
return index_error(output_format, 'No query'), 400
if res['content'] == '': continue
new_url = 'https://url'+str(len(url_pair))
url_pair.append(res['url'])
@ -2978,8 +2987,71 @@ if not werkzeug_reloader or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_
search_initialize(enable_checker=True, check_network=True, enable_metrics=settings['general']['enable_metrics'])
class DFAFilter():
def __init__(self):
self.keyword_chains = {}
self.delimit = '\x00'
def add(self, keyword):
if not isinstance(keyword, unicode):
keyword = keyword.decode('utf-8')
keyword = keyword.lower()
chars = keyword.strip()
if not chars:
return
level = self.keyword_chains
for i in range(len(chars)):
if chars[i] in level:
level = level[chars[i]]
else:
if not isinstance(level, dict):
break
for j in range(i, len(chars)):
level[chars[j]] = {}
last_level, last_char = level, chars[j]
level = level[chars[j]]
last_level[last_char] = {self.delimit: 0}
break
if i == len(chars) - 1:
level[self.delimit] = 0
def parse(self, path):
with open(path) as f:
for keyword in f:
self.add(keyword.strip())
def filter(self, message, repl="*"):
if not isinstance(message, unicode):
message = message.decode('utf-8')
message = message.lower()
ret = []
start = 0
while start < len(message):
level = self.keyword_chains
step_ins = 0
for char in message[start:]:
if char in level:
step_ins += 1
if self.delimit not in level[char]:
level = level[char]
else:
return True
ret.append(repl * step_ins)
start += step_ins - 1
break
else:
ret.append(message[start])
break
else:
ret.append(message[start])
start += 1
return False
gfw = DFAFilter()
def run():
logger.debug('starting webserver on %s:%s', settings['server']['bind_address'], settings['server']['port'])
gfw.parse("keywords")
app.run(
debug=searx_debug,
use_debugger=searx_debug,