diff --git a/searx/webapp.py b/searx/webapp.py index 774f0f3bb..a6fabe9b1 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -18,6 +18,7 @@ import markdown import re import datetime from textrank4zh import TextRank4Keyword, TextRank4Sentence +from collections import defaultdict from timeit import default_timer from html import escape @@ -701,6 +702,8 @@ def search(): try: search_query, raw_text_query, _, _ = get_search_query_from_webapp(request.preferences, request.form) # search = Search(search_query) # without plugins + if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.filter(search_query.query): + return index_error(output_format, 'No query'), 400 try: original_search_query = search_query.query if "模仿" in search_query.query or "扮演" in search_query.query or "你能" in search_query.query or "请推荐" in search_query.query or "帮我" in search_query.query or "写一段" in search_query.query or "写一个" in search_query.query or "请问" in search_query.query or "请给" in search_query.query or "请你" in search_query.query or "请推荐" in search_query.query or "是谁" in search_query.query or "能帮忙" in search_query.query or "介绍一下" in search_query.query or "为什么" in search_query.query or "什么是" in search_query.query or "有什么" in search_query.query or "怎样" in search_query.query or "给我" in search_query.query or "如何" in search_query.query or "谁是" in search_query.query or "查询" in search_query.query or "告诉我" in search_query.query or "查一下" in search_query.query or "找一个" in search_query.query or "什么样" in search_query.query or "哪个" in search_query.query or "哪些" in search_query.query or "哪一个" in search_query.query or "哪一些" in search_query.query or "啥是" in search_query.query or "为啥" in search_query.query or "怎么" in search_query.query: @@ -713,7 +716,7 @@ def search(): search_type = '任务' net_search = False net_search_str = 'false' - elif len(original_query)>10: + elif len(original_search_query)>10: prompt = "任务:写诗 写故事 写代码 写论文摘要 模仿推特用户 生成搜索广告 回答问题 聊天话题 搜索网页 搜索视频 搜索地图 搜索新闻 查看食谱 搜索商品 写歌词 写论文 模仿名人 翻译语言 摘要文章 讲笑话 
class DFAFilter():
    """Detect blocked keywords in text using a character trie (DFA).

    Keywords are stored in ``keyword_chains`` as nested dicts keyed by single
    characters; a node that completes a keyword additionally carries the
    ``delimit`` key.  Matching is case-insensitive because both keywords and
    messages are lower-cased before use.
    """

    def __init__(self):
        # Root of the trie: {char: {char: {..., delimit: 0}}}
        self.keyword_chains = {}
        # Reserved key marking "a keyword ends at this node".
        self.delimit = '\x00'

    @staticmethod
    def _normalize(text):
        """Return *text* as lower-cased ``str``, decoding UTF-8 bytes if needed."""
        # The previous implementation tested against the Python 2 ``unicode``
        # builtin, which raises NameError on Python 3.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        return text.lower()

    def add(self, keyword):
        """Insert *keyword* (``str`` or UTF-8 ``bytes``) into the trie.

        Surrounding whitespace is ignored and blank keywords are skipped.
        """
        chars = self._normalize(keyword).strip()
        # The terminator character is reserved for the trie's own bookkeeping,
        # so it can never be part of a stored keyword.
        chars = chars.replace(self.delimit, '')
        if not chars:
            return
        level = self.keyword_chains
        for char in chars:
            level = level.setdefault(char, {})
        level[self.delimit] = 0

    def parse(self, path):
        """Load keywords from the file at *path*, one keyword per line.

        The file is read as UTF-8 regardless of the platform's default
        encoding.
        """
        with open(path, encoding='utf-8') as keyword_file:
            for keyword in keyword_file:
                self.add(keyword)

    def filter(self, message, repl="*"):
        """Return ``True`` if *message* contains any stored keyword.

        :param message: ``str`` or UTF-8 ``bytes`` to inspect; ``None`` or an
            empty value never matches.
        :param repl: unused; kept so existing callers that pass it still work.
        :returns: ``True`` on the first keyword found, otherwise ``False``.
        """
        if not message:
            return False
        message = self._normalize(message)
        length = len(message)
        root = self.keyword_chains
        delimit = self.delimit
        for start in range(length):
            level = root
            # Walk by index instead of slicing message[start:] so that no
            # copy of the remaining text is made for every start offset.
            for pos in range(start, length):
                level = level.get(message[pos])
                if level is None:
                    break
                if delimit in level:
                    return True
        return False


# Module-level filter instance shared by the request handlers.
gfw = DFAFilter()