mirror of
https://github.com/searxng/searxng
synced 2024-01-01 19:24:07 +01:00
c
This commit is contained in:
parent
4216238718
commit
a1049908d0
2 changed files with 287 additions and 12 deletions
151
searx/webapp.py
151
searx/webapp.py
File diff suppressed because one or more lines are too long
148
searx/webapp1.py
148
searx/webapp1.py
|
@ -702,7 +702,7 @@ def search():
|
|||
try:
|
||||
search_query, raw_text_query, _, _ = get_search_query_from_webapp(request.preferences, request.form)
|
||||
# search = Search(search_query) # without plugins
|
||||
if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.filter(search_query.query):
|
||||
if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.exists(search_query.query):
|
||||
return index_error(output_format, 'No query'), 400
|
||||
try:
|
||||
original_search_query = search_query.query
|
||||
|
@ -801,10 +801,10 @@ def search():
|
|||
for res in results:
|
||||
if 'url' not in res: continue
|
||||
if 'title' not in res: continue
|
||||
if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.filter(res['title']):
|
||||
if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.exists(res['title']):
|
||||
return index_error(output_format, 'No query'), 400
|
||||
if 'content' not in res: continue
|
||||
if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.filter(res['content']):
|
||||
if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.exists(res['content']):
|
||||
return index_error(output_format, 'No query'), 400
|
||||
|
||||
if res['content'] == '': continue
|
||||
|
@ -2987,7 +2987,145 @@ if not werkzeug_reloader or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_
|
|||
search_initialize(enable_checker=True, check_network=True, enable_metrics=settings['general']['enable_metrics'])
|
||||
|
||||
|
||||
class DFAFilter():
|
||||
class DFA:
    """Sensitive-word filter backed by a character trie (a simple DFA).

    Banned words are loaded from a text file (one word per line) into
    ``ban_words_dict``, a nested dict keyed by single characters.  Every node
    additionally carries the boolean key ``'is_end'`` marking that the path
    from the root to that node spells a complete banned word.

    Attributes:
        path: Path of the word-list file currently loaded.
        ban_words_set: Set of all loaded words (used for de-duplication).
        ban_words_list: Loaded words in insertion order.
        ban_words_dict: Root node of the trie.
    """

    def __init__(self, path: str = None):
        """Create the filter and immediately load the word list.

        Args:
            path: Word-list file to read.  When falsy, ``'Data/Danger.form'``
                (relative to the current working directory) is used.

        Raises:
            OSError: If the word-list file cannot be opened.
        """
        self.ban_words_set = set()
        self.ban_words_list = []
        self.ban_words_dict = {}
        self.path = path if path else 'Data/Danger.form'
        self.get_words()

    # Load the sensitive-word list from ``self.path``.
    def get_words(self):
        """Read ``self.path`` and (re)build the trie from its lines.

        Blank lines and duplicates are skipped.  For every word, the variant
        returned by ``pycorrector.simplified2traditional`` (presumably the
        Traditional Chinese spelling -- confirm against pycorrector) is
        registered as well when it differs from the word itself.
        """
        with open(self.path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                # Text-mode iteration already normalises line endings, so a
                # plain strip() removes the newline and surrounding blanks.
                word = line.strip()
                if not word or word in self.ban_words_set:
                    continue
                self.ban_words_set.add(word)
                self.ban_words_list.append(word)
                variant = pycorrector.simplified2traditional(word)
                if variant != word and variant not in self.ban_words_set:
                    self.ban_words_set.add(variant)
                    self.ban_words_list.append(variant)
        self.add_hash_dict(self.ban_words_list)

    def change_words(self, path):
        """Discard all loaded words and load the word list at ``path``."""
        self.ban_words_list.clear()
        self.ban_words_dict.clear()
        self.ban_words_set.clear()
        self.path = path
        self.get_words()

    # Convert the sensitive-word list into the DFA dictionary.
    def add_hash_dict(self, new_list):
        """Insert every word of ``new_list`` into the trie."""
        for word in new_list:
            self.add_new_word(word)

    # Add a single sensitive word.
    def add_new_word(self, new_word):
        """Insert one word into the trie; an empty word is ignored."""
        node = self.ban_words_dict
        text = str(new_word)
        if not text:
            return
        for ch in text:
            node = node.setdefault(ch, {'is_end': False})
        node['is_end'] = True

    def _match_end(self, text, start):
        """Return the index of the last character of the shortest banned word
        that begins exactly at ``text[start]``, or -1 when there is none."""
        if start < 0:
            return -1
        node = self.ban_words_dict
        for idx in range(start, len(text)):
            node = node.get(text[idx])
            if node is None:
                return -1
            if node['is_end']:
                return idx
        # Ran off the end of the text in the middle of a word: no match here.
        return -1

    # Find the position of the first occurrence of a sensitive word.
    def find_illegal(self, _str):
        """Return the start index of the left-most banned word in ``_str``.

        Every start position is tried in turn, so a partial match that runs
        off the end of the text no longer hides a shorter banned word that
        begins later.

        Returns:
            The index of the first character of the match, or -1.
        """
        for start in range(len(_str)):
            if self._match_end(_str, start) != -1:
                return start
        return -1

    # Check whether a sensitive word is present.
    def exists(self, sentence):
        """Return True when ``sentence`` contains a banned word.

        The sentence is checked verbatim and, if that finds nothing, again
        with all non-word characters and underscores removed so that words
        split by punctuation are still detected.  Empty/None input yields
        False.
        """
        if not sentence:
            return False
        if self.find_illegal(sentence) != -1:
            return True
        compact = re.sub(r'\W+', '', sentence).replace("_", '')
        return self.find_illegal(compact) != -1

    # Replace the sensitive word at the given position with the mask character.
    def filter_words(self, filter_str, pos):
        """Mask the banned word starting at ``pos`` with '喵' characters.

        Returns ``filter_str`` unchanged when no banned word starts at
        ``pos`` (instead of raising ``KeyError`` or masking the wrong span).
        """
        end = self._match_end(filter_str, pos)
        if end == -1:
            return filter_str
        return filter_str[:pos] + '喵' * (end - pos + 1) + filter_str[end + 1:]

    def filter_all(self, s):
        """Return ``s`` with every banned word replaced by '喵' characters.

        Matching is done on the reduced string produced by ``draw_words``
        (CJK characters, digits and ASCII letters only); each run of masked
        characters is then mapped back onto ``s``, which also masks any
        punctuation lying inside that run.  The result has the same length
        as ``s``.
        """
        pos_list = []
        reduced = DFA.draw_words(s, pos_list)

        # Pass 1: mask matches in the reduced string.  The scan only moves
        # forward, so it terminates even if a banned word is made of '喵'.
        marks = list(reduced)
        start = 0
        while start < len(reduced):
            end = self._match_end(reduced, start)
            if end == -1:
                start += 1
                continue
            for k in range(start, end + 1):
                marks[k] = '喵'
            start = end + 1

        # Pass 2: project each run of '喵' back onto the original string.
        out = list(s)
        i = 0
        total = len(marks)
        while i < total:
            if marks[i] != '喵':
                i += 1
                continue
            run_start = i
            while i < total and marks[i] == '喵':
                i += 1
            for k in range(pos_list[run_start], pos_list[i - 1] + 1):
                out[k] = '喵'
        return ''.join(out)

    @staticmethod
    def draw_words(_str, pos_list):
        """Extract the characters of ``_str`` that take part in matching.

        Keeps characters in U+4E00..U+9FA5, U+3400..U+4DB5, ASCII digits and
        ASCII letters.  The original index of every kept character is
        appended to ``pos_list`` (mutated in place).

        Returns:
            The kept characters joined into one string.
        """
        kept = []
        for idx, ch in enumerate(_str):
            if ('\u4e00' <= ch <= '\u9fa5' or '\u3400' <= ch <= '\u4db5'
                    or '\u0030' <= ch <= '\u0039'
                    or '\u0061' <= ch <= '\u007a'
                    or '\u0041' <= ch <= '\u005a'):
                kept.append(ch)
                pos_list.append(idx)
        return ''.join(kept)
|
||||
def __init__(self):
|
||||
self.keyword_chains = {}
|
||||
self.delimit = '\x00'
|
||||
|
@ -3047,7 +3185,7 @@ class DFAFilter():
|
|||
start += 1
|
||||
|
||||
return False
|
||||
gfw = DFAFilter()
|
||||
gfw = DFA()
|
||||
def run():
|
||||
logger.debug('starting webserver on %s:%s', settings['server']['bind_address'], settings['server']['port'])
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue