c

2024-01-01 19:24:07 +01:00 · 2023-03-02 00:00:53 +08:00 · 2023-03-02 00:00:53 +08:00 · a1049908d0
commit a1049908d0
parent 4216238718
2 changed files with 287 additions and 12 deletions
--- a/searx/webapp.py
+++ b/searx/webapp.py
--- a/searx/webapp1.py
+++ b/searx/webapp1.py
@ -702,7 +702,7 @@ def search():
    try:
        search_query, raw_text_query, _, _ = get_search_query_from_webapp(request.preferences, request.form)
        # search = Search(search_query) #  without plugins
-        if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.filter(search_query.query):
+        if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.exists(search_query.query):
            return index_error(output_format, 'No query'), 400
        try:
            original_search_query = search_query.query
@ -801,10 +801,10 @@ def search():
        for res in results:
            if 'url' not in res: continue
            if 'title' not in res: continue
-            if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.filter(res['title']):
+            if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.exists(res['title']):
                return index_error(output_format, 'No query'), 400
            if 'content' not in res: continue
-            if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.filter(res['content']):
+            if request.environ['HTTP_CF_IPCOUNTRY'] == 'CN' and gfw.exists(res['content']):
                return index_error(output_format, 'No query'), 400
            if res['content'] == '': continue
@ -2987,7 +2987,145 @@ if not werkzeug_reloader or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_
    search_initialize(enable_checker=True, check_network=True, enable_metrics=settings['general']['enable_metrics'])
-class DFAFilter():
+class DFA:
    def __init__(self, path: str = None):
        self.ban_words_set = set()
        self.ban_words_list = list()
        self.ban_words_dict = dict()
        if not path:
            self.path = 'Data/Danger.form'
        else:
            self.path = path
        self.get_words()
    # Load the list of sensitive words
    def get_words(self):
        with open(self.path, 'r', encoding='utf-8-sig') as f:
            for s in f:
                if s.find('\\r'):
                    s = s.replace('\r', '')
                s = s.replace('\n', '')
                s = s.strip()
                if len(s) == 0:
                    continue
                if str(s) and s not in self.ban_words_set:
                    self.ban_words_set.add(s)
                    self.ban_words_list.append(str(s))
                    sentence = pycorrector.simplified2traditional(s)
                    if sentence != s:
                        self.ban_words_set.add(sentence)
                        self.ban_words_list.append(str(sentence))
        self.add_hash_dict(self.ban_words_list)
    def change_words(self, path):
        self.ban_words_list.clear()
        self.ban_words_dict.clear()
        self.ban_words_set.clear()
        self.path = path
        self.get_words()
    # Convert the list of sensitive words into the DFA (nested-dict) structure
    def add_hash_dict(self, new_list):
        for x in new_list:
            self.add_new_word(x)
    # Add a single sensitive word
    def add_new_word(self, new_word):
        new_word = str(new_word)
        # print(new_word)
        now_dict = self.ban_words_dict
        i = 0
        for x in new_word:
            if x not in now_dict:
                x = str(x)
                new_dict = dict()
                new_dict['is_end'] = False
                now_dict[x] = new_dict
                now_dict = new_dict
            else:
                now_dict = now_dict[x]
            if i == len(new_word) - 1:
                now_dict['is_end'] = True
            i += 1
    # Find the position where a sensitive word first occurs
    def find_illegal(self, _str):
        now_dict = self.ban_words_dict
        i = 0
        start_word = -1
        is_start = True  # 判断是否是一个敏感词的开始
        while i < len(_str):
            if _str[i] not in now_dict:
                if is_start is True:
                    i += 1
                    continue
                i = start_word + 1
                start_word = -1
                is_start = True
                now_dict = self.ban_words_dict
            else:
                if is_start is True:
                    start_word = i
                    is_start = False
                now_dict = now_dict[_str[i]]
                if now_dict['is_end'] is True:
                    return start_word
                else:
                    i += 1
        return -1
    # Check whether any sensitive word is present
    def exists(self, sentence):
        pos = self.find_illegal(sentence)
        _sentence = re.sub('\W+', '', sentence).replace("_", '')
        _pos = self.find_illegal(_sentence)
        if pos == -1 and _pos == -1:
            return False
        else:
            return True
    # Replace the sensitive word starting at the given position with '喵'
    def filter_words(self, filter_str, pos):
        now_dict = self.ban_words_dict
        end_str = int()
        for i in range(pos, len(filter_str)):
            if now_dict[filter_str[i]]['is_end'] is True:
                end_str = i
                break
            now_dict = now_dict[filter_str[i]]
        num = end_str - pos + 1
        filter_str = filter_str[:pos] + '喵' * num + filter_str[end_str + 1:]
        return filter_str
    def filter_all(self, s):
        pos_list = list()
        ss = DFA.draw_words(s, pos_list)
        illegal_pos = self.find_illegal(ss)
        while illegal_pos != -1:
            ss = self.filter_words(ss, illegal_pos)
            illegal_pos = self.find_illegal(ss)
        i = 0
        while i < len(ss):
            if ss[i] == '喵':
                start = pos_list[i]
                while i < len(ss) and ss[i] == '喵':
                    i += 1
                i -= 1
                end = pos_list[i]
                num = end - start + 1
                s = s[:start] + '喵' * num + s[end + 1:]
            i += 1
        return s
    @staticmethod
    def draw_words(_str, pos_list):
        ss = str()
        for i in range(len(_str)):
            if '\u4e00' <= _str[i] <= '\u9fa5' or '\u3400' <= _str[i] <= '\u4db5' or '\u0030' <= _str[i] <= '\u0039' \
                    or '\u0061' <= _str[i] <= '\u007a' or '\u0041' <= _str[i] <= '\u005a':
                ss += _str[i]
                pos_list.append(i)
        return ss
    def __init__(self):
        self.keyword_chains = {}
        self.delimit = '\x00'
@ -3047,7 +3185,7 @@ class DFAFilter():
            start += 1
        return False
-gfw = DFAFilter()
+gfw = DFA()
 def run():
    logger.debug('starting webserver on %s:%s', settings['server']['bind_address'], settings['server']['port'])