diff --git a/requirements-dev.txt b/requirements-dev.txt
index 7bcbd8251..fd1939ec4 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -22,4 +22,4 @@ wlc==1.13
 coloredlogs==15.0.1
 requests
 markdown
-tiktoken
\ No newline at end of file
+transformers
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 016484508..e26de1a49 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,4 +18,4 @@ typing_extensions==4.5.0
 fasttext-predict==0.9.2.1
 requests
 markdown
-tiktoken
\ No newline at end of file
+transformers
\ No newline at end of file
diff --git a/searx/webapp.py b/searx/webapp.py
index c62eb5fd9..d3a751ded 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -17,7 +17,7 @@
 import requests
 import markdown
 import re
 import datetime
-import tiktoken
+from transformers import GPT2TokenizerFast
 from timeit import default_timer
 from html import escape
@@ -769,7 +769,8 @@ def search():
         res['content'] = res['content'].replace("This Tweet was deleted by the Tweet author.","Deleted Tweet.")
         tmp_prompt = res['title'] +'\n'+ res['content'] + '\n' + new_url +'\n'
 
-        if len( tiktoken.get_encoding("gpt2").encode(prompt + tmp_prompt +'\n' + "\n以上是问题 " + original_search_query + " 的搜索结果,删除与问题相关度低的内容,用简体中文分条总结简报,在文中用(链接)标注对应内容来源链接,不要把链接都放在最后。结果:") )<2990:
+        tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+        if len( tokenizer(prompt + tmp_prompt +'\n' + "\n以上是问题 " + original_search_query + " 的搜索结果,删除与问题相关度低的内容,用简体中文分条总结简报,在文中用(链接)标注对应内容来源链接,不要把链接都放在最后。结果:")['input_ids'] )<2990:
             prompt += tmp_prompt +'\n'
     if prompt != "":
         gpt = ""