forked from zaclys/searxng
		
	[fix] checker: fix memory usage
* download images using the "image_proxy" network (HTTP/1 instead of HTTP/2)
* don't cache data: URLs (reduces memory usage)
* after each test: purge the image URL cache, then call the garbage collector
* download only the first 64 KB of each image
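The cache split is the heart of the fix: functools.lru_cache retains every argument it has seen as a cache key, so passing a multi-megabyte data:image/...;base64 URL into a cached function pins the whole string in memory for the lifetime of the cache entry. The commit therefore rejects data: URLs in a cheap, uncached wrapper before the cached downloader runs, and purges the cache between tests. A minimal standalone sketch of the pattern (illustrative names only, not searx code):

import functools
import gc

@functools.lru_cache(maxsize=8192)
def _cached_remote_check(url: str) -> bool:
    # The expensive network request would go here; note that every `url`
    # passed in is retained as a cache key until evicted or cleared.
    return url.startswith("http")

def is_url_image(url: str) -> bool:
    # Cheap, uncached pre-filter: data: URLs (potentially megabytes of
    # base64) never reach the cached function, so they never become keys.
    if not url.startswith(("http://", "https://")):
        return False
    return _cached_remote_check(url)

# Between test batches, drop the cache and force a collection,
# mirroring what Checker.run() does in this commit:
_cached_remote_check.cache_clear()
gc.collect()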
parent ca67f1555a
commit 2eab89b4ca
1 changed file with 48 additions and 27 deletions
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
 
+import gc
 import typing
 import types
 import functools
@@ -14,6 +15,7 @@ from langdetect.lang_detect_exception import LangDetectException
 import httpx
 
 from searx import network, logger
+from searx.utils import gen_useragent
 from searx.results import ResultContainer
 from searx.search.models import SearchQuery, EngineRef
 from searx.search.processors import EngineProcessor
@@ -58,7 +60,47 @@ def _is_url(url):
 
 
 @functools.lru_cache(maxsize=8192)
-def _is_url_image(image_url):
+def _download_and_check_if_image(image_url: str) -> bool:
+    """Download an URL and check if the Content-Type starts with "image/"
+    This function should not be called directly: use _is_url_image
+    otherwise the cache of functools.lru_cache contains data: URL which might be huge.
+    """
+    retry = 2
+
+    while retry > 0:
+        a = time()
+        try:
+            # use "image_proxy" (avoid HTTP/2)
+            network.set_context_network_name('image_proxy')
+            stream = network.stream('GET', image_url, timeout=10.0, allow_redirects=True, headers={
+                'User-Agent': gen_useragent(),
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+                'Accept-Language': 'en-US;q=0.5,en;q=0.3',
+                'Accept-Encoding': 'gzip, deflate, br',
+                'DNT': '1',
+                'Connection': 'keep-alive',
+                'Upgrade-Insecure-Requests': '1',
+                'Sec-GPC': '1',
+                'Cache-Control': 'max-age=0'
+            })
+            r = next(stream)
+            r.close()
+            is_image = r.headers["content-type"].startswith('image/')
+            del r
+            del stream
+            return is_image
+        except httpx.TimeoutException:
+            logger.error('Timeout for %s: %i', image_url, int(time() - a))
+            retry -= 1
+        except httpx.HTTPError:
+            logger.exception('Exception for %s', image_url)
+            return False
+    return False
+
+
+def _is_url_image(image_url) -> bool:
+    """Normalize image_url
+    """
     if not isinstance(image_url, str):
         return False
 
@@ -71,32 +113,7 @@ def _is_url_image(image_url):
     if not _is_url(image_url):
         return False
 
-    retry = 2
-
-    while retry > 0:
-        a = time()
-        try:
-            network.set_timeout_for_thread(10.0, time())
-            r = network.get(image_url, timeout=10.0, allow_redirects=True, headers={
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
-                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-                'Accept-Language': 'en-US;q=0.5,en;q=0.3',
-                'Accept-Encoding': 'gzip, deflate, br',
-                'DNT': '1',
-                'Connection': 'keep-alive',
-                'Upgrade-Insecure-Requests': '1',
-                'Sec-GPC': '1',
-                'Cache-Control': 'max-age=0'
-            })
-            if r.headers["content-type"].startswith('image/'):
-                return True
-            return False
-        except httpx.TimeoutException:
-            logger.error('Timeout for %s: %i', image_url, int(time() - a))
-            retry -= 1
-        except httpx.HTTPError:
-            logger.exception('Exception for %s', image_url)
-            return False
+    return _download_and_check_if_image(image_url)
 
 
 def _search_query_to_dict(search_query: SearchQuery) -> typing.Dict[str, typing.Any]:
@@ -414,3 +431,7 @@ class Checker:
     def run(self):
         for test_name in self.tests:
            self.run_test(test_name)
+            # clear cache
+            _download_and_check_if_image.cache_clear()
+            # force a garbage collector
+            gc.collect()
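For reference, the new code checks only the response headers: network.stream() yields the response object before any body chunks, so next(stream) followed by r.close() inspects Content-Type without pulling the image body. A rough standalone equivalent using plain httpx (assuming a recent httpx with the follow_redirects keyword; searx's network wrapper and the 64 KB cap mentioned in the commit message are not reproduced here):

import httpx

def content_type_is_image(url: str) -> bool:
    # Entering the stream context sends the request and reads only the
    # response headers; leaving it closes the connection before the body
    # (potentially a large image) is downloaded.
    with httpx.stream("GET", url, timeout=10.0, follow_redirects=True) as response:
        return response.headers.get("content-type", "").startswith("image/")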