forked from zaclys/searxng

Merge pull request #2285 from return42/fix-digg

bugfix & refactor digg engine

Alexandre Flament
commit 6b5a578822

2 changed files with 42 additions and 43 deletions

Makefile (12 lines changed)
@@ -212,15 +212,15 @@ gecko.driver:
 PHONY += test test.sh test.pylint test.pep8 test.unit test.coverage test.robot
 test: buildenv test.pylint test.pep8 test.unit gecko.driver test.robot
 
-# TODO: balance linting with pylint
-test.pylint: pyenvinstall
-	$(call cmd,pylint,\
+PYLINT_FILES=\
 	searx/preferences.py \
 	searx/testing.py \
 	searx/engines/gigablast.py \
 	searx/engines/deviantart.py \
-	)
+	searx/engines/digg.py
+
+test.pylint: pyenvinstall
+	$(call cmd,pylint,$(PYLINT_FILES))
 	$(call cmd,pylint,\
 		--disable=$(PYLINT_SEARX_DISABLE_OPTION) \
 		--additional-builtins=$(PYLINT_ADDITIONAL_BUILTINS_FOR_ENGINES) \
@@ -249,7 +249,7 @@ test.sh:
 
 test.pep8: pyenvinstall
 	@echo "TEST      pycodestyle (formerly pep8)"
-	$(Q)$(PY_ENV_ACT); pycodestyle --exclude='searx/static, searx/languages.py, searx/engines/gigablast.py, searx/engines/deviantart.py' \
+	$(Q)$(PY_ENV_ACT); pycodestyle --exclude='searx/static, searx/languages.py, $(foreach f,$(PYLINT_FILES),$(f),)' \
 	        --max-line-length=120 --ignore "E117,E252,E402,E722,E741,W503,W504,W605" searx tests
 
 test.unit: pyenvinstall
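Note on the pycodestyle change above: GNU make expands $(foreach f,$(PYLINT_FILES),$(f),) to each file name followed by a comma, joining the iterations with single spaces, so the --exclude pattern list now stays in sync with PYLINT_FILES instead of repeating file names by hand (the expansion leaves a trailing comma, i.e. an empty pattern at the end of the list). A minimal Python sketch of the equivalent expansion, assuming PYLINT_FILES holds exactly the entries listed in the hunk above:

# Sketch only (not part of the commit): mirrors the Makefile's
# $(foreach f,$(PYLINT_FILES),$(f),) expansion in Python.
pylint_files = [
    'searx/preferences.py',
    'searx/testing.py',
    'searx/engines/gigablast.py',
    'searx/engines/deviantart.py',
    'searx/engines/digg.py',
]

# each foreach iteration yields '<f>,'; make joins iterations with a space
expansion = ' '.join(f + ',' for f in pylint_files)
print("--exclude='searx/static, searx/languages.py, " + expansion + "'")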
searx/engines/digg.py

@@ -1,7 +1,7 @@
 """
  Digg (News, Social media)
 
- @website     https://digg.com/
+ @website     https://digg.com
  @provide-api no
 
  @using-api   no
@@ -9,59 +9,58 @@
  @stable      no (HTML can change)
  @parse       url, title, content, publishedDate, thumbnail
 """
+# pylint: disable=missing-function-docstring
 
-import random
-import string
 from json import loads
 from urllib.parse import urlencode
 from datetime import datetime
 
+from lxml import html
+
 # engine dependent config
 categories = ['news', 'social media']
 paging = True
+base_url = 'https://digg.com'
 
 # search-url
-base_url = 'https://digg.com/'
-search_url = base_url + 'api/search/?{query}&from={position}&size=20&format=html'
+search_url = base_url + (
+    '/api/search/'
+    '?{query}'
+    '&from={position}'
+    '&size=20'
+    '&format=html'
+)
 
-# specific xpath variables
-results_xpath = '//article'
-link_xpath = './/small[@class="time"]//a'
-title_xpath = './/h2//a//text()'
-content_xpath = './/p//text()'
-pubdate_xpath = './/time'
-
-digg_cookie_chars = string.ascii_uppercase + string.ascii_lowercase +\
-    string.digits + "+_"
-
-
-# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 20
-    params['url'] = search_url.format(position=offset,
-                                      query=urlencode({'q': query}))
-    params['cookies']['frontend.auid'] = ''.join(random.choice(
-        digg_cookie_chars) for _ in range(22))
+    params['url'] = search_url.format(
+        query = urlencode({'q': query}),
+        position = offset,
+    )
     return params
 
-
-# get response from search-request
 def response(resp):
     results = []
 
-    search_result = loads(resp.text)
-
     # parse results
-    for result in search_result['mapped']:
+    for result in loads(resp.text)['mapped']:
 
-        published = datetime.strptime(result['created']['ISO'], "%Y-%m-%d %H:%M:%S")
-        # append result
-        results.append({'url': result['url'],
-                        'title': result['title'],
-                        'content': result['excerpt'],
-                        'template': 'videos.html',
-                        'publishedDate': published,
-                        'thumbnail': result['images']['thumbImage']})
+        # strip html tags and superfluous quotation marks from content
+        content = html.document_fromstring(
+            result['excerpt']
+        ).text_content()
+
+        # 'created': {'ISO': '2020-10-16T14:09:55Z', ...}
+        published = datetime.strptime(
+            result['created']['ISO'], '%Y-%m-%dT%H:%M:%SZ'
+        )
+        results.append({
+            'url': result['url'],
+            'title': result['title'],
+            'content' : content,
+            'template': 'videos.html',
+            'publishedDate': published,
+            'thumbnail': result['images']['thumbImage'],
+        })
 
-    # return results
     return results
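For illustration, a self-contained sketch (not part of the commit) of how the refactored request() builds the paged search URL; the query string and page number are made-up values:

from urllib.parse import urlencode

base_url = 'https://digg.com'
search_url = base_url + (
    '/api/search/'
    '?{query}'
    '&from={position}'
    '&size=20'
    '&format=html'
)

# page 2 -> offset 20, exactly as request() computes it
params = {'pageno': 2}
offset = (params['pageno'] - 1) * 20
print(search_url.format(query=urlencode({'q': 'example'}), position=offset))
# https://digg.com/api/search/?q=example&from=20&size=20&format=html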
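Likewise, the two behavioral fixes in response() can be exercised in isolation; the excerpt string below is a hypothetical stand-in for the API payload, and the timestamp comes from the comment in the diff:

from datetime import datetime

from lxml import html

# 1) excerpts arrive wrapped in HTML; text_content() flattens them to text
excerpt = '<p>A <em>hypothetical</em> excerpt</p>'
print(html.document_fromstring(excerpt).text_content())
# A hypothetical excerpt

# 2) 'created.ISO' uses a 'T' separator and a trailing 'Z', which the old
# format string "%Y-%m-%d %H:%M:%S" could not parse (ValueError)
print(datetime.strptime('2020-10-16T14:09:55Z', '%Y-%m-%dT%H:%M:%SZ'))
# 2020-10-16 14:09:55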