Merge pull request #2094 from xywei/xpath-handle-relative-url

Extract relative URLs that do not start with `/` when using the xpath engine
This commit is contained in:
Adam Tauber 2020-07-25 03:53:38 +02:00 committed by GitHub
commit 7b71954d7e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 4 additions and 0 deletions

View File

@@ -61,6 +61,10 @@ def extract_url(xpath_results, search_url):
# fix relative url to the search engine # fix relative url to the search engine
url = urljoin(search_url, url) url = urljoin(search_url, url)
# fix relative urls that fall through the cracks
if '://' not in url:
url = urljoin(search_url, url)
# normalize url # normalize url
url = normalize_url(url) url = normalize_url(url)