fix Wikipedia's paragraph extraction

This commit is contained in:
Marc Abonce Seguin 2020-07-26 23:27:16 -07:00
parent 6d18769ccf
commit 77b9faa8df

View File

@ -49,29 +49,6 @@ def request(query, params):
return params return params
# get first meaningful paragraph
# this should filter out disambiguation pages and notes above first paragraph
# "magic numbers" were obtained by fine tuning
def extract_first_paragraph(content, title, image):
    """Return the first meaningful paragraph of ``content``, or ``None``.

    ``content`` is split on newlines and only the first four paragraphs
    are examined.  A paragraph is accepted when either:

    * it is at least 200 characters long, or
    * ``title`` occurs (case-insensitively) entirely within its first
      ``len(title) + 35`` characters and, in addition, ``image`` is truthy
      or the paragraph is at least 150 characters long.

    The numeric thresholds are empirical values obtained by fine tuning.

    :param content: plain-text extract of the page; may be ``None`` or empty.
    :param title: page title looked for near the start of each paragraph.
    :param image: truthy when the page has an image, which relaxes the
        length requirement for paragraphs that mention the title.
    :returns: the first accepted paragraph, or ``None`` when ``content`` is
        missing/empty or none of the first four paragraphs qualifies.
    """
    # A page without an extract yields None from the caller's dict lookup;
    # treat that (and an empty string) as "no paragraph" instead of raising.
    if not content:
        return None

    # Loop invariants: lowered title and the end of the search window.
    needle = title.lower()
    window_end = len(title) + 35

    # Give up after four unsuitable paragraphs, so only the first four
    # paragraphs are ever considered.
    for paragraph in content.split('\n')[:4]:
        length = len(paragraph)
        if length >= 200:
            return paragraph
        title_near_start = paragraph.lower().find(needle, 0, window_end) >= 0
        if title_near_start and (image or length >= 150):
            return paragraph

    return None
# get response from search-request # get response from search-request
def response(resp): def response(resp):
results = [] results = []
@ -97,10 +74,7 @@ def response(resp):
if image: if image:
image = image.get('source') image = image.get('source')
extract = page.get('extract') summary = page.get('extract', '').split('\n')[0].replace('()', '')
summary = extract_first_paragraph(extract, title, image)
summary = summary.replace('() ', '')
# link to wikipedia article # link to wikipedia article
wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \ wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \