Mirror of https://github.com/searxng/searxng (synced 2024-01-01 19:24:07 +01:00)
			
		
		
		
	Merge pull request #452 from pointhi/engine_fix
[enh] fix content fetching, parse published date from description for startpage and ixquick
This commit is contained in:
		
						commit
						3a2f29344a
					
				
					 2 changed files with 40 additions and 9 deletions
				
			
		| 
						 | 
				
			
			@ -12,6 +12,8 @@
 | 
			
		|||
 | 
			
		||||
from lxml import html
 | 
			
		||||
from cgi import escape
 | 
			
		||||
from dateutil import parser
 | 
			
		||||
from datetime import datetime, timedelta
 | 
			
		||||
import re
 | 
			
		||||
from searx.engines.xpath import extract_text
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -79,15 +81,44 @@ def response(resp):
 | 
			
		|||
 | 
			
		||||
        title = escape(extract_text(link))
 | 
			
		||||
 | 
			
		||||
        if result.xpath('./p[@class="desc"]'):
 | 
			
		||||
            content = escape(extract_text(result.xpath('./p[@class="desc"]')))
 | 
			
		||||
        if result.xpath('./p[@class="desc clk"]'):
 | 
			
		||||
            content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
 | 
			
		||||
        else:
 | 
			
		||||
            content = ''
 | 
			
		||||
 | 
			
		||||
        # append result
 | 
			
		||||
        results.append({'url': url,
 | 
			
		||||
                        'title': title,
 | 
			
		||||
                        'content': content})
 | 
			
		||||
        published_date = None
 | 
			
		||||
 | 
			
		||||
        # check if search result starts with something like: "2 Sep 2014 ... "
 | 
			
		||||
        if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
 | 
			
		||||
            date_pos = content.find('...')+4
 | 
			
		||||
            date_string = content[0:date_pos-5]
 | 
			
		||||
            published_date = parser.parse(date_string, dayfirst=True)
 | 
			
		||||
 | 
			
		||||
            # fix content string
 | 
			
		||||
            content = content[date_pos:]
 | 
			
		||||
 | 
			
		||||
        # check if search result starts with something like: "5 days ago ... "
 | 
			
		||||
        elif re.match("^[0-9]+ days? ago \.\.\. ", content):
 | 
			
		||||
            date_pos = content.find('...')+4
 | 
			
		||||
            date_string = content[0:date_pos-5]
 | 
			
		||||
 | 
			
		||||
            # calculate datetime
 | 
			
		||||
            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
 | 
			
		||||
 | 
			
		||||
            # fix content string
 | 
			
		||||
            content = content[date_pos:]
 | 
			
		||||
 | 
			
		||||
        if published_date:
 | 
			
		||||
            # append result
 | 
			
		||||
            results.append({'url': url,
 | 
			
		||||
                            'title': title,
 | 
			
		||||
                            'content': content,
 | 
			
		||||
                            'publishedDate': published_date})
 | 
			
		||||
        else:
 | 
			
		||||
            # append result
 | 
			
		||||
            results.append({'url': url,
 | 
			
		||||
                            'title': title,
 | 
			
		||||
                            'content': content})
 | 
			
		||||
 | 
			
		||||
    # return results
 | 
			
		||||
    return results
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -42,7 +42,7 @@ class TestStartpageEngine(SearxTestCase):
 | 
			
		|||
                </a>
 | 
			
		||||
                <span id='title_stars_2' name='title_stars_2'>  </span>
 | 
			
		||||
            </h3>
 | 
			
		||||
            <p class='desc'>
 | 
			
		||||
            <p class='desc clk'>
 | 
			
		||||
                This should be the content.
 | 
			
		||||
            </p>
 | 
			
		||||
            <p>
 | 
			
		||||
| 
						 | 
				
			
			@ -78,7 +78,7 @@ class TestStartpageEngine(SearxTestCase):
 | 
			
		|||
                </a>
 | 
			
		||||
                <span id='title_stars_2' name='title_stars_2'>  </span>
 | 
			
		||||
            </h3>
 | 
			
		||||
            <p class='desc'>
 | 
			
		||||
            <p class='desc clk'>
 | 
			
		||||
                This should be the content.
 | 
			
		||||
            </p>
 | 
			
		||||
            <p>
 | 
			
		||||
| 
						 | 
				
			
			@ -101,7 +101,7 @@ class TestStartpageEngine(SearxTestCase):
 | 
			
		|||
            <h3>
 | 
			
		||||
                <span id='title_stars_2' name='title_stars_2'>  </span>
 | 
			
		||||
            </h3>
 | 
			
		||||
            <p class='desc'>
 | 
			
		||||
            <p class='desc clk'>
 | 
			
		||||
                This should be the content.
 | 
			
		||||
            </p>
 | 
			
		||||
            <p>
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		
		Reference in a new issue