searxng/searx/engines/bing_news.py

"""
 Bing (News)

 @website     https://www.bing.com/news
 @provide-api yes (http://datamarket.azure.com/dataset/bing/search),
              max. 5000 queries/month

 @using-api   no (because of query limit)
 @results     HTML (using search portal)
 @stable      no (HTML can change)
 @parse       url, title, content, publishedDate
"""

from urllib import urlencode
from cgi import escape
from lxml import html
from datetime import datetime, timedelta
from dateutil import parser
import re
from searx.engines.xpath import extract_text

# engine settings: result category, paging and language selection enabled
categories = ["news"]
paging = True
language_support = True

# search URL: site root plus the news-search path template, whose
# {query} and {offset} placeholders are filled in by request()
base_url = "https://www.bing.com/"
search_string = "news/search?{query}&first={offset}"


# build the outgoing search request
def request(query, params):
    """Fill ``params`` with the Bing News URL and cookie for ``query``.

    Reads ``params['pageno']`` (the offset starts at 1 and advances by 10
    per page) and ``params['language']`` (``'all'`` falls back to
    ``en-US``; otherwise underscores are replaced with dashes).  Sets
    ``params['cookies']['_FP']`` and ``params['url']`` and returns the
    same ``params`` dict.
    """
    first_result = 10 * (params['pageno'] - 1) + 1

    # market code sent as ``setmkt``
    market = 'en-US' if params['language'] == 'all' else params['language'].replace('_', '-')

    encoded_query = urlencode({'q': query, 'setmkt': market})
    path = search_string.format(query=encoded_query, offset=first_result)

    # the interface language is pinned to English; the market above is
    # still taken from the requested language
    params['cookies']['_FP'] = "ui=en-US"
    params['url'] = base_url + path

    return params


# relative ages as rendered with the English interface (see the ``_FP``
# cookie set in request()), paired with the timedelta keyword(s) that the
# captured numbers stand for, in order
_RELATIVE_DATE_PATTERNS = (
    (re.compile(r'^([0-9]+) minutes? ago$'), ('minutes',)),
    (re.compile(r'^([0-9]+) hours? ago$'), ('hours',)),
    (re.compile(r'^([0-9]+) hours?, ([0-9]+) minutes? ago$'), ('hours', 'minutes')),
    (re.compile(r'^([0-9]+) days? ago$'), ('days',)),
)


def _parse_published_date(text):
    """Turn the date label of a result into a ``datetime``.

    Relative labels ("5 minutes ago", "1 hour, 20 minutes ago",
    "3 days ago", ...) are subtracted from the current local time.
    Anything else is handed to ``dateutil``; if that fails, the current
    time is returned so that the result itself is never lost.
    """
    for pattern, units in _RELATIVE_DATE_PATTERNS:
        match = pattern.match(text)
        if match:
            amounts = [int(number) for number in match.groups()]
            return datetime.now() - timedelta(**dict(zip(units, amounts)))

    try:
        return parser.parse(text, dayfirst=False)
    except (TypeError, ValueError, OverflowError):
        # unparsable (or out-of-range) label: date the result "now"
        return datetime.now()


# get response from search-request
def response(resp):
    """Parse a Bing News result page into a list of result dicts.

    ``resp.content`` is parsed as HTML.  Every ``div.sn_r`` element that
    carries a title link yields one dict with the keys ``url``, ``title``,
    ``publishedDate`` (a ``datetime``) and ``content`` (HTML-escaped
    snippet text).  Elements without a title link are skipped.
    """
    results = []

    dom = html.fromstring(resp.content)

    # parse results
    for result in dom.xpath('//div[@class="sn_r"]'):
        links = result.xpath('.//div[@class="newstitle"]/a')
        if not links:
            # a block without a title link has no url/title to report;
            # skip it rather than let an IndexError discard the whole page
            continue
        link = links[0]

        url = link.attrib.get('href')
        title = extract_text(link)
        snippet_nodes = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]')
        content = escape(extract_text(snippet_nodes))

        # parse publishedDate; the label is only parsed, never rendered,
        # so it is not HTML-escaped
        date_nodes = result.xpath('.//div[@class="sn_txt"]/div'
                                  '//div[contains(@class,"sn_ST")]'
                                  '//span[contains(@class,"sn_tm")]')
        publishedDate = _parse_published_date(extract_text(date_nodes))

        # append result
        results.append({'url': url,
                        'title': title,
                        'publishedDate': publishedDate,
                        'content': content})

    # return results
    return results
update versions.cfg to use the current up-to-date packages 2015-05-02 15:45:17 +02:00			`"""`
			`Bing (News)`

			`@website https://www.bing.com/news`
			`@provide-api yes (http://datamarket.azure.com/dataset/bing/search),`
			`max. 5000 query/month`

			`@using-api no (because of query limit)`
			`@results HTML (using search portal)`
			`@stable no (HTML can change)`
			`@parse url, title, content, publishedDate`
			`"""`
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00
Create bing_news.py 2014-03-04 13:10:04 +01:00			`from urllib import urlencode`
			`from cgi import escape`
			`from lxml import html`
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00			`from datetime import datetime, timedelta`
			`from dateutil import parser`
			`import re`
Bing news' unit test I have no idea why coverage tell 97% and 2 misses in branches. If anyone has an idea... 2015-01-29 20:56:57 +01:00			`from searx.engines.xpath import extract_text`
Create bing_news.py 2014-03-04 13:10:04 +01:00
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00			`# engine dependent config`
Create bing_news.py 2014-03-04 13:10:04 +01:00			`categories = ['news']`
			`paging = True`
			`language_support = True`

update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00			`# search-url`
			`base_url = 'https://www.bing.com/'`
			`search_string = 'news/search?{query}&first={offset}'`
Create bing_news.py 2014-03-04 13:10:04 +01:00
little refactoring 2014-09-02 17:13:44 +02:00
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00			`# do search-request`
Create bing_news.py 2014-03-04 13:10:04 +01:00			`def request(query, params):`
			`offset = (params['pageno'] - 1) * 10 + 1`
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00
Create bing_news.py 2014-03-04 13:10:04 +01:00			`if params['language'] == 'all':`
			`language = 'en-US'`
			`else:`
			`language = params['language'].replace('_', '-')`
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00
Create bing_news.py 2014-03-04 13:10:04 +01:00			`search_path = search_string.format(`
			`query=urlencode({'q': query, 'setmkt': language}),`
			`offset=offset)`

Change the cookie in bing_news to use the english interface But still uses the language to set the market, and so provide relevant results to the language. Fix #198 2015-01-22 22:46:34 +01:00			`params['cookies']['_FP'] = "ui=en-US"`
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00
Create bing_news.py 2014-03-04 13:10:04 +01:00			`params['url'] = base_url + search_path`
Bing news' unit test I have no idea why coverage tell 97% and 2 misses in branches. If anyone has an idea... 2015-01-29 20:56:57 +01:00
Create bing_news.py 2014-03-04 13:10:04 +01:00			`return params`


update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00			`# get response from search-request`
Create bing_news.py 2014-03-04 13:10:04 +01:00			`def response(resp):`
			`results = []`
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00
Create bing_news.py 2014-03-04 13:10:04 +01:00			`dom = html.fromstring(resp.content)`
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00
			`# parse results`
			`for result in dom.xpath('//div[@class="sn_r"]'):`
			`link = result.xpath('.//div[@class="newstitle"]/a')[0]`
Create bing_news.py 2014-03-04 13:10:04 +01:00			`url = link.attrib.get('href')`
Bing news' unit test I have no idea why coverage tell 97% and 2 misses in branches. If anyone has an idea... 2015-01-29 20:56:57 +01:00			`title = extract_text(link)`
			`contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]')`
Bing news engine corrections XPath never return None. (I found the HTML report of coverage) 2015-01-29 21:19:59 +01:00			`content = escape(extract_text(contentXPath))`
[fix] pep8 : engines (errors E121, E127, E128 and E501 still exist) 2014-12-07 16:37:56 +01:00
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00			`# parse publishedDate`
[fix] pep8 2014-12-16 17:26:16 +01:00			`publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'`
[fix] bing_news 2015-05-27 12:08:50 +02:00			`'//div[contains(@class,"sn_ST")]'`
Bing news' unit test I have no idea why coverage tell 97% and 2 misses in branches. If anyone has an idea... 2015-01-29 20:56:57 +01:00			`'//span[contains(@class,"sn_tm")]')`

Bing news engine corrections XPath never return None. (I found the HTML report of coverage) 2015-01-29 21:19:59 +01:00			`publishedDate = escape(extract_text(publishedDateXPath))`
Create bing_news.py 2014-03-04 13:10:04 +01:00
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00			`if re.match("^[0-9]+ minute(s\|) ago$", publishedDate):`
			`timeNumbers = re.findall(r'\d+', publishedDate)`
Bing news' unit test I have no idea why coverage tell 97% and 2 misses in branches. If anyone has an idea... 2015-01-29 20:56:57 +01:00			`publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0]))`
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00			`elif re.match("^[0-9]+ hour(s\|) ago$", publishedDate):`
			`timeNumbers = re.findall(r'\d+', publishedDate)`
Bing news' unit test I have no idea why coverage tell 97% and 2 misses in branches. If anyone has an idea... 2015-01-29 20:56:57 +01:00			`publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0]))`
			`elif re.match("^[0-9]+ hour(s\|), [0-9]+ minute(s\|) ago$", publishedDate):`
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00			`timeNumbers = re.findall(r'\d+', publishedDate)`
			`publishedDate = datetime.now()\`
			`- timedelta(hours=int(timeNumbers[0]))\`
			`- timedelta(minutes=int(timeNumbers[1]))`
[fix] bing_new engine : fix published date parsing 2014-09-07 18:10:05 +02:00			`elif re.match("^[0-9]+ day(s\|) ago$", publishedDate):`
			`timeNumbers = re.findall(r'\d+', publishedDate)`
Bing news' unit test I have no idea why coverage tell 97% and 2 misses in branches. If anyone has an idea... 2015-01-29 20:56:57 +01:00			`publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0]))`
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00			`else:`
[fix] bing_new engine : fix published date parsing 2014-09-07 18:10:05 +02:00			`try:`
			`publishedDate = parser.parse(publishedDate, dayfirst=False)`
			`except TypeError:`
			`publishedDate = datetime.now()`
update versions.cfg to use the current up-to-date packages 2015-05-02 15:45:17 +02:00			`except ValueError:`
			`publishedDate = datetime.now()`
[fix] pep8 : engines (errors E121, E127, E128 and E501 still exist) 2014-12-07 16:37:56 +01:00
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00			`# append result`
[fix] pep8 : engines (errors E121, E127, E128 and E501 still exist) 2014-12-07 16:37:56 +01:00			`results.append({'url': url,`
			`'title': title,`
update bing engines and fix bing_news 2014-09-01 14:38:59 +02:00			`'publishedDate': publishedDate,`
			`'content': content})`

			`# return results`
Create bing_news.py 2014-03-04 13:10:04 +01:00			`return results`