From 6f49bd8f08fbd52c67e21477fc2d21553d8220d7 Mon Sep 17 00:00:00 2001
From: Thomas Pointhuber
Date: Mon, 17 Mar 2014 11:43:00 +0100
Subject: [PATCH 1/4] adding publish date to video-results and rss-feed

---
 searx/templates/opensearch_response_rss.xml  | 1 +
 searx/templates/result_templates/videos.html | 1 +
 searx/webapp.py                              | 2 ++
 3 files changed, 4 insertions(+)

diff --git a/searx/templates/opensearch_response_rss.xml b/searx/templates/opensearch_response_rss.xml
index 417b195a3..5673eb2e1 100644
--- a/searx/templates/opensearch_response_rss.xml
+++ b/searx/templates/opensearch_response_rss.xml
@@ -16,6 +16,7 @@
       <title>{{ r.title }}</title>
       <link>{{ r.url }}</link>
       <description>{{ r.content }}</description>
+      {% if r.pubdate %}<pubDate>{{ r.pubdate }}</pubDate>{% endif %}
    </item>
    {% endfor %}
  </channel>
diff --git a/searx/templates/result_templates/videos.html b/searx/templates/result_templates/videos.html
index d3391f0d3..ab869a6eb 100644
--- a/searx/templates/result_templates/videos.html
+++ b/searx/templates/result_templates/videos.html
@@ -5,6 +5,7 @@
     <h3 class="result_title"><a href="{{ result.url }}">{{ result.title|safe }}</a></h3>
+    {% if result.publishedDate %}<p class="published_date">{{ result.publishedDate }}</p>{% endif %}
     <a href="{{ result.url }}"><img src="{{ result.thumbnail }}" title="{{ result.title }}" alt=" {{ result.title }}" /></a>
     <p class="url">{{ result.url }}</p>
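
[Note, not part of the patch] RSS 2.0 expects <pubDate> to hold an RFC 822 date string, which is why the webapp.py hunk below pre-formats result['pubdate'] with strftime instead of handing the raw datetime to the template. A minimal sketch of the format being produced (illustrative only):

    from datetime import datetime

    # RFC 822 style date for RSS, e.g. 'Mon, 17 Mar 2014 11:43:00 +0000';
    # a naive datetime carries no offset of its own, hence the hard-coded +0000 below.
    d = datetime(2014, 3, 17, 11, 43, 0)
    print(d.strftime('%a, %d %b %Y %H:%M:%S +0000'))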

diff --git a/searx/webapp.py b/searx/webapp.py
index a52dd6289..d06c58f32 100644
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -168,6 +168,8 @@ def index():
                 else:
                     result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa
             else:
+                # TODO using right timezone
+                result['pubdate'] = result['publishedDate'].strftime('%a, %d %b %Y %H:%M:%S +0000')
                 result['publishedDate'] = format_date(result['publishedDate'])
 
     if search.request_data.get('format') == 'json':

From fd86bf8189683aee72b934c8dd7544aa362a0728 Mon Sep 17 00:00:00 2001
From: Thomas Pointhuber
Date: Tue, 18 Mar 2014 13:19:23 +0100
Subject: [PATCH 2/4] fix bug, to display publishDate with timezone without error

---
 searx/webapp.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/searx/webapp.py b/searx/webapp.py
index d06c58f32..a4c05a9e9 100644
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -159,8 +159,8 @@ def index():
 
         # TODO, check if timezone is calculated right
         if 'publishedDate' in result:
-            if result['publishedDate'] >= datetime.now() - timedelta(days=1):
-                timedifference = datetime.now() - result['publishedDate']
+            if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
+                timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None)
                 minutes = int((timedifference.seconds / 60) % 60)
                 hours = int(timedifference.seconds / 60 / 60)
                 if hours == 0:
@@ -168,8 +168,7 @@ def index():
                 else:
                     result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa
             else:
-                # TODO using right timezone
-                result['pubdate'] = result['publishedDate'].strftime('%a, %d %b %Y %H:%M:%S +0000')
+                result['pubdate'] = result['publishedDate'].strftime('%a, %d %b %Y %H:%M:%S %z')
                 result['publishedDate'] = format_date(result['publishedDate'])
 
     if search.request_data.get('format') == 'json':

From 337bd6d907503176eb94290c3f386ce88167dea8 Mon Sep 17 00:00:00 2001
From: Thomas Pointhuber
Date: Tue, 18 Mar 2014 13:19:50 +0100
Subject: [PATCH 3/4] simplify datetime extraction

---
 requirements.txt             |  1 +
 searx/engines/google_news.py | 14 ++++----------
 searx/engines/yahoo_news.py  |  5 ++---
 searx/engines/youtube.py     |  7 +++++++
 setup.py                     |  1 +
 5 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 0f69bc883..88c1bc715 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ flask-babel
 grequests
 lxml
 pyyaml
+python-dateutil
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index 43ccaa3e3..b8a7be3ee 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -2,6 +2,7 @@
 
 from urllib import urlencode
 from json import loads
+from dateutil import parser
 from datetime import datetime
 
 categories = ['news']
@@ -32,16 +33,9 @@ def response(resp):
         return []
 
     for result in search_res['responseData']['results']:
-# S.149 (159), library.pdf
-# datetime.strptime("Mon, 10 Mar 2014 16:26:15 -0700",
-#                   "%a, %d %b %Y %H:%M:%S %z")
-# publishedDate = parse(result['publishedDate'])
-        publishedDate = datetime.strptime(
-            str.join(' ', result['publishedDate'].split(None)[0:5]),
-            "%a, %d %b %Y %H:%M:%S")
-        #utc_offset = timedelta(result['publishedDate'].split(None)[5])
-        # local = utc + offset
-        #publishedDate = publishedDate + utc_offset
+
+# Mon, 10 Mar 2014 16:26:15 -0700
+        publishedDate = parser.parse(result['publishedDate'])
 
         results.append({'url': result['unescapedUrl'],
                         'title': result['titleNoFormatting'],
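
[Note, not part of the patch] The strptime workaround removed above existed because Python 2's strptime cannot handle the %z offset directive; dateutil's parser takes the full RFC 2822 string, offset included, and returns a timezone-aware datetime. It also copes with the ISO 8601 timestamps used by the youtube change further down. A quick sketch, assuming python-dateutil is installed:

    from dateutil import parser

    # RFC 2822 date as returned by the Google News API
    print(parser.parse('Mon, 10 Mar 2014 16:26:15 -0700'))  # 2014-03-10 16:26:15-07:00
    # ISO 8601 timestamp as returned by the YouTube API
    print(parser.parse('2013-12-31T15:22:51.000Z'))         # 2013-12-31 15:22:51+00:00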
diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py
index a1e9df59c..53c8b07a7 100644
--- a/searx/engines/yahoo_news.py
+++ b/searx/engines/yahoo_news.py
@@ -6,6 +6,7 @@ from searx.engines.xpath import extract_text, extract_url
 from searx.engines.yahoo import parse_url
 from datetime import datetime, timedelta
 import re
+from dateutil import parser
 
 categories = ['news']
 search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}'
@@ -52,9 +53,7 @@ def response(resp):
                 - timedelta(hours=int(timeNumbers[0]))\
                 - timedelta(minutes=int(timeNumbers[1]))
         else:
-            # TODO year in string possible?
-            publishedDate = datetime.strptime(publishedDate,
-                                              "%b %d %H:%M%p")
+            publishedDate =parser.parse(publishedDate)
 
         if publishedDate.year == 1900:
             publishedDate = publishedDate.replace(year=datetime.now().year)
diff --git a/searx/engines/youtube.py b/searx/engines/youtube.py
index 5b04f3513..f6b08b330 100644
--- a/searx/engines/youtube.py
+++ b/searx/engines/youtube.py
@@ -1,5 +1,7 @@
 from json import loads
 from urllib import urlencode
+from dateutil import parser
+from datetime import datetime
 
 categories = ['videos']
 
@@ -35,6 +37,10 @@ def response(resp):
         content = ''
         thumbnail = ''
 
+#"2013-12-31T15:22:51.000Z"
+        pubdate = result['published']['$t']
+        publishedDate = parser.parse(pubdate)
+
         if result['media$group']['media$thumbnail']:
             thumbnail = result['media$group']['media$thumbnail'][0]['url']
             content += '<a href="{0}" title="{0}"><img src="{1}" /></a>'.format(url, thumbnail)  # noqa
@@ -48,6 +54,7 @@ def response(resp):
                         'title': title,
                         'content': content,
                         'template': 'videos.html',
+                        'publishedDate': publishedDate,
                         'thumbnail': thumbnail})
 
     return results
diff --git a/setup.py b/setup.py
index fff709f2f..79f2acc42 100644
--- a/setup.py
+++ b/setup.py
@@ -35,6 +35,7 @@ setup(
         'lxml',
         'pyyaml',
         'setuptools',
+        'python-dateutil',
     ],
     extras_require={
         'test': [

From 993271bed30e24c7ae1e0f63b64e030829206f27 Mon Sep 17 00:00:00 2001
From: Thomas Pointhuber
Date: Tue, 18 Mar 2014 15:56:22 +0100
Subject: [PATCH 4/4] extract publishDate from vimeo

---
 searx/engines/vimeo.py      | 6 ++++++
 searx/engines/yahoo_news.py | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py
index a95c75b49..d2d2a4dd0 100644
--- a/searx/engines/vimeo.py
+++ b/searx/engines/vimeo.py
@@ -2,6 +2,8 @@ from urllib import urlencode
 from HTMLParser import HTMLParser
 from lxml import html
 from xpath import extract_text
+from datetime import datetime
+from dateutil import parser
 
 base_url = 'http://vimeo.com'
 search_url = base_url + '/search?{query}'
@@ -10,6 +12,7 @@ content_xpath = None
 title_xpath = None
 results_xpath = ''
 content_tpl = '<a href="{0}"> <img src="{2}"/> </a>'
+publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
 
 # the cookie set by vimeo contains all the following values,
 # but only __utma seems to be requiered
@@ -40,9 +43,12 @@ def response(resp):
         url = base_url + result.xpath(url_xpath)[0]
         title = p.unescape(extract_text(result.xpath(title_xpath)))
         thumbnail = extract_text(result.xpath(content_xpath)[0])
+        publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
+
         results.append({'url': url,
                         'title': title,
                         'content': content_tpl.format(url, title, thumbnail),
                         'template': 'videos.html',
+                        'publishedDate': publishedDate,
                         'thumbnail': thumbnail})
     return results
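
[Note, not part of the patch] The new publishedDate_xpath selects the datetime attribute inside each result's meta paragraph; extract_text then yields a plain string for dateutil. A standalone sketch of the same idea using lxml directly; the HTML fragment below is invented for illustration, not taken from vimeo:

    from lxml import html
    from dateutil import parser

    fragment = '<div><p class="meta"><time datetime="2014-03-18T15:56:22+01:00">1 day ago</time></p></div>'
    result = html.fromstring(fragment)
    raw = result.xpath('.//p[@class="meta"]//attribute::datetime')[0]
    print(parser.parse(raw))  # 2014-03-18 15:56:22+01:00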
diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py
index 53c8b07a7..43da93ede 100644
--- a/searx/engines/yahoo_news.py
+++ b/searx/engines/yahoo_news.py
@@ -53,7 +53,7 @@ def response(resp):
                 - timedelta(hours=int(timeNumbers[0]))\
                 - timedelta(minutes=int(timeNumbers[1]))
         else:
-            publishedDate =parser.parse(publishedDate)
+            publishedDate = parser.parse(publishedDate)
 
         if publishedDate.year == 1900:
             publishedDate = publishedDate.replace(year=datetime.now().year)
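
[Note, not part of the patch] Once engines return timezone-aware datetimes (which dateutil produces whenever the input carries an offset), comparing them with the naive datetime.now() raises TypeError, and strftime('%z') renders the real offset instead of a hard-coded +0000. That is what the replace(tzinfo=None) calls and the %z change in the second patch address. A small sketch, assuming python-dateutil:

    from datetime import datetime
    from dateutil import parser

    aware = parser.parse('Mon, 10 Mar 2014 16:26:15 -0700')

    # an aware datetime knows its own offset, so %z works:
    print(aware.strftime('%a, %d %b %Y %H:%M:%S %z'))  # Mon, 10 Mar 2014 16:26:15 -0700

    # but it cannot be compared with a naive datetime.now():
    try:
        aware >= datetime.now()
    except TypeError as e:
        print(e)  # can't compare offset-naive and offset-aware datetimes

    # dropping tzinfo, as webapp.py now does, makes the comparison legal again
    # (False here only because the sample date lies in the past):
    print(aware.replace(tzinfo=None) >= datetime.now())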