From c3daa08537668c24224fffecbed4347fee936fcf Mon Sep 17 00:00:00 2001
From: a01200356 <a01200356@itesm.mx>
Date: Thu, 19 May 2016 00:38:43 -0500
Subject: [PATCH] [enh] Add onions category with Ahmia, Not Evil and Torch

Xpath engine and results template changed to account for the fact that
archive.org doesn't cache .onions, though some onion engines might have
their own cache.

Disabled by default. Can be enabled by pointing the SOCKS proxies at
wherever Tor is listening and setting using_tor_proxy to True.

Requires Tor and updating packages.

To avoid manually adding the timeout on each engine, you can set
extra_proxy_timeout to account for Tor's (or whatever proxy used) extra
time.
---
 .gitignore                                    |   1 +
 searx/engines/__init__.py                     |  14 +-
 searx/engines/ahmia.py                        |  82 ++++++++++++
 searx/engines/not_evil.py                     |  64 +++++++++
 searx/engines/xpath.py                        |  36 +++++-
 searx/settings.yml                            |  31 ++++-
 .../legacy/result_templates/default.html      |   7 +-
 searx/templates/oscar/macros.html             |  12 +-
 searx/webapp.py                               |   1 +
 tests/unit/engines/test_xpath.py              | 121 ++++++++++++++++++
 tests/unit/test_engines_init.py               |  44 +++++++
 11 files changed, 399 insertions(+), 14 deletions(-)
 create mode 100644 searx/engines/ahmia.py
 create mode 100644 searx/engines/not_evil.py
 create mode 100644 tests/unit/engines/test_xpath.py
 create mode 100644 tests/unit/test_engines_init.py

diff --git a/.gitignore b/.gitignore
index e56a575ab..b1286ea66 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ setup.cfg
 *.pyc
 */*.pyc
 *~
+*.swp
 
 /node_modules
 
diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 9cdca47b7..00be89412 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -142,6 +142,17 @@ def load_engine(engine_data):
         engine.stats['page_load_time'] = 0
         engine.stats['page_load_count'] = 0
 
+    # tor related settings
+    if settings['outgoing'].get('using_tor_proxy'):
+        # use onion url if using tor.
+        if hasattr(engine, 'onion_url'):
+            engine.search_url = engine.onion_url + getattr(engine, 'search_path', '')
+    elif 'onions' in engine.categories:
+        # exclude onion engines if not using tor.
+        return None
+
+    engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0)
+
     for category_name in engine.categories:
         categories.setdefault(category_name, []).append(engine)
 
@@ -252,8 +263,9 @@ def get_engines_stats(preferences):
 
 
 def load_engines(engine_list):
-    global engines
+    global engines, engine_shortcuts
     engines.clear()
+    engine_shortcuts.clear()
     for engine_data in engine_list:
         engine = load_engine(engine_data)
         if engine is not None:
diff --git a/searx/engines/ahmia.py b/searx/engines/ahmia.py
new file mode 100644
index 000000000..d9fcc6ca7
--- /dev/null
+++ b/searx/engines/ahmia.py
@@ -0,0 +1,82 @@
+"""
+ Ahmia (Onions)
+
+ @website      http://msydqstlz2kzerdg.onion
+ @provides-api no
+
+ @using-api    no
+ @results      HTML
+ @stable       no
+ @parse        url, title, content
+"""
+
+from urllib.parse import urlencode, urlparse, parse_qs
+from lxml.html import fromstring
+from searx.engines.xpath import extract_url, extract_text
+
+# engine config
+categories = ['onions']
+paging = True
+page_size = 10
+
+# search url
+search_url = 'http://msydqstlz2kzerdg.onion/search/?{query}'
+time_range_support = True
+time_range_dict = {'day': 1,
+                   'week': 7,
+                   'month': 30}
+
+# xpaths
+results_xpath = '//li[@class="result"]'
+url_xpath = './h4/a/@href'
+title_xpath = './h4/a[1]'
+content_xpath = './/p[1]'
+correction_xpath = '//*[@id="didYouMean"]//a'
+number_of_results_xpath = '//*[@id="totalResults"]'
+
+
+def request(query, params):
+    # build the search url; a recognised time range appends the 'd' parameter
+    params['url'] = search_url.format(query=urlencode({'q': query}))
+    days = time_range_dict.get(params['time_range'])
+    if days is not None:
+        params['url'] = '{}&{}'.format(params['url'], urlencode({'d': days}))
+    return params
+
+
+def response(resp):
+    results = []
+    dom = fromstring(resp.text)
+
+    # trim results to the requested page so there are not too many at once
+    first_result_index = page_size * (resp.search_params.get('pageno', 1) - 1)
+    all_results = dom.xpath(results_xpath)
+    trimmed_results = all_results[first_result_index:first_result_index + page_size]
+
+    # get results
+    for result in trimmed_results:
+        # remove ahmia url and extract the actual url for the result
+        raw_url = extract_url(result.xpath(url_xpath), search_url)
+        cleaned_url = parse_qs(urlparse(raw_url).query).get('redirect_url', [''])[0]
+
+        title = extract_text(result.xpath(title_xpath))
+        content = extract_text(result.xpath(content_xpath))
+
+        results.append({'url': cleaned_url,
+                        'title': title,
+                        'content': content,
+                        'is_onion': True})
+
+    # get spelling corrections
+    for correction in dom.xpath(correction_xpath):
+        results.append({'correction': extract_text(correction)})
+
+    # get number of results (optional: skipped when the text is not an integer)
+    number_of_results = dom.xpath(number_of_results_xpath)
+    if number_of_results:
+        try:
+            results.append({'number_of_results': int(extract_text(number_of_results))})
+        except (TypeError, ValueError):
+            pass
+
+    return results
diff --git a/searx/engines/not_evil.py b/searx/engines/not_evil.py
new file mode 100644
index 000000000..e84f153bd
--- /dev/null
+++ b/searx/engines/not_evil.py
@@ -0,0 +1,64 @@
+"""
+ not Evil (Onions)
+
+ @website     http://hss3uro2hsxfogfq.onion
+ @provide-api yes (http://hss3uro2hsxfogfq.onion/api.htm)
+
+ @using-api   no
+ @results     HTML
+ @stable      no
+ @parse       url, title, content
+"""
+
+from urllib.parse import urlencode
+from lxml import html
+from searx.engines.xpath import extract_text
+
+# engine dependent config
+categories = ['onions']
+paging = True
+page_size = 20
+
+# search-url
+base_url = 'http://hss3uro2hsxfogfq.onion/'
+search_url = 'index.php?{query}&hostLimit=20&start={pageno}&numRows={page_size}'
+
+# specific xpath variables
+results_xpath = '//*[@id="content"]/div/p'
+url_xpath = './span[1]'
+title_xpath = './a[1]'
+content_xpath = './text()'
+
+
+# do search-request
+def request(query, params):
+    # the {pageno} placeholder is filled with a result offset, not a page number
+    first_result = (params['pageno'] - 1) * page_size
+    path = search_url.format(query=urlencode({'q': query}),
+                             pageno=first_result,
+                             page_size=page_size)
+    params['url'] = base_url + path
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    # needed because otherwise requests guesses wrong encoding
+    resp.encoding = 'utf8'
+    dom = html.fromstring(resp.text)
+
+    # parse results; nodes without a url span or title link are skipped
+    for result in dom.xpath(results_xpath):
+        url_nodes = result.xpath(url_xpath)
+        title_nodes = result.xpath(title_xpath)
+        if not url_nodes or not title_nodes:
+            continue
+
+        results.append({'url': extract_text(url_nodes[0]),
+                        'title': extract_text(title_nodes[0]),
+                        'content': extract_text(result.xpath(content_xpath)),
+                        'is_onion': True})
+
+    return results
diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py
index a269253d7..81c2747fb 100644
--- a/searx/engines/xpath.py
+++ b/searx/engines/xpath.py
@@ -10,6 +10,8 @@ thumbnail_xpath = False
 paging = False
 suggestion_xpath = ''
 results_xpath = ''
+cached_xpath = ''
+cached_url = ''
 
 # parameters for engines with paging support
 #
@@ -36,6 +38,8 @@ def request(query, params):
 def response(resp):
     results = []
     dom = html.fromstring(resp.text)
+    is_onion = True if 'onions' in categories else False
+
     if results_xpath:
         for result in eval_xpath(dom, results_xpath):
             url = extract_url(eval_xpath(result, url_xpath), search_url)
@@ -49,15 +53,33 @@ def response(resp):
                 if len(thumbnail_xpath_result) > 0:
                     tmp_result['img_src'] = extract_url(thumbnail_xpath_result, search_url)
 
+            # add alternative cached url if available
+            if cached_xpath:
+                tmp_result['cached_url'] = cached_url + extract_text(result.xpath(cached_xpath))
+
+            if is_onion:
+                tmp_result['is_onion'] = True
+
             results.append(tmp_result)
     else:
-        for url, title, content in zip(
-            (extract_url(x, search_url) for
-             x in eval_xpath(dom, url_xpath)),
-            map(extract_text, eval_xpath(dom, title_xpath)),
-            map(extract_text, eval_xpath(dom, content_xpath))
-        ):
-            results.append({'url': url, 'title': title, 'content': content})
+        if cached_xpath:
+            for url, title, content, cached in zip(
+                (extract_url(x, search_url) for
+                 x in dom.xpath(url_xpath)),
+                map(extract_text, dom.xpath(title_xpath)),
+                map(extract_text, dom.xpath(content_xpath)),
+                map(extract_text, dom.xpath(cached_xpath))
+            ):
+                results.append({'url': url, 'title': title, 'content': content,
+                                'cached_url': cached_url + cached, 'is_onion': is_onion})
+        else:
+            for url, title, content in zip(
+                (extract_url(x, search_url) for
+                 x in dom.xpath(url_xpath)),
+                map(extract_text, dom.xpath(title_xpath)),
+                map(extract_text, dom.xpath(content_xpath))
+            ):
+                results.append({'url': url, 'title': title, 'content': content, 'is_onion': is_onion})
 
     if not suggestion_xpath:
         return results
diff --git a/searx/settings.yml b/searx/settings.yml
index b23f48b45..54352bbfc 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -60,8 +60,10 @@ outgoing: # communication with search engines
 # see http://docs.python-requests.org/en/latest/user/advanced/#proxies
 # SOCKS proxies are also supported: see http://requests.readthedocs.io/en/master/user/advanced/#socks
 #    proxies :
-#        http : http://127.0.0.1:8080
-#        https: http://127.0.0.1:8080
+#        http : socks5h://127.0.0.1:9050
+#        https: socks5h://127.0.0.1:9050
+#    using_tor_proxy : True
+#    extra_proxy_timeout : 10.0 # Extra seconds to add in order to account for the time taken by the proxy
 # uncomment below section only if you have more than one network interface
 # which can be the source of outgoing search requests
 #    source_ips:
@@ -89,6 +91,12 @@ engines:
     shortcut: apkm
     disabled: True
 
+# Requires Tor
+  - name : ahmia
+    engine : ahmia
+    categories : onions
+    shortcut : ah
+
   - name : arch linux wiki
     engine : archlinux
     shortcut : al
@@ -185,7 +193,7 @@ engines:
   - name : deviantart
     engine : deviantart
     shortcut : da
-    timeout: 3.0
+    timeout : 3.0
 
   - name : ddg definitions
     engine : duckduckgo_definitions
@@ -514,6 +522,11 @@ engines:
     timeout: 5.0
     shortcut : npm
 
+# Requires Tor
+  - name : not evil
+    engine : not_evil
+    shortcut : ne
+
   - name : nyaa
     engine : nyaa
     shortcut : nt
@@ -698,6 +711,18 @@ engines:
     url: https://torrentz2.eu/
     timeout : 3.0
 
+# Requires Tor
+  - name : torch
+    engine : xpath
+    paging : True
+    search_url : http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and
+    results_xpath : //table//tr
+    url_xpath : ./td[2]/a
+    title_xpath : ./td[2]/b
+    content_xpath : ./td[2]/small
+    categories : onions
+    shortcut : tch
+
   - name : twitter
     engine : twitter
     shortcut : tw
diff --git a/searx/templates/legacy/result_templates/default.html b/searx/templates/legacy/result_templates/default.html
index 13e2d2913..78bf031df 100644
--- a/searx/templates/legacy/result_templates/default.html
+++ b/searx/templates/legacy/result_templates/default.html
@@ -1,6 +1,11 @@
 <div class="result {{ result.class }}{% for e in result.engines %} {{ e }}{% endfor %}">
     <h3 class="result_title">{% if "icon_"~result.engine~".ico" in favicons %}<img width="14" height="14" class="favicon" src="{{ url_for('static', filename='img/icons/icon_'+result.engine+'.ico') }}" alt="{{result.engine}}" />{% endif %}<a href="{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ result.title|safe }}</a></h3>
-    <p class="url">{{ result.pretty_url }}&lrm; <a class="cache_link" href="https://web.archive.org/web/{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
+    <p class="url">{{ result.pretty_url }}&lrm;
+    {% if result.cached_url %}
+        <a class="cache_link" href="{{ result.cached_url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
+    {% elif not result.is_onion %}
+        <a class="cache_link" href="https://web.archive.org/web/{{ result.url }}" {% if results_on_new_tab %}target="_blank" rel="noopener noreferrer"{% else %}rel="noreferrer"{% endif %}>{{ _('cached') }}</a>
+    {% endif %}
     {% if result.publishedDate %}<span class="published_date">{{ result.publishedDate }}</span>{% endif %}</p>
     <p class="content">{% if result.img_src %}<img src="{{ image_proxify(result.img_src) }}" class="image" />{% endif %}{% if result.content %}{{ result.content|safe }}<br class="last"/>{% endif %}</p>
 </div>
diff --git a/searx/templates/oscar/macros.html b/searx/templates/oscar/macros.html
index f52d9713c..57a90aaa2 100644
--- a/searx/templates/oscar/macros.html
+++ b/searx/templates/oscar/macros.html
@@ -32,7 +32,11 @@
             <span class="label label-default">{{ engine }}</span>
         {%- endfor -%}
         {%- if result.url -%}
-        <small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}</small>
+            {% if result.cached_url %}
+            <small>{{ result_link(result.cached_url, icon('link') + _('cached'), "text-info", id) }}</small>
+            {% elif not result.is_onion %}
+            <small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}</small>
+            {% endif %}
         {%- endif -%}
         {%- if proxify -%}
         <small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info", id) }}</small>
@@ -50,7 +54,11 @@
         <span class="label label-default">{{ engine }}</span>
     {%- endfor %}
     {%- if result.url -%}
-    <small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}</small>
+        {% if result.cached_url %}
+        <small>{{ result_link(result.cached_url, icon('link') + _('cached'), "text-info", id) }}</small>
+        {% elif not result.is_onion %}
+        <small>{{ result_link("https://web.archive.org/web/" + result.url, icon('link') + _('cached'), "text-info", id) }}</small>
+        {% endif %}
     {%- endif -%}
     {% if proxify -%}
     <small>{{ result_link(proxify(result.url), icon('sort') + _('proxied'), "text-info", id) }}</small>
diff --git a/searx/webapp.py b/searx/webapp.py
index cf9a09778..609669b85 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -146,6 +146,7 @@ _category_names = (gettext('files'),
                    gettext('it'),
                    gettext('news'),
                    gettext('map'),
+                   gettext('onions'),
                    gettext('science'))
 
 outgoing_proxies = settings['outgoing'].get('proxies') or None
diff --git a/tests/unit/engines/test_xpath.py b/tests/unit/engines/test_xpath.py
new file mode 100644
index 000000000..963a44a25
--- /dev/null
+++ b/tests/unit/engines/test_xpath.py
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+from collections import defaultdict
+import mock
+from searx.engines import xpath
+from searx.testing import SearxTestCase
+
+
+class TestXpathEngine(SearxTestCase):
+
+    def test_request(self):
+        xpath.search_url = 'https://url.com/{query}'
+        xpath.categories = []
+        xpath.paging = False
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        params = xpath.request(query, dicto)
+        self.assertIn('url', params)
+        self.assertEqual('https://url.com/test_query', params['url'])
+
+        xpath.search_url = 'https://url.com/q={query}&p={pageno}'
+        xpath.paging = True
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        dicto['pageno'] = 1
+        params = xpath.request(query, dicto)
+        self.assertIn('url', params)
+        self.assertEqual('https://url.com/q=test_query&p=1', params['url'])
+
+    def test_response(self):
+        # without results_xpath
+        xpath.url_xpath = '//div[@class="search_result"]//a[@class="result"]/@href'
+        xpath.title_xpath = '//div[@class="search_result"]//a[@class="result"]'
+        xpath.content_xpath = '//div[@class="search_result"]//p[@class="content"]'
+
+        self.assertRaises(AttributeError, xpath.response, None)
+        self.assertRaises(AttributeError, xpath.response, [])
+        self.assertRaises(AttributeError, xpath.response, '')
+        self.assertRaises(AttributeError, xpath.response, '[]')
+
+        response = mock.Mock(text='<html></html>')
+        self.assertEqual(xpath.response(response), [])
+
+        html = u"""
+        <div>
+            <div class="search_result">
+                <a class="result" href="https://result1.com">Result 1</a>
+                <p class="content">Content 1</p>
+                <a class="cached" href="https://cachedresult1.com">Cache</a>
+            </div>
+            <div class="search_result">
+                <a class="result" href="https://result2.com">Result 2</a>
+                <p class="content">Content 2</p>
+                <a class="cached" href="https://cachedresult2.com">Cache</a>
+            </div>
+        </div>
+        """
+        response = mock.Mock(text=html)
+        results = xpath.response(response)
+        self.assertIsInstance(results, list)
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['title'], 'Result 1')
+        self.assertEqual(results[0]['url'], 'https://result1.com/')
+        self.assertEqual(results[0]['content'], 'Content 1')
+        self.assertEqual(results[1]['title'], 'Result 2')
+        self.assertEqual(results[1]['url'], 'https://result2.com/')
+        self.assertEqual(results[1]['content'], 'Content 2')
+
+        # with cached urls, without results_xpath
+        xpath.cached_xpath = '//div[@class="search_result"]//a[@class="cached"]/@href'
+        results = xpath.response(response)
+        self.assertIsInstance(results, list)
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['cached_url'], 'https://cachedresult1.com')
+        self.assertEqual(results[1]['cached_url'], 'https://cachedresult2.com')
+        self.assertFalse(results[0].get('is_onion', False))
+
+        # results are onion urls (no results_xpath)
+        xpath.categories = ['onions']
+        results = xpath.response(response)
+        self.assertTrue(results[0]['is_onion'])
+
+        # with results_xpath
+        xpath.results_xpath = '//div[@class="search_result"]'
+        xpath.url_xpath = './/a[@class="result"]/@href'
+        xpath.title_xpath = './/a[@class="result"]'
+        xpath.content_xpath = './/p[@class="content"]'
+        xpath.cached_xpath = None
+        xpath.categories = []
+
+        self.assertRaises(AttributeError, xpath.response, None)
+        self.assertRaises(AttributeError, xpath.response, [])
+        self.assertRaises(AttributeError, xpath.response, '')
+        self.assertRaises(AttributeError, xpath.response, '[]')
+
+        response = mock.Mock(text='<html></html>')
+        self.assertEqual(xpath.response(response), [])
+
+        response = mock.Mock(text=html)
+        results = xpath.response(response)
+        self.assertIsInstance(results, list)
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['title'], 'Result 1')
+        self.assertEqual(results[0]['url'], 'https://result1.com/')
+        self.assertEqual(results[0]['content'], 'Content 1')
+        self.assertEqual(results[1]['title'], 'Result 2')
+        self.assertEqual(results[1]['url'], 'https://result2.com/')
+        self.assertEqual(results[1]['content'], 'Content 2')
+
+        # with cached urls, with results_xpath
+        xpath.cached_xpath = './/a[@class="cached"]/@href'
+        results = xpath.response(response)
+        self.assertIsInstance(results, list)
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['cached_url'], 'https://cachedresult1.com')
+        self.assertEqual(results[1]['cached_url'], 'https://cachedresult2.com')
+        self.assertFalse(results[0].get('is_onion', False))
+
+        # results are onion urls (with results_xpath)
+        xpath.categories = ['onions']
+        results = xpath.response(response)
+        self.assertTrue(results[0]['is_onion'])
diff --git a/tests/unit/test_engines_init.py b/tests/unit/test_engines_init.py
new file mode 100644
index 000000000..cf4d50309
--- /dev/null
+++ b/tests/unit/test_engines_init.py
@@ -0,0 +1,44 @@
+from searx.testing import SearxTestCase
+from searx import settings, engines
+
+
+class TestEnginesInit(SearxTestCase):
+
+    @classmethod
+    def tearDownClass(cls):
+        settings['outgoing']['using_tor_proxy'] = False
+        settings['outgoing']['extra_proxy_timeout'] = 0
+
+    def test_initialize_engines_default(self):
+        engine_list = [{'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1'},
+                       {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2'}]
+
+        engines.initialize_engines(engine_list)
+        self.assertEqual(len(engines.engines), 2)
+        self.assertIn('engine1', engines.engines)
+        self.assertIn('engine2', engines.engines)
+
+    def test_initialize_engines_exclude_onions(self):
+        settings['outgoing']['using_tor_proxy'] = False
+        engine_list = [{'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1', 'categories': 'general'},
+                       {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2', 'categories': 'onions'}]
+
+        engines.initialize_engines(engine_list)
+        self.assertEqual(len(engines.engines), 1)
+        self.assertIn('engine1', engines.engines)
+        self.assertNotIn('onions', engines.categories)
+
+    def test_initialize_engines_include_onions(self):
+        settings['outgoing']['using_tor_proxy'] = True
+        settings['outgoing']['extra_proxy_timeout'] = 100.0
+        engine_list = [{'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1', 'categories': 'general',
+                        'timeout': 20.0, 'onion_url': 'http://engine1.onion'},
+                       {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2', 'categories': 'onions'}]
+
+        engines.initialize_engines(engine_list)
+        self.assertEqual(len(engines.engines), 2)
+        self.assertIn('engine1', engines.engines)
+        self.assertIn('engine2', engines.engines)
+        self.assertIn('onions', engines.categories)
+        self.assertIn('http://engine1.onion', engines.engines['engine1'].search_url)
+        self.assertEqual(engines.engines['engine1'].timeout, 120.0)