From a1d9c81915b169272cf26139445f3e08e9b689b9 Mon Sep 17 00:00:00 2001 From: Robin Schneider Date: Tue, 31 Dec 2019 14:24:27 +0100 Subject: [PATCH 01/10] Fix Nginx subdir URL install docs which allowed download of settings.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes: #1617 There is an issue with the setup example in https://asciimoo.github.io/searx/dev/install/installation.html#installation for subdirectory URL deployments: ```nginx root /usr/local/searx; location = /searx { rewrite ^ /searx/; } location /searx { try_files $uri @searx; } location @searx { uwsgi_param SCRIPT_NAME /searx; include uwsgi_params; uwsgi_modifier1 30; uwsgi_pass unix:/run/uwsgi/app/searx/socket; } ``` `try_files` causes Nginx to search for files in the server root first. If it matches a file, it is returned. Only if no file matches is the request passed to uwsgi. The worst consequence I can think of is that `settings.yml` can be downloaded without authentication (where secrets and configuration details are stored). To fix this, I propose: ```nginx location = /searx { rewrite ^ /searx/; } location /searx/static { } location /searx { uwsgi_param SCRIPT_NAME /searx; include uwsgi_params; uwsgi_pass unix:/run/uwsgi/app/searx/socket; } ``` And add ``` route-run = fixpathinfo: ``` to `/etc/uwsgi/apps-available/searx.ini` because `uwsgi_modifier1 30` is apparently deprecated. Ref: https://uwsgi-docs.readthedocs.io/en/latest/Changelog-2.0.11.html#fixpathinfo-routing-action I assume this issue exists because some uwsgi upstream docs also use the `try_files` construct (at least I have seen this somewhere in the docs or somewhere else on the Internet but cannot find it again right now). https://uwsgi-docs.readthedocs.io/en/latest/Nginx.html#hosting-multiple-apps-in-the-same-process-aka-managing-script-name-and-path-info also warns about this: > If used incorrectly a configuration like this may cause security problems. 
For your sanity’s sake, double-triple-quadruple check that your application files, configuration files and any other sensitive files are outside of the root of the static files. --- docs/admin/installation.rst | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/admin/installation.rst b/docs/admin/installation.rst index 239ce0704..28a6b0614 100644 --- a/docs/admin/installation.rst +++ b/docs/admin/installation.rst @@ -114,6 +114,9 @@ content: # Module to import module = searx.webapp + # Support running the module from a webserver subdirectory. + route-run = fixpathinfo: + # Virtualenv and python path virtualenv = /usr/local/searx/searx-ve/ pythonpath = /usr/local/searx/ @@ -180,14 +183,16 @@ Add this configuration in the server config file .. code:: nginx - location = /searx { rewrite ^ /searx/; } - location /searx { - try_files $uri @searx; + location = /searx { + rewrite ^ /searx/; } - location @searx { + + location /searx/static { + } + + location /searx { uwsgi_param SCRIPT_NAME /searx; include uwsgi_params; - uwsgi_modifier1 30; uwsgi_pass unix:/run/uwsgi/app/searx/socket; } @@ -338,4 +343,3 @@ References * How to: `Setup searx in a couple of hours with a free SSL certificate `__ - From 088337295aaeebf8a37d6b4e859cd59019cd3d27 Mon Sep 17 00:00:00 2001 From: Robin Schneider Date: Tue, 31 Dec 2019 14:37:01 +0100 Subject: [PATCH 02/10] Simplify Nginx example by using alias directive for subdirectory URL We explicitly specify the static directory here using alias to allow hosting from a subdirectory other than "searx" which just so happens to match the source code directory. --- docs/admin/installation.rst | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/admin/installation.rst b/docs/admin/installation.rst index 28a6b0614..e0b3779fa 100644 --- a/docs/admin/installation.rst +++ b/docs/admin/installation.rst @@ -183,11 +183,8 @@ Add this configuration in the server config file .. 
code:: nginx - location = /searx { - rewrite ^ /searx/; - } - location /searx/static { + alias /usr/local/searx/searx/static; } location /searx { From 3e5a3ee4e49c739fdc464d47252c684a42620d48 Mon Sep 17 00:00:00 2001 From: Robin Schneider Date: Tue, 31 Dec 2019 14:38:30 +0100 Subject: [PATCH 03/10] Let Nginx deliver static files directly in all examples --- docs/admin/installation.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/admin/installation.rst b/docs/admin/installation.rst index e0b3779fa..15800fc01 100644 --- a/docs/admin/installation.rst +++ b/docs/admin/installation.rst @@ -154,7 +154,10 @@ content: server { listen 80; server_name searx.example.com; - root /usr/local/searx; + root /usr/local/searx/searx; + + location /static { + } location / { include uwsgi_params; @@ -199,6 +202,10 @@ in case of single-user or low-traffic instances.) .. code:: nginx + location /searx/static { + alias /usr/local/searx/searx/static; + } + location /searx { proxy_pass http://127.0.0.1:8888; proxy_set_header Host $host; From a1b85571a25d67b752bf6072255b928866be9c4f Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 2 Jan 2020 22:28:18 +0100 Subject: [PATCH 04/10] [fix] tmp suspend insecure engines --- searx/settings.yml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/searx/settings.yml b/searx/settings.yml index 2a2d2bf87..2777f9caa 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -79,9 +79,10 @@ engines: categories : science timeout : 4.0 - - name : base - engine : base - shortcut : bs +# tmp suspended: dh key too small +# - name : base +# engine : base +# shortcut : bs - name : wikipedia engine : wikipedia @@ -552,10 +553,11 @@ engines: timeout : 10.0 disabled : True - - name : scanr structures - shortcut: scs - engine : scanr_structures - disabled : True +# tmp suspended: bad certificate +# - name : scanr structures +# shortcut: scs +# engine : scanr_structures +# disabled : True - name : 
soundcloud engine : soundcloud From 2292e6e130dca104cb324197b63611a012e4ef3c Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 2 Jan 2020 22:28:47 +0100 Subject: [PATCH 05/10] [fix] handle missing result size --- searx/engines/bing.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/searx/engines/bing.py b/searx/engines/bing.py index ed0b87dbd..24776c400 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -63,6 +63,8 @@ def response(resp): results = [] result_len = 0 + + dom = html.fromstring(resp.text) # parse results for result in eval_xpath(dom, '//div[@class="sa_cc"]'): @@ -89,8 +91,7 @@ def response(resp): 'content': content}) try: - result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]/text()')) - result_len_container = utils.to_string(result_len_container) + result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()')) if "-" in result_len_container: # Remove the part "from-to" for paginated request ... 
result_len_container = result_len_container[result_len_container.find("-") * 2 + 2:] @@ -102,7 +103,7 @@ def response(resp): logger.debug('result error :\n%s', e) pass - if _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len: + if result_len and _get_offset_from_pageno(resp.search_params.get("pageno", 0)) > result_len: return [] results.append({'number_of_results': result_len}) From 2dc2e1e8f9c8ae0d28df56f42b2f4949d8611624 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 2 Jan 2020 22:29:10 +0100 Subject: [PATCH 06/10] [fix] skip invalid encoded attributes --- searx/engines/flickr_noapi.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py index 198ac2cff..e1abb378f 100644 --- a/searx/engines/flickr_noapi.py +++ b/searx/engines/flickr_noapi.py @@ -109,14 +109,22 @@ def response(resp): else: url = build_flickr_url(photo['ownerNsid'], photo['id']) - results.append({'url': url, - 'title': title, - 'img_src': img_src, - 'thumbnail_src': thumbnail_src, - 'content': content, - 'author': author, - 'source': source, - 'img_format': img_format, - 'template': 'images.html'}) + result = { + 'url': url, + 'img_src': img_src, + 'thumbnail_src': thumbnail_src, + 'source': source, + 'img_format': img_format, + 'template': 'images.html' + } + try: + result['author'] = author.encode('utf-8') + result['title'] = title.encode('utf-8') + result['content'] = content.encode('utf-8') + except: + result['author'] = '' + result['title'] = '' + result['content'] = '' + results.append(result) return results From 86a378bd0109684bd45c917f94068e3c98441904 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 2 Jan 2020 22:29:28 +0100 Subject: [PATCH 07/10] [fix] handle missing thumbnail --- searx/engines/ina.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/searx/engines/ina.py b/searx/engines/ina.py index 37a05f099..ea509649f 100644 --- 
a/searx/engines/ina.py +++ b/searx/engines/ina.py @@ -32,7 +32,7 @@ base_url = 'https://www.ina.fr' search_url = base_url + '/layout/set/ajax/recherche/result?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}' # specific xpath variables -results_xpath = '//div[contains(@class,"search-results--list")]/div[@class="media"]' +results_xpath = '//div[contains(@class,"search-results--list")]//div[@class="media-body"]' url_xpath = './/a/@href' title_xpath = './/h3[@class="h3--title media-heading"]' thumbnail_xpath = './/img/@src' @@ -65,8 +65,11 @@ def response(resp): videoid = result.xpath(url_xpath)[0] url = base_url + videoid title = p.unescape(extract_text(result.xpath(title_xpath))) - thumbnail = extract_text(result.xpath(thumbnail_xpath)[0]) - if thumbnail[0] == '/': + try: + thumbnail = extract_text(result.xpath(thumbnail_xpath)[0]) + except: + thumbnail = '' + if thumbnail and thumbnail[0] == '/': thumbnail = base_url + thumbnail d = extract_text(result.xpath(publishedDate_xpath)[0]) d = d.split('/') From 1e6253ce16346fc6f439a07211b56770d06ba225 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 2 Jan 2020 22:29:55 +0100 Subject: [PATCH 08/10] [fix] handle empty response --- searx/engines/microsoft_academic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/searx/engines/microsoft_academic.py b/searx/engines/microsoft_academic.py index 9387b08d0..9bac0069c 100644 --- a/searx/engines/microsoft_academic.py +++ b/searx/engines/microsoft_academic.py @@ -45,6 +45,8 @@ def request(query, params): def response(resp): results = [] response_data = loads(resp.text) + if not response_data: + return results for result in response_data['results']: url = _get_url(result) From ad5bb994b1cff56c4f021f88bfa62f38055f1416 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 2 Jan 2020 22:30:18 +0100 Subject: [PATCH 09/10] [fix] add py3 compatibility --- searx/engines/scanr_structures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/searx/engines/scanr_structures.py b/searx/engines/scanr_structures.py index 72fd2b3c9..7208dcb70 100644 --- a/searx/engines/scanr_structures.py +++ b/searx/engines/scanr_structures.py @@ -29,7 +29,7 @@ def request(query, params): params['url'] = search_url params['method'] = 'POST' params['headers']['Content-type'] = "application/json" - params['data'] = dumps({"query": query, + params['data'] = dumps({"query": query.decode('utf-8'), "searchField": "ALL", "sortDirection": "ASC", "sortOrder": "RELEVANCY", From 17b6faa4c3c1cf14a327f4a3538fc70dce08b756 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Thu, 2 Jan 2020 22:37:06 +0100 Subject: [PATCH 10/10] [fix] pep8 --- searx/engines/bing.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 24776c400..b193f7c60 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -63,8 +63,6 @@ def response(resp): results = [] result_len = 0 - - dom = html.fromstring(resp.text) # parse results for result in eval_xpath(dom, '//div[@class="sa_cc"]'):