From 617495cca8b2799945be2c2b042dcc7ce905741a Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Tue, 9 Jun 2015 16:16:07 +0200 Subject: [PATCH 1/3] Add a plugin to remove trackers from results URLs --- searx/plugins/__init__.py | 4 ++- searx/plugins/tracker_url_remover.py | 40 ++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 searx/plugins/tracker_url_remover.py diff --git a/searx/plugins/__init__.py b/searx/plugins/__init__.py index 5ac3f447c..d61eb6073 100644 --- a/searx/plugins/__init__.py +++ b/searx/plugins/__init__.py @@ -21,7 +21,8 @@ logger = logger.getChild('plugins') from searx.plugins import (https_rewrite, self_ip, - search_on_category_select) + search_on_category_select, + tracker_url_remover) required_attrs = (('name', str), ('description', str), @@ -73,3 +74,4 @@ plugins = PluginStore() plugins.register(https_rewrite) plugins.register(self_ip) plugins.register(search_on_category_select) +plugins.register(tracker_url_remover) diff --git a/searx/plugins/tracker_url_remover.py b/searx/plugins/tracker_url_remover.py new file mode 100644 index 000000000..c8459548f --- /dev/null +++ b/searx/plugins/tracker_url_remover.py @@ -0,0 +1,40 @@ +''' +searx is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +searx is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with searx. If not, see < http://www.gnu.org/licenses/ >. + +(C) 2015 by Adam Tauber, +''' + +from flask.ext.babel import gettext +import re + +re1 = re.compile(r'utm_[^&]+&?') +re2 = re.compile(r'(wkey|wemail)[^&]+&?') +re3 = re.compile(r'&$') +re4 = re.compile(r'^\?$') + +name = gettext('Tracker URL remover') +description = gettext('Remove trackers arguments from the returned URL') +default_on = True + + +def on_result(request, ctx): + url = ctx['result']['url'] + + url = re1.sub('', url) + url = re2.sub('', url) + url = re3.sub('', url) + url = re4.sub('', url) + + ctx['result']['url'] = url + return True From e93f5314d745ca389858fdf53f355d8c28928507 Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Mon, 15 Jun 2015 20:34:02 +0200 Subject: [PATCH 2/3] A bit of cleanup of the code - regexes in a array - regexes applied only on the last part of the url --- searx/plugins/tracker_url_remover.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/searx/plugins/tracker_url_remover.py b/searx/plugins/tracker_url_remover.py index c8459548f..f6ecc2126 100644 --- a/searx/plugins/tracker_url_remover.py +++ b/searx/plugins/tracker_url_remover.py @@ -18,10 +18,9 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >. from flask.ext.babel import gettext import re -re1 = re.compile(r'utm_[^&]+&?') -re2 = re.compile(r'(wkey|wemail)[^&]+&?') -re3 = re.compile(r'&$') -re4 = re.compile(r'^\?$') +regexes = {re.compile(r'utm_[^&]+&?'), + re.compile(r'(wkey|wemail)[^&]+&?'), + re.compile(r'&$')} name = gettext('Tracker URL remover') description = gettext('Remove trackers arguments from the returned URL') @@ -29,12 +28,17 @@ default_on = True def on_result(request, ctx): - url = ctx['result']['url'] + splited_url = ctx['result']['url'].split('?') - url = re1.sub('', url) - url = re2.sub('', url) - url = re3.sub('', url) - url = re4.sub('', url) + if len(splited_url) is not 2: + return True + + for reg in regexes: + splited_url[1] = reg.sub('', splited_url[1]) + + if splited_url[1] == "": + ctx['result']['url'] = splited_url[0] + else: + ctx['result']['url'] = splited_url[0] + '?' + splited_url[1] - ctx['result']['url'] = url return True From 8911233e3e1d1b6f1ed1c81577c012de57ca62b8 Mon Sep 17 00:00:00 2001 From: Cqoicebordel Date: Thu, 18 Jun 2015 14:27:15 +0200 Subject: [PATCH 3/3] Use parsed_url --- searx/plugins/tracker_url_remover.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/searx/plugins/tracker_url_remover.py b/searx/plugins/tracker_url_remover.py index f6ecc2126..ed71c94d3 100644 --- a/searx/plugins/tracker_url_remover.py +++ b/searx/plugins/tracker_url_remover.py @@ -17,6 +17,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >. from flask.ext.babel import gettext import re +from urlparse import urlunparse regexes = {re.compile(r'utm_[^&]+&?'), re.compile(r'(wkey|wemail)[^&]+&?'), @@ -28,17 +29,16 @@ default_on = True def on_result(request, ctx): - splited_url = ctx['result']['url'].split('?') + query = ctx['result']['parsed_url'].query - if len(splited_url) is not 2: + if query == "": return True for reg in regexes: - splited_url[1] = reg.sub('', splited_url[1]) + query = reg.sub('', query) - if splited_url[1] == "": - ctx['result']['url'] = splited_url[0] - else: - ctx['result']['url'] = splited_url[0] + '?' + splited_url[1] + if query != ctx['result']['parsed_url'].query: + ctx['result']['parsed_url'] = ctx['result']['parsed_url']._replace(query=query) + ctx['result']['url'] = urlunparse(ctx['result']['parsed_url']) return True