diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py
index 081b69d3e..7e545d5e7 100644
--- a/searx/engines/__init__.py
+++ b/searx/engines/__init__.py
@@ -123,6 +123,46 @@ def highlight_content(content, query):
     return content
 
 
+def score_results(results):
+    flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
+    flat_len = len(flat_res)
+    engines_len = len(results)
+    results = []
+    # deduplication + scoring
+    for i,res in enumerate(flat_res):
+        res['parsed_url'] = urlparse(res['url'])
+        res['engines'] = [res['engine']]
+        weight = 1.0
+        if hasattr(engines[res['engine']], 'weight'):
+            weight = float(engines[res['engine']].weight)
+        elif res['engine'] in settings.weights:
+            weight = float(settings.weights[res['engine']])
+        score = int((flat_len - i)/engines_len)*weight+1
+        duplicated = False
+        for new_res in results:
+            p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path
+            p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path
+            if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
+               p1 == p2 and\
+               res['parsed_url'].query == new_res['parsed_url'].query and\
+               res.get('template') == new_res.get('template'):
+                duplicated = new_res
+                break
+        if duplicated:
+            if len(res.get('content', '')) > len(duplicated.get('content', '')):
+                duplicated['content'] = res['content']
+            duplicated['score'] += score
+            duplicated['engines'].append(res['engine'])
+            if duplicated['parsed_url'].scheme == 'https':
+                continue
+            elif res['parsed_url'].scheme == 'https':
+                duplicated['url'] = res['parsed_url'].geturl()
+                duplicated['parsed_url'] = res['parsed_url']
+        else:
+            res['score'] = score
+            results.append(res)
+    return sorted(results, key=itemgetter('score'), reverse=True)
+
 def search(query, request, selected_engines):
     global engines, categories, number_of_searches
     requests = []
@@ -165,43 +205,8 @@ def search(query, request, selected_engines):
     for engine_name,engine_results in results.items():
         engines[engine_name].stats['search_count'] += 1
         engines[engine_name].stats['result_count'] += len(engine_results)
-    flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
-    flat_len = len(flat_res)
-    engines_len = len(selected_engines)
-    results = []
-    # deduplication + scoring
-    for i,res in enumerate(flat_res):
-        res['parsed_url'] = urlparse(res['url'])
-        res['engines'] = [res['engine']]
-        weight = 1.0
-        if hasattr(engines[res['engine']], 'weight'):
-            weight = float(engines[res['engine']].weight)
-        elif res['engine'] in settings.weights:
-            weight = float(settings.weights[res['engine']])
-        score = int((flat_len - i)/engines_len)*weight+1
-        duplicated = False
-        for new_res in results:
-            p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path
-            p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path
-            if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
-               p1 == p2 and\
-               res['parsed_url'].query == new_res['parsed_url'].query and\
-               res.get('template') == new_res.get('template'):
-                duplicated = new_res
-                break
-        if duplicated:
-            if len(res.get('content', '')) > len(duplicated.get('content', '')):
-                duplicated['content'] = res['content']
-            duplicated['score'] += score
-            duplicated['engines'].append(res['engine'])
-            if duplicated['parsed_url'].scheme == 'https':
-                continue
-            elif res['parsed_url'].scheme == 'https':
-                duplicated['url'] = res['parsed_url'].geturl()
-                duplicated['parsed_url'] = res['parsed_url']
-        else:
-            res['score'] = score
-            results.append(res)
+
+    results = score_results(results)
 
     for result in results:
         if 'content' in result:
@@ -209,7 +214,7 @@ def search(query, request, selected_engines):
         for res_engine in result['engines']:
            engines[result['engine']].stats['score_count'] += result['score']
 
-    return sorted(results, key=itemgetter('score'), reverse=True)
+    return results
 
 def get_engines_stats():
     pageloads = []
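
For reference: the extracted score_results() helper interleaves the per-engine result lists round-robin, assigns each position a rank-based score multiplied by the engine's weight, merges results whose netloc, path (trailing slash ignored), query and template match (keeping the longer content snippet, summing scores and preferring the https URL), and returns the list sorted by score. The standalone sketch below only illustrates the interleaving and rank-scoring step; the engine names and result dicts are made up, and it uses Python 3's zip_longest rather than the Python 2 izip_longest used in the patch.

# Minimal sketch (Python 3, hypothetical data) of the interleave + rank-score idea.
from itertools import chain, zip_longest

results = {
    'engine_a': [{'url': 'https://example.com/', 'engine': 'engine_a'},
                 {'url': 'http://example.org/a', 'engine': 'engine_a'}],
    'engine_b': [{'url': 'http://example.com/', 'engine': 'engine_b'}],
}

# Round-robin interleave of the per-engine lists; zip_longest pads the
# shorter lists with None, which is filtered out here.
flat_res = [r for r in chain.from_iterable(zip_longest(*results.values())) if r]
flat_len = len(flat_res)
engines_len = len(results)

for i, res in enumerate(flat_res):
    weight = 1.0  # the real code reads this from the engine module or settings.weights
    score = int((flat_len - i) / engines_len) * weight + 1
    print(res['url'], score)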