searxng/searx/engines/pubmed.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""PubMed (Scholar publications)

"""

from datetime import datetime
from urllib.parse import urlencode

from lxml import etree
from searx.network import get
from searx.utils import (
    eval_xpath_getindex,
    eval_xpath_list,
    extract_text,
)

# Engine metadata: upstream website, Wikidata entry, API documentation and
# the format of the results returned by the API.
about = {
    "website": "https://www.ncbi.nlm.nih.gov/pubmed/",
    "wikidata_id": "Q1540899",
    "official_api_documentation": {
        "url": "https://www.ncbi.nlm.nih.gov/home/develop/api/",
        "comment": "More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/",
    },
    "use_official_api": True,
    "require_api_key": False,
    "results": "XML",
}

categories = ["science", "scientific publications"]

# URL template of the search request; ``request()`` fills in the url-encoded
# ``{query}`` and the paging placeholders ``{offset}`` / ``{hits}``.
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&{query}&retstart={offset}&retmax={hits}"

# Engine dependent config.
# Page size: used both as ``retmax`` and to compute ``retstart`` from the page number.
number_of_results = 10
# Prefix of the result links; the PMID of an article is appended to it.
pubmed_url = "https://www.ncbi.nlm.nih.gov/pubmed/"


def request(query, params):
    """Build the search URL for *query* and store it in ``params['url']``.

    The 1-based page number in ``params['pageno']`` is converted into the
    zero-based ``retstart`` offset; every page holds ``number_of_results``
    hits.  The (mutated) *params* dict is returned.
    """
    page_index = params['pageno'] - 1

    params['url'] = base_url.format(
        query=urlencode({'term': query}),
        offset=page_index * number_of_results,
        hits=number_of_results,
    )
    return params


def _element_text(element):
    """Return the whitespace-normalised text of *element*, nested markup included.

    ``element.text`` only yields the text in front of the first child element,
    so inline markup such as ``<i>`` or ``<sub>`` would truncate the value (or
    make it ``None``).  An empty string is returned when *element* is ``None``.
    """
    if element is None:
        return ''
    return ' '.join(''.join(element.itertext()).split())


def _author_names(medline):
    """Collect the display names of all authors below *medline*.

    A name is "ForeName LastName"; authors carrying neither fall back to their
    ``CollectiveName``.  Authors for which no name can be built are skipped so
    the list never contains empty strings.
    """
    names = []
    for author in eval_xpath_list(medline, './Article/AuthorList/Author'):
        fore_name = _element_text(eval_xpath_getindex(author, './ForeName', 0, default=None))
        last_name = _element_text(eval_xpath_getindex(author, './LastName', 0, default=None))
        name = (fore_name + ' ' + last_name).strip()
        if not name:
            name = _element_text(eval_xpath_getindex(author, './CollectiveName', 0, default=None))
        if name:
            names.append(name)
    return names


def _accepted_date(entry):
    """Return the "accepted" date of *entry* as ``datetime`` or ``None``.

    ``None`` is returned when the article has no accepted date or when the
    date is incomplete / not a valid calendar date: the publication date is
    optional metadata and must never make the whole response fail.
    """
    accepted = eval_xpath_getindex(
        entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None
    )
    if accepted is None:
        return None

    # A missing part becomes '' which strptime rejects with ValueError below.
    parts = [
        _element_text(eval_xpath_getindex(accepted, './' + tag, 0, default=None)) for tag in ('Year', 'Month', 'Day')
    ]
    try:
        return datetime.strptime('-'.join(parts), '%Y-%m-%d')
    except ValueError:
        return None


def response(resp):
    """Turn the search response into a list of ``paper.html`` results.

    *resp* holds the XML answer of the search request which only lists PMIDs.
    The records of these PMIDs are fetched with a second HTTP request and each
    ``PubmedArticle`` of that answer becomes one result dict.  An empty list
    is returned (without a second request) when the search found nothing.
    """
    results = []

    # URL template used to retrieve the full record of every PMID found
    pubmed_retrieve_api_url = (
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + 'db=pubmed&retmode=xml&id={pmids_string}'
    )

    pmids_results = etree.XML(resp.content)
    pmids = [item.text for item in pmids_results.xpath('//eSearchResult/IdList/Id') if item.text]
    if not pmids:
        # nothing found: do not send a request with an empty id list
        return results

    retrieve_url_encoded = pubmed_retrieve_api_url.format(pmids_string=','.join(pmids))

    search_results_response = get(retrieve_url_encoded).content
    search_results = etree.XML(search_results_response)
    for entry in eval_xpath_list(search_results, '//PubmedArticle'):
        medline = eval_xpath_getindex(entry, './MedlineCitation', 0)

        title = _element_text(eval_xpath_getindex(medline, './/Article/ArticleTitle', 0))
        pmid = eval_xpath_getindex(medline, './/PMID', 0).text
        # NOTE(review): only the first text node of the abstract is used, so
        # abstracts with several sections or inline markup look truncated --
        # confirm whether that is intended.
        content = extract_text(
            eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True
        )
        doi = extract_text(
            eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True
        )
        journal = extract_text(
            eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True
        )
        issn = extract_text(
            eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True
        )

        res_dict = {
            'template': 'paper.html',
            'url': pubmed_url + pmid,
            'title': title,
            'content': content or "",
            'journal': journal,
            # keep the list free of None when the journal has no ISSN
            'issn': [issn] if issn else [],
            'authors': _author_names(medline),
            'doi': doi,
        }

        published_date = _accepted_date(entry)
        if published_date is not None:
            res_dict['publishedDate'] = published_date

        results.append(res_dict)

    return results
[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 11:31:25 +01:00			`# SPDX-License-Identifier: AGPL-3.0-or-later`
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 07:45:08 +01:00			`"""PubMed (Scholar publications)`

[add] pubmed engine 2017-09-22 22:09:33 +02:00			`"""`

			`from datetime import datetime`
Drop Python 2 (1/n): remove unicode string and url_utils 2020-08-06 17:42:46 +02:00			`from urllib.parse import urlencode`
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 07:45:08 +01:00
			`from lxml import etree`
[httpx] replace searx.poolrequests by searx.network settings.yml: * outgoing.networks: * can contains network definition * propertiers: enable_http, verify, http2, max_connections, max_keepalive_connections, keepalive_expiry, local_addresses, support_ipv4, support_ipv6, proxies, max_redirects, retries * retries: 0 by default, number of times searx retries to send the HTTP request (using different IP & proxy each time) * local_addresses can be "192.168.0.1/24" (it supports IPv6) * support_ipv4 & support_ipv6: both True by default see https://github.com/searx/searx/pull/1034 * each engine can define a "network" section: * either a full network description * either reference an existing network * all HTTP requests of engine use the same HTTP configuration (it was not the case before, see proxy configuration in master) 2021-04-05 10:43:33 +02:00			`from searx.network import get`
Science category: update the engines * use the paper.html template * fetch more data from the engines * add crossref.py 2022-08-26 18:10:12 +02:00			`from searx.utils import (`
			`eval_xpath_getindex,`
			`eval_xpath_list,`
			`extract_text,`
			`)`
[add] pubmed engine 2017-09-22 22:09:33 +02:00
[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 11:31:25 +01:00			`# about`
			`about = {`
			`"website": 'https://www.ncbi.nlm.nih.gov/pubmed/',`
			`"wikidata_id": 'Q1540899',`
			`"official_api_documentation": {`
			`'url': 'https://www.ncbi.nlm.nih.gov/home/develop/api/',`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 09:26:22 +01:00			`'comment': 'More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/',`
[enh] engines: add about variable move meta information from comment to the about variable so the preferences, the documentation can show these information 2021-01-13 11:31:25 +01:00			`},`
			`"use_official_api": True,`
			`"require_api_key": False,`
			`"results": 'XML',`
			`}`
[add] pubmed engine 2017-09-22 22:09:33 +02:00
Science category: update the engines * use the paper.html template * fetch more data from the engines * add crossref.py 2022-08-26 18:10:12 +02:00			`categories = ['science', 'scientific publications']`
[add] pubmed engine 2017-09-22 22:09:33 +02:00
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 09:26:22 +01:00			`base_url = (`
			`'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'`
			`)`
[add] pubmed engine 2017-09-22 22:09:33 +02:00
			`# engine dependent config`
			`number_of_results = 10`
			`pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'`


			`def request(query, params):`
			`# basic search`
			`offset = (params['pageno'] - 1) * number_of_results`

[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 07:45:08 +01:00			`string_args = {`
			`'query': urlencode({'term': query}),`
			`'offset': offset,`
			`'hits': number_of_results,`
			`}`
[add] pubmed engine 2017-09-22 22:09:33 +02:00
			`params['url'] = base_url.format(**string_args)`

			`return params`


[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 07:45:08 +01:00			`def response(resp): # pylint: disable=too-many-locals`
[add] pubmed engine 2017-09-22 22:09:33 +02:00			`results = []`

			`# First retrieve notice of each result`
[format.python] initial formatting of the python code This patch was generated by black [1]:: make format.python [1] https://github.com/psf/black Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2021-12-27 09:26:22 +01:00			`pubmed_retrieve_api_url = (`
			`'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + 'db=pubmed&retmode=xml&id={pmids_string}'`
			`)`
[add] pubmed engine 2017-09-22 22:09:33 +02:00
minor fixes of pubmed engine Closes #1045 2017-11-01 14:20:47 +01:00			`pmids_results = etree.XML(resp.content)`
[add] pubmed engine 2017-09-22 22:09:33 +02:00			`pmids = pmids_results.xpath('//eSearchResult/IdList/Id')`
			`pmids_string = ''`

			`for item in pmids:`
			`pmids_string += item.text + ','`

[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 07:45:08 +01:00			`retrieve_notice_args = {'pmids_string': pmids_string}`
[add] pubmed engine 2017-09-22 22:09:33 +02:00
			`retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)`

Science category: update the engines * use the paper.html template * fetch more data from the engines * add crossref.py 2022-08-26 18:10:12 +02:00			`search_results_response = get(retrieve_url_encoded).content`
			`search_results = etree.XML(search_results_response)`
			`for entry in eval_xpath_list(search_results, '//PubmedArticle'):`
			`medline = eval_xpath_getindex(entry, './MedlineCitation', 0)`
[add] pubmed engine 2017-09-22 22:09:33 +02:00
Science category: update the engines * use the paper.html template * fetch more data from the engines * add crossref.py 2022-08-26 18:10:12 +02:00			`title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text`
			`pmid = eval_xpath_getindex(medline, './/PMID', 0).text`
[add] pubmed engine 2017-09-22 22:09:33 +02:00			`url = pubmed_url + pmid`
Science category: update the engines * use the paper.html template * fetch more data from the engines * add crossref.py 2022-08-26 18:10:12 +02:00			`content = extract_text(`
			`eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True`
			`)`
			`doi = extract_text(`
			`eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True`
			`)`
			`journal = extract_text(`
			`eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True`
			`)`
			`issn = extract_text(`
			`eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True`
			`)`
			`authors = []`
			`for author in eval_xpath_list(medline, './Article/AuthorList/Author'):`
			`f = eval_xpath_getindex(author, './ForeName', 0, default=None)`
			`l = eval_xpath_getindex(author, './LastName', 0, default=None)`
			`f = '' if f is None else f.text`
			`l = '' if l is None else l.text`
			`authors.append((f + ' ' + l).strip())`

			`res_dict = {`
			`'template': 'paper.html',`
			`'url': url,`
			`'title': title,`
[fix] pubmed content being None 2023-09-14 12:41:15 +02:00			`'content': content or "",`
Science category: update the engines * use the paper.html template * fetch more data from the engines * add crossref.py 2022-08-26 18:10:12 +02:00			`'journal': journal,`
			`'issn': [issn],`
			`'authors': authors,`
			`'doi': doi,`
			`}`

			`accepted_date = eval_xpath_getindex(`
			`entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None`
			`)`
			`if accepted_date is not None:`
			`year = eval_xpath_getindex(accepted_date, './Year', 0)`
			`month = eval_xpath_getindex(accepted_date, './Month', 0)`
			`day = eval_xpath_getindex(accepted_date, './Day', 0)`
			`try:`
			`publishedDate = datetime.strptime(`
			`year.text + '-' + month.text + '-' + day.text,`
			`'%Y-%m-%d',`
			`)`
			`res_dict['publishedDate'] = publishedDate`
[mod] pylint all engines without PYLINT_SEARXNG_DISABLE_OPTION Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> 2024-03-11 07:45:08 +01:00			`except Exception as e: # pylint: disable=broad-exception-caught`
Science category: update the engines * use the paper.html template * fetch more data from the engines * add crossref.py 2022-08-26 18:10:12 +02:00			`print(e)`
[fix] do not crash if publication date is missing in pubmed engine 2017-12-01 20:48:10 +01:00
[add] pubmed engine 2017-09-22 22:09:33 +02:00			`results.append(res_dict)`

Science category: update the engines * use the paper.html template * fetch more data from the engines * add crossref.py 2022-08-26 18:10:12 +02:00			`return results`