Source code for bw2io.chemidplus

import json
from numbers import Number
from pathlib import Path
from urllib.parse import quote_plus

import requests


[docs]
DIRPATH = Path(__file__).parent.resolve() / "data"




[docs]
def canonical_cas(s):
    """
    CAS numbers have up to ten digits; we remove zero padding and add hyphens where needed.

    Parameters
    ----------
    s : str
        CAS number.

    Returns
    -------
    str
        Canonical CAS number.
    """
    if isinstance(s, Number):
        # Remove ".0" from string conversion
        s = int(s)
    if s in ("None", None) or not s:
        return
    try:
        s = str(int(str(s).replace("-", "")))
    except ValueError:
        # Dirty data
        return
    # TODO: Verify check number?
    return "{}-{}-{}".format(s[:-3], s[-3:-1], s[-1])




[docs]
class Multiple(Exception):
    """
    Multiple results for given search query.

    Parameters
    ----------
    exception : Exception
        Exception to raise.
    """

    pass




[docs]
class Missing(Exception):
    """
    404 or other error code returned.

    Parameters
    ----------
    exception : Exception
        Exception to raise.
    """

    pass




[docs]
class ChemIDPlus:
    """
    Use the `ChemIDPlus <https://chem.nlm.nih.gov/api/swagger-ui.html#/SubstanceController>`__ API to lookup synonyms for chemicals, including pesticides.

    Always used to match against a master list. Seeded with names from ecoinvent.

    Attributes
    ----------
    api_cache : dict
        Dictionary with raw data from API, key is canonical name.
    master_mapping : dict
        Dictionary from synonyms, including canonical names, to master flows.
    forbidden_keys : set
        Identifiers that aren't unique in the ChemIDPlus system.

    Methods
    -------
    match(synonym, search=True)
        Match a synonym to a master flow.
    match_cas(number)
        Match a CAS number to a master flow.
    process_request(request)
        Process a request to the ChemIDPlus API.
    load_cache()
        Load the cache of API results.
    save_cache()
        Save the cache of API results.
    """


[docs]
    CAS_TEMPLATE = (
        "https://chem.nlm.nih.gov/api/data/search?data=complete&exp=rn%2Feq%2F{cas}"
    )


[docs]
    NAME_TEMPLATE = (
        "https://chem.nlm.nih.gov/api/data/search?data=complete&exp=na%2Feq%2F{name}"
    )


    def __init__(self):
        # Dictionary with raw data from API, key is canonical name

[docs]
        self.api_cache = {}

        # Dictionary from synonyms, including canonical names, to master flows

[docs]
        self.master_mapping = {}

        # Identifiers that aren't unique in the ChemIDPlus system

[docs]
        self.forbidden_keys = set()

        if (DIRPATH / "chemid_cache.json").is_file():
            self.load_cache()


[docs]
    def match(self, synonym, search=True):
        synonym = str(synonym).lower()
        if synonym in self.forbidden_keys:
            return False
        try:
            return self.master_mapping[synonym]
        except KeyError:
            if not search:
                return False
            result = self.process_request(
                requests.get(self.NAME_TEMPLATE.format(name=quote_plus(synonym)))
            )
            master = self.master_mapping.get(result["canonical"].lower())
            if master:
                self.master_mapping[synonym] = master
                return master
            else:
                return False



[docs]
    def match_cas(self, number):
        return self.master_mapping[canonical_cas(number)]



[docs]
    def add_master_term(self, term, CAS):
        term = str(term).lower()
        if term in self.master_mapping.values():
            return
        try:
            result = self.process_request(
                requests.get(self.NAME_TEMPLATE.format(name=quote_plus(term)))
            )
        except (Missing, Multiple):
            if CAS:
                result = result = self.process_request(
                    requests.get(self.CAS_TEMPLATE.format(cas=quote_plus(CAS)))
                )
            else:
                raise Missing
        self.master_mapping[result["canonical"].lower()] = term
        self.api_cache[result["canonical"]] = result
        if result.get("CAS"):
            self.master_mapping[result["CAS"]] = term
        for synonym in result["synonyms"]:
            if synonym in self.master_mapping:
                self.forbidden_keys.add(synonym.lower())
                del self.master_mapping[synonym.lower()]
            else:
                self.master_mapping[synonym.lower()] = term



[docs]
    def save_cache(self):
        data = {
            "master_mapping": self.master_mapping,
            "api_cache": self.api_cache,
            "forbidden_keys": list(self.forbidden_keys),
        }
        json.dump(
            data, open(DIRPATH / "chemid_cache.json", "w"), ensure_ascii=False, indent=2
        )



[docs]
    def load_cache(self):
        data = json.load(open(DIRPATH / "chemid_cache.json"))
        self.forbidden_keys = set(data["forbidden_keys"])
        self.master_mapping = {k.lower(): v for k, v in data["master_mapping"].items()}
        self.api_cache = data["api_cache"]



[docs]
    def process_request(self, response):
        if not response.status_code == 200:
            raise Missing
        data = response.json()
        if not data["total"]:
            raise Missing
        elif not data["total"] == 1:
            raise Multiple
        data = data["results"][0]
        return {
            "CAS": data["summary"].get("rn"),
            "canonical": data["summary"]["na"],
            "synonyms": sorted(
                [
                    elem["d"]
                    for obj in data["names"]
                    for elem in obj["e"]
                    if obj["t"] == 616
                ]
            ),
        }