Source code for bw2io.export.ecospold1

from datetime import datetime
from os import times
from pathlib import Path
from typing import Dict, Union

import numpy as np
from lxml import etree
from stats_arrays.distributions import (
    LognormalUncertainty,
    NormalUncertainty,
    NoUncertainty,
    TriangularUncertainty,
    UndefinedUncertainty,
    UniformUncertainty,
)

from .. import __version__ as version


# Qualified name of the ``schemaLocation`` attribute in the XML Schema instance
# namespace; used as the attribute key on the ``ecoSpold`` root element.
attr_qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", "schemaLocation")


# Namespace map handed to the root element: EcoSpold01 is the default
# (un-prefixed) namespace, ``xsi`` is the XML Schema instance namespace.
nsmap = {
    None: "http://www.EcoInvent.org/EcoSpold01",
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
}


# The package version may be exposed as a tuple; the exporter writes it into
# the ``generator`` attribute, so normalise it to a dotted string here.
if isinstance(version, tuple):
    version = ".".join([str(x) for x in version])




[docs]
def bool_to_text(b: Union[bool, str]) -> str:
    """Translate a boolean-like value into the XML literal ``"true"`` or ``"false"``.

    Accepted "true" inputs are ``True`` and the strings ``yes``/``Yes``/
    ``true``/``True``. Accepted "false" inputs are ``False``, ``None``, the
    empty string and the strings ``False``/``false``/``No``/``no``.

    Raises:
        ValueError: If ``b`` is not one of the accepted values.
    """
    # Checked in order: the "true" spellings first, then the "false" ones.
    spellings = (
        ("true", (True, "yes", "Yes", "true", "True")),
        ("false", (False, None, "", "False", "false", "No", "no")),
    )
    for text, accepted in spellings:
        if b in accepted:
            return text
    raise ValueError(f"Can't convert {b} to boolean string")




[docs]
def stripper(obj: str, prefix: str) -> str:
    """Return ``obj`` with a leading ``prefix`` removed.

    Only a prefix at the very start of ``obj`` is dropped; when ``obj`` does
    not start with ``prefix`` it is returned unchanged.
    """
    has_prefix = obj.startswith(prefix)
    return obj[len(prefix) :] if has_prefix else obj




[docs]
def pretty_number(val: float) -> str:
    """Format ``val`` as a compact decimal string.

    Values whose magnitude lies strictly between ``1e-2`` and ``1e2`` are
    written in positional notation; everything else (including zero) uses
    scientific notation. Both forms are limited to 6 digits of precision and
    keep at least one digit after the decimal point (``trim="0"``).
    """
    magnitude = abs(val)
    formatter = (
        np.format_float_positional
        if 1e-2 < magnitude and magnitude < 1e2
        else np.format_float_scientific
    )
    return formatter(val, precision=6, trim="0")




[docs]
class Ecospold1Exporter:
    """Export one or more datasets to Ecospold1 XML.

    The combination of `Ecospold1DataExtractor` and `Ecospold1Exporter` does
    not give perfect roundtrip data flow, especially if data is coming from
    closed-source software with unspecified behaviour. The following
    differences have been observed:

    * This class uses an updated Ecospold1 set of XSDs from https://github.com/sami-m-g/pyecospold/tree/main/pyecospold/schemas/v1
    * The dataset `generator` field is different (`bw2io`)
    * The dataset `number` is not preserved
    * Number formatting is different. We round numbers like `10.2000000000000011` to `10.2`, and always keep at least one decimal point.
    * We always include an `uncertaintyType` for exchanges
    * We always include an `infrastructureProcess` for exchanges (default `false`)
    * The field `dataGeneratorAndPublication` is not used consistently - we always fill this with dummy data.
    * We eliminate duplicate identical `person` elements
    * `person` and `source` elements can be renumbered, but references are kept correct

    """

    def __init__(self, schema_location: Union[str, None] = None):
        """Create the empty ``ecoSpold`` root element.

        Args:
            schema_location: Value of the root ``schemaLocation`` attribute.
                When falsy, the URL of the pyecospold ``EcoSpold01Dataset.xsd``
                is used instead.
        """
        self.root = etree.Element(
            "ecoSpold",
            {
                attr_qname: schema_location
                or "https://raw.githubusercontent.com/sami-m-g/pyecospold/main/pyecospold/schemas/v1/EcoSpold01Dataset.xsd"
            },
            nsmap=nsmap,
        )
        # Number of datasets added so far; becomes each dataset's ``number``.
        self.count = 0

    def add_dataset(self, node: dict) -> None:
        """Append one ``dataset`` element built from ``node`` to the root.

        ``node`` must provide ``name`` and ``unit``, and every entry of
        ``node["exchanges"]`` must provide ``amount`` and ``type``. The keys
        ``tags``, ``comments``, ``location``, ``references`` and ``authors``
        are optional and replaced by defaults when missing.

        Args:
            node: Dataset description as a dictionary.

        Raises:
            ValueError: If an exchange ``type`` is not ``technosphere``,
                ``production``, ``substitution`` or ``biosphere``, or if a
                boolean-like value cannot be converted by ``bool_to_text``.
        """
        self.count += 1
        tags = dict(node.get("tags", []))
        timestamp = tags.get("ecoSpold01timestamp", datetime.now().isoformat())

        dataset = etree.SubElement(
            self.root,
            "dataset",
            attrib={
                "validCompanyCodes": "CompanyCodes.xml",
                "validRegionalCodes": "RegionalCodes.xml",
                "validCategories": "Categories.xml",
                "validUnits": "Units.xml",
                # Can't guarantee that datasets come from same source
                # so input numbers aren't useful.
                # We reset the exchange numbers as well.
                # They can't be used in any case as they aren't implemented
                # consistently by different LCA software.
                "number": str(self.count),
                "timestamp": timestamp,
                "generator": f"bw2io {version}",
            },
        )
        meta_information = etree.SubElement(
            dataset,
            "metaInformation",
        )

        category = tags.get("ecoSpold01category", "")
        subcategory = tags.get("ecoSpold01subCategory", "")
        comments = node.get("comments", {})

        process_information = etree.SubElement(meta_information, "processInformation")
        etree.SubElement(
            process_information,
            "referenceFunction",
            attrib={
                "datasetRelatesToProduct": bool_to_text(
                    tags.get("ecoSpold01datasetRelatesToProduct", True)
                ),
                "name": node["name"],
                "localName": tags.get("ecoSpold01localName", node["name"]),
                "infrastructureProcess": bool_to_text(
                    tags.get("ecoSpold01infrastructureProcess")
                ),
                # This makes no sense, this number is defined in the relevant exchange
                # "Within the ecoinvent quality network the amount of the reference flow always equals 1."
                "amount": "1",
                "unit": node["unit"],
                "category": category,
                "subCategory": subcategory,
                "localCategory": tags.get("ecoSpold01localCategory", category),
                "localSubCategory": tags.get("ecoSpold01localSubCategory", subcategory),
                "includedProcesses": comments.get("includedProcesses", ""),
                "generalComment": comments.get("generalComment", ""),
                "infrastructureIncluded": bool_to_text(
                    tags.get("ecoSpold01infrastructureIncluded")
                ),
            },
        )
        etree.SubElement(
            process_information,
            "geography",
            attrib={
                "location": node.get("location", "GLO"),
                "text": stripper(comments.get("location", ""), "Location: "),
            },
        )
        etree.SubElement(
            process_information,
            "technology",
            attrib={"text": stripper(comments.get("technology", ""), "Technology: ")},
        )
        time_period = etree.SubElement(
            process_information,
            "timePeriod",
            attrib={
                "text": stripper(comments.get("timePeriod", ""), "Time period: "),
                "dataValidForEntirePeriod": bool_to_text(
                    tags.get("ecoSpold01dataValidForEntirePeriod", True)
                ),
            },
        )
        start = etree.SubElement(time_period, "startDate")
        start.text = tags.get("ecoSpold01startDate", "1970-01-01")
        end = etree.SubElement(time_period, "endDate")
        end.text = tags.get("ecoSpold01endDate", "1970-01-01")
        etree.SubElement(
            process_information,
            "dataSetInformation",
            attrib={
                "type": str(tags.get("ecoSpold01type", "1")),
                "impactAssessmentResult": bool_to_text(
                    tags.get("ecoSpold01impactAssessmentResult")
                ),
                "timestamp": timestamp,
                "version": tags.get("ecoSpold01version", "0.0"),
                "internalVersion": tags.get("ecoSpold01internalVersion", "0.0"),
                "energyValues": str(tags.get("ecoSpold01energyValues", "0")),
                "languageCode": tags.get("ecoSpold01languageCode", "en"),
                "localLanguageCode": tags.get("ecoSpold01localLanguageCode", "de"),
            },
        )
        m_and_v = etree.SubElement(meta_information, "modellingAndValidation")
        etree.SubElement(
            m_and_v,
            "representativeness",
            attrib={
                "productionVolume": stripper(
                    comments.get("productionVolume", "unknown"), "Production volume: "
                ),
                "samplingProcedure": stripper(
                    comments.get("sampling", "unknown"), "Sampling: "
                ),
                "extrapolations": stripper(
                    comments.get("extrapolations", "unknown"), "Extrapolations: "
                ),
                "uncertaintyAdjustments": stripper(
                    comments.get("uncertaintyAdjustments", "unknown"),
                    "Uncertainty adjustments: ",
                ),
            },
        )

        # Source type label -> EcoSpold01 ``sourceType`` code. The labels are
        # matched verbatim against the incoming data, spelling included.
        SOURCE_MAP: Dict[str, str] = {
            "Undefined (default)": "0",
            "Article": "1",
            "Chapters in anthology": "2",
            "Seperate publication": "3",
            "Measurement on site": "4",
            "Oral communication": "5",
            "Personal written communication": "6",
            "Questionnaries": "7",
        }

        # XML attribute name -> key in the reference dictionary.
        SOURCE_FIELDS = {
            "nameOfEditors": "editors",
            "pageNumbers": "pages",
            "year": "year",
            "title": "title",
            "titleOfAnthology": "anthology",
            "placeOfPublications": "place_of_publication",
            "publisher": "publisher",
            "journal": "journal",
            "volumeNo": "volume",
            "issueNo": "issue",
            "text": "text",
        }

        for index, source in enumerate(node.get("references", [])):
            # A missing or empty author list is exported as an empty first author.
            authors = source.get("authors") or [""]
            etree.SubElement(
                m_and_v,
                "source",
                attrib={
                    "number": str(source.get("identifier", index + 1)),
                    "sourceType": SOURCE_MAP.get(source.get("type"), "0"),
                    "firstAuthor": authors[0],
                    "additionalAuthors": authors[1] if len(authors) > 1 else "",
                }
                | {
                    k: str(source.get(v))
                    for k, v in SOURCE_FIELDS.items()
                    if source.get(v)
                },
            )

        # Identifier of the person who entered the data; defaults to 1.
        data_entry_person = str(
            node.get("authors", {}).get("data_entry", {}).get("identifier", 1)
        )

        admin = etree.SubElement(meta_information, "administrativeInformation")
        etree.SubElement(
            admin,
            "dataEntryBy",
            attrib={
                # Reference to a ``person`` element. This must not depend on
                # the ``references`` loop above, which may not have run.
                "person": data_entry_person,
                "qualityNetwork": "1",
            },
        )
        etree.SubElement(
            admin,
            "dataGeneratorAndPublication",
            attrib={
                # NOTE(review): reuses the data entry identifier; confirm
                # whether a separate data generator identifier is available.
                "person": data_entry_person,
                "dataPublishedIn": "1",
                "referenceToPublishedSource": "1",
                "accessRestrictedTo": "0",
                "copyright": "true",
            },
        )

        # (key in person dictionary, XML attribute name, default value)
        PERSON_FIELDS = [
            ("identifier", "number", "1"),
            ("address", "address", ""),
            ("company", "companyCode", ""),
            ("country", "countryCode", ""),
            ("email", "email", ""),
            ("name", "name", ""),
        ]

        for person in node.get("authors", {}).get("people", []):
            etree.SubElement(
                admin,
                "person",
                attrib={b: str(person.get(a, c)) for a, b, c in PERSON_FIELDS},
            )

        # Top-level biosphere categories that mark a flow as an input from nature.
        RESOURCES = {
            "natural resource",
            "natural resources",
            "resource",
            "resources",
            "raw",
        }

        # ``stats_arrays`` distribution id -> EcoSpold01 ``uncertaintyType`` code.
        UNCERTAINTY_MAPPING = {
            None: "0",
            NoUncertainty.id: "0",
            UndefinedUncertainty.id: "0",
            LognormalUncertainty.id: "1",
            NormalUncertainty.id: "2",
            TriangularUncertainty.id: "3",
            UniformUncertainty.id: "4",
        }

        # XML attribute name -> key in the exchange dictionary.
        EXCHANGE_FIELDS = {
            "generalComment": "comment",
            "CASNumber": "CAS number",
            "location": "location",
            "formula": "chemical formula",
            "referenceToSource": "source_reference",
            "pageNumbers": "pages",
        }

        flow_data = etree.SubElement(dataset, "flowData")
        for index, exc in enumerate(node.get("exchanges", [])):
            # ``categories`` may be absent; treat that the same as empty.
            categories = exc.get("categories") or ()
            uncertainty_type = exc.get("uncertainty type")

            # Resolve the input/output group first so an unsupported exchange
            # type fails before an ``exchange`` element is attached.
            exchange_type = exc["type"]
            if exchange_type == "technosphere":
                group_tag, group_code = "inputGroup", "5"
            elif exchange_type == "production":
                group_tag, group_code = "outputGroup", "0"
            elif exchange_type == "substitution":
                group_tag, group_code = "outputGroup", "1"
            elif exchange_type == "biosphere":
                top_category = categories[0] if categories else ""
                if (top_category or "").lower() in RESOURCES:
                    # Input group 4 is "FromNature" (5 is "FromTechnosphere").
                    group_tag, group_code = "inputGroup", "4"
                else:
                    group_tag, group_code = "outputGroup", "4"
            else:
                raise ValueError("Can't map exchange type {}".format(exc["type"]))

            # Attribute values must be strings, hence the ``str`` on optional fields.
            attrs = {
                "number": str(index + 1),
                "unit": str(exc.get("unit")),
                "name": exc.get("name", ""),
                "meanValue": pretty_number(exc["amount"]),
                "infrastructureProcess": bool_to_text(exc.get("infrastructureProcess")),
            } | {k: str(exc[v]) for k, v in EXCHANGE_FIELDS.items() if exc.get(v)}

            # Always written; missing or unmapped distributions become "0".
            attrs["uncertaintyType"] = UNCERTAINTY_MAPPING.get(uncertainty_type, "0")

            if categories and categories[0]:
                attrs["category"] = categories[0]
            if len(categories) > 1 and categories[1]:
                attrs["subCategory"] = categories[1]

            if uncertainty_type == LognormalUncertainty.id and exc.get("scale"):
                attrs["standardDeviation95"] = pretty_number(np.exp(exc["scale"]) ** 2)
            elif uncertainty_type == NormalUncertainty.id and exc.get("scale"):
                attrs["standardDeviation95"] = pretty_number(exc["scale"] * 2)

            # Compare against None so that a bound of exactly 0 is kept.
            if exc.get("minimum") is not None:
                attrs["minValue"] = pretty_number(exc["minimum"])
            if exc.get("maximum") is not None:
                attrs["maxValue"] = pretty_number(exc["maximum"])

            exc_element = etree.SubElement(
                flow_data,
                "exchange",
                attrib=attrs,
            )
            elem = etree.SubElement(exc_element, group_tag)
            elem.text = group_code

    @property
    def bytes(self) -> bytes:
        """The whole document serialised as pretty-printed UTF-8 XML with declaration."""
        return etree.tostring(
            self.root, encoding="utf-8", xml_declaration=True, pretty_print=True
        )

    def __repr__(self) -> str:
        """Return the serialised XML document as text."""
        return self.bytes.decode("utf-8")

    def write_to_file(self, filepath: Path) -> None:
        """Write the serialised XML document to ``filepath``, replacing any existing file."""
        with open(filepath, "wb") as f:
            f.write(self.bytes)