Source code for neuromaps_mouse.datasets.utils

"""Functions for working with datasets."""

import os
import json
import shutil
import importlib.resources
from pathlib import Path

try:
    # nilearn 0.10.3
    from nilearn.datasets._utils import fetch_single_file as _fetch_file, _md5_sum_file
except ImportError:
    from nilearn.datasets.utils import _fetch_file, _md5_sum_file


[docs]def get_data_dir(data_dir=None):
    if data_dir is None:
        data_dir = os.environ.get(
            "MOUSEMAPS_DATA", str(Path.home() / "neuromaps-mouse-data")
        )
    data_dir = Path(data_dir).expanduser()
    data_dir.mkdir(parents=True, exist_ok=True)
    return data_dir


def _load_resource_json(relative_path):
    """
    Load JSON file from package resources.

    Parameters
    ----------
    relative_path : str
        Path to JSON file relative to package resources

    Returns
    -------
    resource_json : dict
        JSON file loaded as a dictionary
    """
    # handling pkg_resources.resource_filename deprecation
    if getattr(importlib.resources, "files", None) is not None:
        f_resource = importlib.resources.files("neuromaps_mouse") / relative_path
    else:
        from pkg_resources import resource_filename

        f_resource = resource_filename("neuromaps_mouse", relative_path)

    with open(f_resource) as src:
        resource_json = json.load(src)

    return resource_json


MOUSEMAPS_ATLASES = _load_resource_json("datasets/data/atlases.json")["atlases"]
MOUSEMAPS_ANNOTS = _load_resource_json("datasets/data/annotations.json")["annotations"]
MOUSEMAPS_ANNOTS_META = _load_resource_json("datasets/data/annotations-meta.json")[
    "annotations-meta"
]


def _osfify_url(osf_file_id):
    return f"https://osf.io/download/{osf_file_id}/"


def fetch_files(annotations, file_type="annotations", data_dir=None, verbose=1):
    targ_fname_list = []
    for annot in annotations:
        if file_type in ["annotations", "annotations-meta"]:
            targ_path = Path(data_dir) / "annotations"
        elif file_type == "atlases":
            targ_path = Path(data_dir) / "atlases"
        else:
            raise ValueError(f"Unknown file_type={file_type}")

        targ_fname = targ_path / annot["rel_path"] / annot["fname"]
        if targ_fname.exists():
            if _md5_sum_file(targ_fname) == annot["checksum"]:
                targ_fname_list.append(targ_fname)
                if verbose:
                    print(f"Found {targ_fname.name} at {targ_fname}")
                continue
            else:
                if verbose:
                    print(f"Checksum mismatch for {targ_fname.name}, redownloading...")

        dl_fname = _fetch_file(
            _osfify_url(annot["url"]["osf"]),
            targ_fname.parent,
            resume=True,
            md5sum=annot["checksum"],
            verbose=1,
        )
        shutil.move(dl_fname, targ_fname)
        targ_fname_list.append(targ_fname)

        if verbose:
            print(f"Downloaded {targ_fname.name} to {targ_fname}")

    return targ_fname_list


def _annot_full_to_tuple(full_list):
    return [
        tuple([annot[key] for key in ["source", "desc", "space", "res"]])
        for annot in full_list
    ]


def _match_annots_by_tuple(annot_tuple_list):
    # match all then sort
    if not isinstance(annot_tuple_list, list):
        annot_tuple_list = [annot_tuple_list]
    matched = []
    for annot_tuple in annot_tuple_list:
        found = False
        for annot in MOUSEMAPS_ANNOTS:
            curr_annot_tuple = tuple(
                [annot[key] for key in ["source", "desc", "space", "res"]]
            )
            if curr_annot_tuple == annot_tuple:
                matched.append(annot)
                found = True
                break
        if not found:
            raise ValueError(f"Annotation {annot_tuple} not found in MOUSEMAPS_ANNOTS")
    return matched


def _filter_annots_by_keys(keys_dict):
    filtered = []
    for annot in MOUSEMAPS_ANNOTS:
        for key in ["source", "desc", "space", "res", "format"]:
            value = keys_dict[key]
            if value is not None and annot[key] != value:
                break
        if keys_dict["tag"] is not None and keys_dict["tag"] not in annot["tags"]:
            break
        filtered.append(annot)
    return filtered


def _check_json(osfstorage_data):
    """
    Check for errors in meta.json.

    For internal use only.

    Returns
    -------
    None
    """
    # reload the datasets and meta json files
    from rich.console import Console

    console = Console()

    MOUSEMAPS_ATLASES = _load_resource_json("datasets/data/atlases.json")["atlases"]
    MOUSEMAPS_ANNOTS = _load_resource_json("datasets/data/annotations.json")[
        "annotations"
    ]
    MOUSEMAPS_ANNOTS_META = _load_resource_json("datasets/data/annotations-meta.json")[
        "annotations-meta"
    ]

    console.print("ATLASES")
    for atlas_k, atlas_v in MOUSEMAPS_ATLASES.items():
        console.print(f"{atlas_k} >")
        for file_k, file_v in atlas_v["files"].items():
            console.print(f"  {file_k} >")
            if file_v["checksum"] == osfstorage_data[file_v["fname"]]["md5"]:
                console.print("    [bold green]✓[/bold green] checksum")
            else:
                console.print(
                    f"    [bold red]x[/bold red] checksum local: {file_v['checksum']} remote: {osfstorage_data[file_v['fname']]['md5']}"
                )

            if file_v["url"]["osf"] == osfstorage_data[file_v["fname"]]["guid"]:
                console.print("    [bold green]✓[/bold green] url")
            else:
                console.print(
                    f"    [bold red]x[/bold red] url local: {file_v['url']['osf']} remote: {osfstorage_data[file_v['fname']]['guid']}"
                )

    console.print("\nANNOTS_META")
    for annot_meta in MOUSEMAPS_ANNOTS_META:
        console.print(f"{annot_meta['source']} {annot_meta['name']} >")
        console.print("  \[annot files] >")
        for file_v in annot_meta["files"]:
            console.print(f"    {'-'.join(file_v)} >")
            try:
                matched = _match_annots_by_tuple([tuple(file_v)])
            except ValueError:
                console.print("      [bold red]x[/bold red] json")

            if len(matched) == 1:
                console.print("      [bold green]✓[/bold green] json")
            else:
                console.print("      [bold red]x[/bold red] json")

        console.print("  \[aux files] >")
        for aux_k, aux_v in annot_meta["aux_files"].items():
            console.print(f"    {aux_k} >")

            if not isinstance(aux_v, list):
                aux_v = [aux_v]
            for file_v in aux_v:
                console.print(f"      {file_v['fname']} >")
                if file_v["fname"] not in osfstorage_data:
                    console.print(
                        f"        [bold red]x[/bold red] {file_v['fname']} not found in osfstorage"
                    )
                    continue
                if file_v["checksum"] == osfstorage_data[file_v["fname"]]["md5"]:
                    console.print("        [bold green]✓[/bold green] checksum")
                else:
                    console.print(
                        f"        [bold red]x[/bold red] checksum local: {file_v['checksum']} remote: {osfstorage_data[file_v['fname']]['md5']}"
                    )
                if file_v["url"]["osf"] == osfstorage_data[file_v["fname"]]["guid"]:
                    console.print("        [bold green]✓[/bold green] url")
                else:
                    console.print(
                        f"        [bold red]x[/bold red] url local: {file_v['url']['osf']} remote: {osfstorage_data[file_v['fname']]['guid']}"
                    )

    console.print("\nANNOTS")
    for annot in MOUSEMAPS_ANNOTS:
        annotstr = "-".join(
            [annot["source"], annot["desc"], annot["space"], annot["res"]]
        )
        console.print(f"  {annotstr} >")
        if annot["fname"] not in osfstorage_data:
            console.print(f"        [bold red]x[/bold red] {annot['fname']} not found in osfstorage")
            continue
        if annot["checksum"] == osfstorage_data[annot["fname"]]["md5"]:
            console.print("    [bold green]✓[/bold green] checksum")
        else:
            console.print(
                f"    [bold red]x[/bold red] checksum local: {annot['checksum']} remote: {osfstorage_data[annot['fname']]['md5']}"
            )
        if annot["url"]["osf"] == osfstorage_data[annot["fname"]]["guid"]:
            console.print("    [bold green]✓[/bold green] url")
        else:
            console.print(
                f"    [bold red]x[/bold red] url local: {annot['url']['osf']} remote: {osfstorage_data[annot['fname']]['guid']}"
            )


def _check_osfstorage():
    """
    Check for errors in OSF links.

    For internal use only.

    Returns
    -------
    None
    """
    # reload the datasets and meta json files
    import requests
    from rich.console import Console

    console = Console()

    osfstorage_data = {}

    OSF_NODEID = "uryk3"
    OSF_URL = f"https://api.osf.io/v2/nodes/{OSF_NODEID}/files/osfstorage/"

    def _get_file_href(d):
        return d["relationships"]["files"]["links"]["related"]["href"]

    def _get_full_data(url):
        # handles pagination
        resp = requests.get(url).json()
        ret = resp["data"]
        while resp["links"].get("next"):
            href = resp["links"]["next"]
            resp = requests.get(href).json()
            ret.extend(resp["data"])
        return ret

    for kind in requests.get(OSF_URL).json()["data"]:
        kind_path = kind["attributes"]["materialized_path"]
        console.print(f"{kind_path} >")
        if kind_path == "/atlases/":
            for source in _get_full_data(_get_file_href(kind)):
                source_path = source["attributes"]["materialized_path"]
                console.print(f"  {source_path.removeprefix(kind_path)} >")
                for version in _get_full_data(_get_file_href(source)):
                    version_path = version["attributes"]["materialized_path"]
                    console.print(f"  {version_path.removeprefix(source_path)} >")
                    for file in _get_full_data(_get_file_href(version)):
                        file_path = file["attributes"]["materialized_path"]
                        console.print(f"    {file_path.removeprefix(version_path)} >")
                        console.print(f"      {file['attributes']['guid']}")
                        console.print(
                            f"      {file['attributes']['extra']['hashes']['md5']}"
                        )
                        if not file["attributes"]["guid"]:
                            requests.get(f'https://osf.io/{OSF_NODEID}/files/osfstorage{file["attributes"]["path"]}')
                        osfstorage_data[file["attributes"]["name"]] = {
                            "guid": file["attributes"]["guid"],
                            "md5": file["attributes"]["extra"]["hashes"]["md5"],
                        }
        elif kind_path == "/annotations/":
            for source in _get_full_data(_get_file_href(kind)):
                source_path = source["attributes"]["materialized_path"]
                console.print(f"  {source_path.removeprefix(kind_path)} >")
                for file in _get_full_data(_get_file_href(source)):
                    file_path = file["attributes"]["materialized_path"]
                    console.print(f"    {file_path.removeprefix(source_path)} >")
                    console.print(f"      {file['attributes']['guid']}")
                    console.print(
                        f"      {file['attributes']['extra']['hashes']['md5']}"
                    )
                    if not file["attributes"]["guid"]:
                        requests.get(f'https://osf.io/{OSF_NODEID}/files/osfstorage{file["attributes"]["path"]}')
                    osfstorage_data[file["attributes"]["name"]] = {
                        "guid": file["attributes"]["guid"],
                        "md5": file["attributes"]["extra"]["hashes"]["md5"],
                    }
        else:
            raise ValueError(f"Unknown kind_path={kind_path}")
    return osfstorage_data


# def _fill_meta_json_refs(bib_file, json_file, overwrite=False, use_defaults=False):
#     """
#     Fill in citation information for references in a JSON file.

#     For internal use only.

#     Parameters
#     ----------
#     bib_file : str
#         Path to BibTeX file containing references
#     json_file : str
#         Path to JSON file containing references
#     overwrite : bool, optional
#         Whether to overwrite existing citation information. Default: False
#     use_defaults : bool, optional
#         Whether to use default paths for `bib_file` and `json_file`. Default: False

#     Returns
#     -------
#     None
#     """
#     if use_defaults:
#         bib_file = \
#             importlib.resources.files("neuromaps") / "datasets/data/neuromaps.bib"
#         json_file = \
#             importlib.resources.files("neuromaps") / "datasets/data/meta.json"

#     from pybtex import PybtexEngine
#     engine = PybtexEngine()

#     def _get_citation(key):
#         s = engine.format_from_file(
#             filename=bib_file, style="unsrt",
#             citations=[key], output_backend="plaintext"
#             )
#         return s.strip("\n").replace("[1] ", "")

#     with open(json_file) as src:
#         nm_meta = json.load(src)

#     for entry in nm_meta["annotations"]:
#         for bib_category in ["primary", "secondary"]:
#             for bib_item in entry["refs"][bib_category]:
#                 if bib_item["bibkey"] not in ["", None]:
#                     if bib_item["citation"] == "" or overwrite:
#                         bib_item["citation"] = _get_citation(bib_item["bibkey"])

#     with open(json_file, "w") as dst:
#         json.dump(nm_meta, dst, indent=4)


def _gen_doc_listofmaps_rst(listofmaps_file):
    """
    Generate a list of maps in reStructuredText format.

    For internal use only.

    Parameters
    ----------
    listofmaps_file : str
        Path to write the list of maps

    Returns
    -------
    None
    """
    output = []

    output += [
        ".. _listofmaps:",
        "",
        "------------",
        "List of Maps",
        "------------",
        "This is a complete list of maps available in the `neuromaps_mouse` package. ",
        "\n----\n",
    ]

    for annot_meta in MOUSEMAPS_ANNOTS_META:
        title = f"{annot_meta['name']} ({annot_meta['source']})"
        output += [
            title,
            "=" * len(title),
            "",
            "**Full description**",
            "",
            f"{annot_meta['description']}",
            "",
        ]

        for file in annot_meta["files"]:
            curr_annot = _match_annots_by_tuple(tuple(file))[0]
            file_title = "-".join(file)
            key_str = ", ".join(
                [f"{k}='{curr_annot[k]}'" for k in ["source", "desc", "space", "res"]]
            )
            output += [
                file_title,
                "-" * len(file_title),
                "",
                f"**Description**: {annot_meta['file_desc'][curr_annot['desc']]}",
                "",
                f"**Format**: {curr_annot['format']}",
                "",
                "**How to use**",
                "",
                ".. code:: python",
                "",
                "    # get annotation",
                f"    fetch_annotation({key_str})",
                "",
                "    # file location",
                f"    # $MOUSEMAPS_DATA/{curr_annot['rel_path']}",
                "",
                "    # file name",
                f"    # {curr_annot['fname']}",
                "",
                "    # region mapping file",
                f"    # {curr_annot['regionmapping']}",
                "",
            ]

        output.append("**References**")
        for bib_item in annot_meta["refs"]:
            if bib_item["bibkey"] not in ["", None]:
                output += [f"    - {bib_item['citation']}"]

        output.append("\n----\n")

    output = output[:-1]

    with open(listofmaps_file, "w") as dst:
        dst.write("\n".join(output))