# Source code for neuromaps_mouse.datasets.utils

"""Functions for working with datasets."""

import os
import json
import shutil
import importlib.resources
from pathlib import Path

try:
    # nilearn 0.10.3
    from nilearn.datasets._utils import fetch_single_file as _fetch_file, _md5_sum_file
except ImportError:
    from nilearn.datasets.utils import _fetch_file, _md5_sum_file


def get_data_dir(data_dir=None):
    """Get the data directory, creating it if necessary.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Desired data directory. If None, uses the ``MOUSEMAPS_DATA``
        environment variable, falling back to
        ``~/neuromaps-mouse-data``. Default: None

    Returns
    -------
    pathlib.Path
        Resolved (and existing) data directory.
    """
    # NOTE: the stray "[docs]" Sphinx marker in the scraped source was
    # removed; it is not part of the code.
    if data_dir is None:
        data_dir = os.environ.get(
            "MOUSEMAPS_DATA", str(Path.home() / "neuromaps-mouse-data")
        )
    data_dir = Path(data_dir).expanduser()
    # create the full directory tree; a no-op if it already exists
    data_dir.mkdir(parents=True, exist_ok=True)
    return data_dir
def _load_resource_json(relative_path):
    """
    Load JSON file from package resources.

    Parameters
    ----------
    relative_path : str
        Path to JSON file relative to package resources

    Returns
    -------
    resource_json : dict
        JSON file loaded as a dictionary
    """
    # handling pkg_resources.resource_filename deprecation
    if getattr(importlib.resources, "files", None) is not None:
        f_resource = importlib.resources.files("neuromaps_mouse") / relative_path
    else:
        from pkg_resources import resource_filename

        f_resource = resource_filename("neuromaps_mouse", relative_path)

    with open(f_resource) as src:
        resource_json = json.load(src)
    return resource_json


# Registry dictionaries loaded once at import time from the packaged JSON files.
MOUSEMAPS_ATLASES = _load_resource_json("datasets/data/atlases.json")["atlases"]
MOUSEMAPS_ANNOTS = _load_resource_json("datasets/data/annotations.json")["annotations"]
MOUSEMAPS_ANNOTS_META = _load_resource_json("datasets/data/annotations-meta.json")[
    "annotations-meta"
]


def _osfify_url(osf_file_id):
    """Return the direct OSF download URL for the given OSF file id."""
    return f"https://osf.io/download/{osf_file_id}/"


def fetch_files(annotations, file_type="annotations", data_dir=None, verbose=1):
    """Fetch files from OSF.

    Parameters
    ----------
    annotations : list of dict
        Entries that each provide 'rel_path', 'fname', 'checksum' and
        'url' -> 'osf' keys.
    file_type : {'annotations', 'annotations-meta', 'atlases'}, optional
        Selects the subdirectory under `data_dir`. Default: 'annotations'
    data_dir : str or os.PathLike, optional
        Base data directory. If None, resolved via :func:`get_data_dir`.
        Default: None
    verbose : int, optional
        Print progress messages if nonzero. Default: 1

    Returns
    -------
    list of pathlib.Path
        Local paths of the (possibly freshly downloaded) files.

    Raises
    ------
    ValueError
        If `file_type` is not one of the recognized values.
    """
    # fix: fall back to the default data directory instead of
    # failing with ``Path(None)`` when data_dir is not given
    data_dir = get_data_dir(data_dir)
    targ_fname_list = []
    for annot in annotations:
        if file_type in ["annotations", "annotations-meta"]:
            targ_path = Path(data_dir) / "annotations"
        elif file_type == "atlases":
            targ_path = Path(data_dir) / "atlases"
        else:
            raise ValueError(f"Unknown file_type={file_type}")
        targ_fname = targ_path / annot["rel_path"] / annot["fname"]

        if targ_fname.exists():
            if _md5_sum_file(targ_fname) == annot["checksum"]:
                # cached copy is valid -- skip the download
                targ_fname_list.append(targ_fname)
                if verbose:
                    print(f"Found {targ_fname.name} at {targ_fname}")
                continue
            else:
                if verbose:
                    print(f"Checksum mismatch for {targ_fname.name}, redownloading...")

        dl_fname = _fetch_file(
            _osfify_url(annot["url"]["osf"]),
            targ_fname.parent,
            resume=True,
            md5sum=annot["checksum"],
            verbose=verbose,  # fix: was hard-coded to 1; honor caller's setting
        )
        shutil.move(dl_fname, targ_fname)
        targ_fname_list.append(targ_fname)
        if verbose:
            print(f"Downloaded {targ_fname.name} to {targ_fname}")
    return targ_fname_list
_annot_full_to_tuple(full_list): return [ tuple([annot[key] for key in ["source", "desc", "space", "res"]]) for annot in full_list ] def _match_annots_by_tuple(annot_tuple_list): # match all then sort if not isinstance(annot_tuple_list, list): annot_tuple_list = [annot_tuple_list] matched = [] for annot_tuple in annot_tuple_list: found = False for annot in MOUSEMAPS_ANNOTS: curr_annot_tuple = tuple( [annot[key] for key in ["source", "desc", "space", "res"]] ) if curr_annot_tuple == annot_tuple: matched.append(annot) found = True break if not found: raise ValueError(f"Annotation {annot_tuple} not found in MOUSEMAPS_ANNOTS") return matched def _filter_annots_by_keys(keys_dict): filtered = [] for annot in MOUSEMAPS_ANNOTS: for key in ["source", "desc", "space", "res", "format"]: value = keys_dict[key] if value is not None and annot[key] != value: break if keys_dict["tag"] is not None and keys_dict["tag"] not in annot["tags"]: break filtered.append(annot) return filtered def _check_osfstorage(): """ Check for errors in OSF links. For internal use only. 
osfstorage_data = _check_osfstorage() Returns ------- None """ # reload the datasets and meta json files import requests from rich.console import Console console = Console() osfstorage_data = {} OSF_NODEID = "uryk3" OSF_URL = f"https://api.osf.io/v2/nodes/{OSF_NODEID}/files/osfstorage/" def _get_file_href(entry): return entry["relationships"]["files"]["links"]["related"]["href"] def _get_paginated(url): resp = requests.get(url).json() results = resp["data"] while resp["links"].get("next"): resp = requests.get(resp["links"]["next"]).json() results.extend(resp["data"]) return results def _process_file(file_entry): attrs = file_entry["attributes"] console.print(f" {attrs['guid']}") console.print(f" {attrs['extra']['hashes']['md5']}") if not attrs["guid"]: r = requests.get( file_entry["links"]["self"] + "?create_guid=true", allow_redirects=True, ) console.print(f" {r.url}") osfstorage_data[attrs["name"]] = { "guid": attrs["guid"], "md5": attrs["extra"]["hashes"]["md5"], } for kind in requests.get(OSF_URL).json()["data"]: kind_path = kind["attributes"]["materialized_path"] console.print(f"{kind_path} >") if kind_path == "/atlases/": for source in _get_paginated(_get_file_href(kind)): source_path = source["attributes"]["materialized_path"] console.print(f" {source_path.removeprefix(kind_path)} >") for version in _get_paginated(_get_file_href(source)): version_path = version["attributes"]["materialized_path"] console.print(f" {version_path.removeprefix(source_path)} >") for file_entry in _get_paginated(_get_file_href(version)): fpath = file_entry["attributes"]["materialized_path"] console.print(f" {fpath.removeprefix(version_path)} >") _process_file(file_entry) elif kind_path == "/annotations/": for source in _get_paginated(_get_file_href(kind)): source_path = source["attributes"]["materialized_path"] console.print(f" {source_path.removeprefix(kind_path)} >") for file_entry in _get_paginated(_get_file_href(source)): fpath = file_entry["attributes"]["materialized_path"] 
console.print(f" {fpath.removeprefix(source_path)} >") _process_file(file_entry) else: raise ValueError(f"Unknown kind_path={kind_path}") return osfstorage_data def _write_resource_json(relative_path, data): """Write data dict to a package resource JSON file.""" if getattr(importlib.resources, "files", None) is not None: filepath = importlib.resources.files("neuromaps_mouse") / relative_path else: from pkg_resources import resource_filename filepath = resource_filename("neuromaps_mouse", relative_path) with open(filepath, "w") as f: json.dump(data, f, indent=2) def _validate_file_fields(file_entry, osfstorage_data, overwrite, console, indent=" "): """Validate checksum and URL of a file entry against OSF storage data. Returns True if any field was updated, None if file not in osfstorage_data. """ fname = file_entry["fname"] if fname not in osfstorage_data: console.print(f"{indent}[bold red]x[/bold red] {fname} not found in osfstorage") return None updated = False remote = osfstorage_data[fname] if file_entry["checksum"] == remote["md5"]: console.print(f"{indent}[bold green]✓[/bold green] checksum") elif overwrite: file_entry["checksum"] = remote["md5"] updated = True console.print(f"{indent}[bold yellow]↻[/bold yellow] checksum updated") else: console.print( f"{indent}[bold red]x[/bold red] " f"checksum local: {file_entry['checksum']} remote: {remote['md5']}" ) if file_entry["url"]["osf"] == remote["guid"]: console.print(f"{indent}[bold green]✓[/bold green] url") elif overwrite: file_entry["url"]["osf"] = remote["guid"] updated = True console.print(f"{indent}[bold yellow]↻[/bold yellow] url updated") else: console.print( f"{indent}[bold red]x[/bold red] " f"url local: {file_entry['url']['osf']} remote: {remote['guid']}" ) return updated def _check_json(osfstorage_data, overwrite=False): """ Check for errors in meta.json. For internal use only. 
def _check_json(osfstorage_data, overwrite=False):
    """
    Check for errors in meta.json.

    For internal use only.

    _check_json(osfstorage_data)

    Parameters
    ----------
    osfstorage_data : dict
        Mapping of file name -> {'guid', 'md5'} from ``_check_osfstorage``.
    overwrite : bool, optional
        Write corrected values back into the packaged JSON files when a
        local field is stale. Default: False

    Returns
    -------
    None
    """
    from rich.console import Console

    console = Console()

    # reload fresh copies so mutations here do not alias the module-level dicts
    MOUSEMAPS_ATLASES = _load_resource_json("datasets/data/atlases.json")["atlases"]
    MOUSEMAPS_ANNOTS = _load_resource_json("datasets/data/annotations.json")[
        "annotations"
    ]
    MOUSEMAPS_ANNOTS_META = _load_resource_json("datasets/data/annotations-meta.json")[
        "annotations-meta"
    ]

    atlases_updated = False
    annots_updated = False
    annots_meta_updated = False

    console.print("ATLASES")
    for atlas_k, atlas_v in MOUSEMAPS_ATLASES.items():
        console.print(f"{atlas_k} >")
        for file_k, file_v in atlas_v["files"].items():
            console.print(f" {file_k} >")
            if _validate_file_fields(file_v, osfstorage_data, overwrite, console, " "):
                atlases_updated = True

    console.print("\nANNOTS_META")
    for annot_meta in MOUSEMAPS_ANNOTS_META:
        console.print(f"{annot_meta['source']} {annot_meta['name']} >")
        console.print(r" \[annot files] >")
        for file_v in annot_meta["files"]:
            console.print(f" {'-'.join(file_v)} >")
            try:
                matched = _match_annots_by_tuple([tuple(file_v)])
            except ValueError:
                console.print(" [bold red]x[/bold red] json not found")
            else:
                if len(matched) == 1:
                    console.print(" [bold green]✓[/bold green] json")
                else:
                    console.print(f" [bold red]x[/bold red] json {len(matched) = }")
        console.print(r" \[aux files] >")
        for aux_k, aux_v in annot_meta["aux_files"].items():
            console.print(f" {aux_k} >")
            if not isinstance(aux_v, list):
                aux_v = [aux_v]
            for file_v in aux_v:
                console.print(f" {file_v['fname']} >")
                if _validate_file_fields(
                    file_v, osfstorage_data, overwrite, console, " "
                ):
                    annots_meta_updated = True

    console.print("\nANNOTS")
    for annot in MOUSEMAPS_ANNOTS:
        annotstr = "-".join(
            [annot["source"], annot["desc"], annot["space"], annot["res"]]
        )
        console.print(f" {annotstr} >")
        if _validate_file_fields(annot, osfstorage_data, overwrite, console, " "):
            annots_updated = True

    # persist any corrections made above
    if atlases_updated:
        _write_resource_json(
            "datasets/data/atlases.json", {"atlases": MOUSEMAPS_ATLASES}
        )
        console.print("\n[bold green]✓[/bold green] Updated atlases.json")
    if annots_updated:
        _write_resource_json(
            "datasets/data/annotations.json", {"annotations": MOUSEMAPS_ANNOTS}
        )
        console.print("[bold green]✓[/bold green] Updated annotations.json")
    if annots_meta_updated:
        _write_resource_json(
            "datasets/data/annotations-meta.json",
            {"annotations-meta": MOUSEMAPS_ANNOTS_META},
        )
        console.print("[bold green]✓[/bold green] Updated annotations-meta.json")

    # Check for unreferenced files
    console.print("\nFILES IN OSF STORAGE NOT REFERENCED IN JSON")
    json_files = set()
    for atlas_v in MOUSEMAPS_ATLASES.values():
        for file_v in atlas_v["files"].values():
            json_files.add(file_v["fname"])
    for annot in MOUSEMAPS_ANNOTS:
        json_files.add(annot["fname"])
    for annot_meta in MOUSEMAPS_ANNOTS_META:
        for aux_v in annot_meta["aux_files"].values():
            if not isinstance(aux_v, list):
                aux_v = [aux_v]
            for file_v in aux_v:
                json_files.add(file_v["fname"])

    unreferenced_files = set(osfstorage_data.keys()) - json_files
    if unreferenced_files:
        for fname in sorted(unreferenced_files):
            console.print(f" [bold yellow]?[/bold yellow] {fname}")
        console.print(
            f"\n[bold yellow]Found {len(unreferenced_files)} "
            f"unreferenced files in OSF storage[/bold yellow]"
        )
    else:
        console.print(
            " [bold green]✓[/bold green] All OSF storage files are referenced in JSON"
        )


def _gen_doc_listofmaps_rst(listofmaps_file):
    """
    Generate a list of maps in reStructuredText format.

    For internal use only.

    Parameters
    ----------
    listofmaps_file : str or os.PathLike
        Destination path for the generated .rst document.
    """
    # NOTE(review): leading whitespace inside the rst literals below was
    # likely collapsed by the HTML extraction of this file -- verify the
    # indentation against the upstream repository before regenerating docs.
    MOUSEMAPS_ANNOTS_META = _load_resource_json("datasets/data/annotations-meta.json")[
        "annotations-meta"
    ]
    sections = []
    for annot_meta in MOUSEMAPS_ANNOTS_META:
        lines = []
        title = f"{annot_meta['name']} ({annot_meta['source']})"
        lines += [
            title,
            "=" * len(title),
            "",
            annot_meta["description"],
            "",
        ]
        if annot_meta.get("warning"):
            lines += [
                f".. warning:: {annot_meta['warning']}",
                "",
            ]
        # File list as a table: description, format, fetch key
        lines += [
            "**Available files**",
            "",
            ".. list-table::",
            " :header-rows: 1",
            "",
            " * - Description",
            " - Format",
            " - Key",
        ]
        for file_tuple in annot_meta["files"]:
            curr_annot = _match_annots_by_tuple(tuple(file_tuple))[0]
            desc = annot_meta["file_desc"][curr_annot["desc"]]
            fmt = curr_annot["format"]
            key_str = ", ".join(
                f"'{curr_annot[k]}'" for k in ("source", "desc", "space", "res")
            )
            lines += [
                f" * - {desc}",
                f" - {fmt}",
                f" - ``({key_str})``",
            ]
        lines.append("")

        # Unified how-to-use section
        first_annot = _match_annots_by_tuple(tuple(annot_meta["files"][0]))[0]
        first_key = ", ".join(
            f"'{first_annot[k]}'" for k in ("source", "desc", "space", "res")
        )
        lines += [
            "**How to use**",
            "",
            ".. code:: python",
            "",
            " # fetch a specific annotation",
            f" fetch_annotation(({first_key}))",
            "",
            " # file location",
            f" # $MOUSEMAPS_DATA/{first_annot['rel_path']}",
            "",
        ]

        # List all region mapping files from aux_files
        aux = annot_meta.get("aux_files", {})
        regionmaps = aux.get("regionmapping", [])
        if not isinstance(regionmaps, list):
            regionmaps = [regionmaps]
        if regionmaps:
            lines.append(" # region mapping files")
            for rm in regionmaps:
                lines.append(f" # {rm['fname']}")
            lines.append("")

        # List all feature mapping files from aux_files
        featmaps = aux.get("featuremapping", [])
        if not isinstance(featmaps, list):
            featmaps = [featmaps]
        if featmaps:
            lines.append(" # feature mapping files")
            for fm in featmaps:
                lines.append(f" # {fm['fname']}")
            lines.append("")

        # only references carrying a non-empty bibkey are listed
        valid_refs = [
            ref for ref in annot_meta["refs"] if ref.get("bibkey") not in ("", None)
        ]
        if valid_refs:
            lines += [
                "**References**",
                "",
            ]
            for ref in valid_refs:
                lines.append(f" - {ref['citation']}")
        sections.append("\n".join(lines))

    header = "\n".join([
        ".. _listofmaps:",
        "",
        "------------",
        "List of Maps",
        "------------",
        "This is a complete list of maps available in the `neuromaps_mouse` package.",
    ])
    separator = "\n\n----\n\n"
    with open(listofmaps_file, "w") as dst:
        dst.write(header + separator + separator.join(sections) + "\n")