# Authors: The MNE-Python contributors.
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.

import inspect
import os
import os.path as op

import numpy as np

from ...utils import _check_pandas_installed, _on_missing, _TempDir, verbose
from ..utils import _downloader_params, _get_path

AGE_SLEEP_RECORDS = op.join(op.dirname(__file__), "age_records.csv")
TEMAZEPAM_SLEEP_RECORDS = op.join(op.dirname(__file__), "temazepam_records.csv")
TEMAZEPAM_RECORDS_URL = (
"https://physionet.org/physiobank/database/sleep-edfx/ST-subjects.xls" # noqa: E501
)
TEMAZEPAM_RECORDS_URL_SHA1 = "f52fffe5c18826a2bd4c5d5cb375bb4a9008c885"
AGE_RECORDS_URL = "https://physionet.org/physiobank/database/sleep-edfx/SC-subjects.xls"
AGE_RECORDS_URL_SHA1 = "0ba6650892c5d33a8e2b3f62ce1cc9f30438c54f"
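# Checksum manifest shipped alongside this module; each line pairs a SHA-1
# digest with the name of an EDF recording on PhysioNet.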
sha1sums_fname = op.join(op.dirname(__file__), "SHA1SUMS")


def _fetch_one(fname, hashsum, path, force_update, base_url):
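    """Fetch a single file from ``base_url`` into ``path``.

    Returns the local destination and a bool indicating whether a download
    actually happened.
    """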
import pooch
# Fetch the file
url = base_url + "/" + fname
destination = op.join(path, fname)
if op.isfile(destination) and not force_update:
return destination, False
if op.isfile(destination):
os.remove(destination)
if not op.isdir(op.dirname(destination)):
os.makedirs(op.dirname(destination))
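    # Download with pooch, verifying the file against its expected SHA-1 digest.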
downloader = pooch.HTTPDownloader(**_downloader_params())
pooch.retrieve(
url=url,
known_hash=f"sha1:{hashsum}",
path=path,
downloader=downloader,
fname=fname,
)
    return destination, True


@verbose
def _data_path(path=None, verbose=None):
"""Get path to local copy of EEG Physionet age Polysomnography dataset URL.
This is a low-level function useful for getting a local copy of a
remote Polysomnography dataset :footcite:`KempEtAl2000` which is available
at PhysioNet :footcite:`GoldbergerEtAl2000`.
Parameters
----------
path : None | str
Location of where to look for the data storing location.
If None, the environment variable or config parameter
``PHYSIONET_SLEEP_PATH`` is used. If it doesn't exist, the "~/mne_data"
directory is used. If the dataset is not found under the given path,
the data will be automatically downloaded to the specified folder.
%(verbose)s
Returns
-------
path : list of Path
Local path to the given data file. This path is contained inside a list
of length one, for compatibility.
References
----------
.. footbibliography::
""" # noqa: E501
key = "PHYSIONET_SLEEP_PATH"
name = "PHYSIONET_SLEEP"
path = _get_path(path, key, name)
    return op.join(path, "physionet-sleep-data")


def _update_sleep_temazepam_records(fname=TEMAZEPAM_SLEEP_RECORDS):
"""Help function to download Physionet's temazepam dataset records."""
import pooch
pd = _check_pandas_installed()
tmp = _TempDir()
# Download subjects info.
subjects_fname = op.join(tmp, "ST-subjects.xls")
downloader = pooch.HTTPDownloader(**_downloader_params())
pooch.retrieve(
url=TEMAZEPAM_RECORDS_URL,
known_hash=f"sha1:{TEMAZEPAM_RECORDS_URL_SHA1}",
path=tmp,
downloader=downloader,
fname=op.basename(subjects_fname),
)
    # Load and massage the checksums (sha1sum output: "<sha>  <fname>", two spaces).
    sha1_df = pd.read_csv(
        sha1sums_fname, sep="  ", header=None, names=["sha", "fname"], engine="python"
    )
select_age_records = sha1_df.fname.str.startswith(
"ST"
) & sha1_df.fname.str.endswith("edf")
sha1_df = sha1_df[select_age_records]
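    # The first six characters of the file name (e.g. "ST7011") encode the
    # subject and night; they are used as the merge key below.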
sha1_df["id"] = [name[:6] for name in sha1_df.fname]
# Load and massage the data.
data = pd.read_excel(subjects_fname, header=[0, 1])
data = data.set_index(("Subject - age - sex", "Nr"))
data.index.name = "subject"
data.columns.names = [None, None]
kwargs = dict()
# TODO VERSION can be removed once we require Pandas 2.1
if "future_stack" in inspect.getfullargspec(pd.DataFrame.stack).args:
kwargs["future_stack"] = True
data = (
data.set_index(
[("Subject - age - sex", "Age"), ("Subject - age - sex", "M1/F2")],
append=True,
)
.stack(level=0, **kwargs)
.reset_index()
)
data = data.rename(
columns={
("Subject - age - sex", "Age"): "age",
("Subject - age - sex", "M1/F2"): "sex",
"level_3": "drug",
}
)
data["id"] = [f"ST7{s:02d}{n:1d}" for s, n in zip(data.subject, data["night nr"])]
data = pd.merge(sha1_df, data, how="outer", on="id")
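    # The record type is encoded after the dash in the EDF file name
    # (e.g. "...-PSG.edf" vs. "...-Hypnogram.edf").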
data["record type"] = (
data.fname.str.split("-", expand=True)[1]
.str.split(".", expand=True)[0]
.astype("category")
)
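    # Pivot to wide format so each row lists the sha/fname pair of every
    # record type for a given night.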
data = data.set_index(
["id", "subject", "age", "sex", "drug", "lights off", "night nr", "record type"]
).unstack()
data.columns = [l1 + "_" + l2 for l1, l2 in data.columns]
data = data.reset_index().drop(columns=["id"])
data["sex"] = data.sex.astype("category").cat.rename_categories(
{1: "male", 2: "female"}
)
data["drug"] = data["drug"].str.split(expand=True)[0]
data["subject_orig"] = data["subject"]
data["subject"] = data.index // 2 # to make sure index is from 0 to 21
# Save the data.
    data.to_csv(fname, index=False)


def _update_sleep_age_records(fname=AGE_SLEEP_RECORDS):
"""Help function to download Physionet's age dataset records."""
import pooch
pd = _check_pandas_installed()
tmp = _TempDir()
# Download subjects info.
subjects_fname = op.join(tmp, "SC-subjects.xls")
downloader = pooch.HTTPDownloader(**_downloader_params())
pooch.retrieve(
url=AGE_RECORDS_URL,
known_hash=f"sha1:{AGE_RECORDS_URL_SHA1}",
path=tmp,
downloader=downloader,
fname=op.basename(subjects_fname),
)
    # Load and massage the checksums (sha1sum output: "<sha>  <fname>", two spaces).
    sha1_df = pd.read_csv(
        sha1sums_fname, sep="  ", header=None, names=["sha", "fname"], engine="python"
    )
select_age_records = sha1_df.fname.str.startswith(
"SC"
) & sha1_df.fname.str.endswith("edf")
sha1_df = sha1_df[select_age_records]
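    # The first six characters of the file name (e.g. "SC4001") encode the
    # subject and night; they are used as the join key below.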
sha1_df["id"] = [name[:6] for name in sha1_df.fname]
# Load and massage the data.
data = pd.read_excel(subjects_fname)
data = data.rename(
index=str, columns={"sex (F=1)": "sex", "LightsOff": "lights off"}
)
data["sex"] = data.sex.astype("category").cat.rename_categories(
{1: "female", 2: "male"}
)
data["id"] = [f"SC4{s:02d}{n:1d}" for s, n in zip(data.subject, data.night)]
data = data.set_index("id").join(sha1_df.set_index("id")).dropna()
data["record type"] = (
data.fname.str.split("-", expand=True)[1]
.str.split(".", expand=True)[0]
.astype("category")
)
data = data.reset_index().drop(columns=["id"])
data = data[
["subject", "night", "record type", "age", "sex", "lights off", "sha", "fname"]
]
# Save the data.
    data.to_csv(fname, index=False)


def _check_subjects(subjects, n_subjects, missing=None, on_missing="raise"):
"""Check whether subjects are available.
Parameters
----------
subjects : list
Subject numbers to be checked.
n_subjects : int
Number of subjects available.
missing : list | None
Subject numbers that are missing.
on_missing : 'raise' | 'warn' | 'ignore'
What to do if one or several subjects are not available. Valid keys
are 'raise' | 'warn' | 'ignore'. Default is 'error'. If on_missing
is 'warn' it will proceed but warn, if 'ignore' it will proceed
silently.
"""
valid_subjects = np.arange(n_subjects)
if missing is not None:
valid_subjects = np.setdiff1d(valid_subjects, missing)
unknown_subjects = np.setdiff1d(subjects, valid_subjects)
if unknown_subjects.size > 0:
subjects_list = ", ".join([str(s) for s in unknown_subjects])
msg = (
f"This dataset contains subjects 0 to {n_subjects - 1} with "
f"missing subjects {missing}. Unknown subjects: "
f"{subjects_list}."
)
_on_missing(on_missing, msg)