Skip to content

Data downloading utility

download_data.py

download_data(data_dir, stories, subjects, figures)

Downloads Lebel et al (2023) preprocessed data via Datalad API from the Openneuro source repository (https://github.com/OpenNeuroDatasets/ds003020.git) into `<data_dir>`.

Parameters:

Name Type Description Default

data_dir

Optional[str]

The directory to store the downloaded data to. If not provided, the DATA_DIR entry of config.yaml is used when that file exists; otherwise it defaults to "ds003020".

required

stories

str

Which stories to download. Pass 'all' to download every story; any other value downloads only 3 stories ("souls", "alternateithicatom", "avatar").

required

subjects

Union[str, list[str]]

The subject datasets to download. Can be a single subject (e.g. 'UTS02') or a list of subjects or a string 'all' to download data for all subjects.

required

figures

bool

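Whether to only download the data required to reproduce the figures (the pycortex-db entries for subjects UTS01, UTS02 and UTS03). When true, the stories and subjects arguments are ignored.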

required

Returns:

Type Description
None
Source code in src/encoders/download_data.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
def download_data(
    data_dir: Optional[str],
    stories: str,
    subjects: Union[str, list[str]],
    figures: bool,
):
    """Downloads Lebel et al (2023) preprocessed data via Datalad API
    from the Openneuro source repository (https://github.com/OpenNeuroDatasets/ds003020.git)
    into <data_dir>.

    After the download, ``config.yaml`` in the current working directory is
    created (copied from ``config.example.yaml``) if missing, and its
    ``DATA_DIR`` entry is set to ``data_dir``.

    Parameters
    ----------
    data_dir: str, optional
        The directory to store the downloaded data to.
        If not provided, the ``DATA_DIR`` entry of ``config.yaml`` is used when
        that file exists; otherwise defaults to "ds003020".
    stories: str
        Pass 'all' to download every story; any other value downloads only
        3 stories ("souls", "alternateithicatom", "avatar").
    subjects: Union[str, list[str]]
        The subject datasets to download. Can be a single subject (e.g. 'UTS02') or
        a list of subjects or a string 'all' to download data for all subjects.
    figures: bool
        Whether to only download the data required to reproduce the figures
        (pycortex-db for UTS01, UTS02 and UTS03). When true, `stories` and
        `subjects` are ignored.

    Returns
    -------
    None

    """

    # 0. parameters
    if data_dir is None:
        if Path("config.yaml").exists():
            data_dir = cast(str, load_config()["DATA_DIR"])
        else:
            data_dir = "ds003020"

    # Only a bare string is a single subject; any other iterable (list, tuple,
    # ...) is already a collection and must not be wrapped a second time.
    if isinstance(subjects, str):
        subjects = [subjects]
    else:
        subjects = list(subjects)

    if "all" in subjects:
        subjects = [
            "UTS01",
            "UTS02",
            "UTS03",
            "UTS04",
            "UTS05",
            "UTS06",
            "UTS07",
            "UTS08",
        ]

    # Exact match for strings: a substring test would also fire on any value
    # that merely contains "all". A list containing "all" is still accepted.
    download_all = stories == "all" or (
        not isinstance(stories, str) and "all" in stories
    )

    # 1. Clone
    if Path(data_dir).exists():
        log.info(f"{data_dir} already exists. Skipping cloning.")
    else:
        clone(source=DATASET_URL, path=data_dir)

    # 2. Collect the dataset-relative paths to fetch. Each path is listed
    # exactly once, so subject-independent files are not requested again for
    # every subject and per-subject directories not again for every story.
    targets: list[str] = []

    if figures:
        # Only download data for figures
        log.info("Downloading data from OpenNeuro required to reproduce figures")
        targets.extend(
            f"derivative/pycortex-db/{subject}"
            for subject in ("UTS01", "UTS02", "UTS03")
        )

    elif download_all:
        log.info("Downloading all data, this can take a while.")
        targets.append("derivative/english1000sm.hf5")
        targets.append("derivative/TextGrids")
        targets.append("stimuli")
        for subject in subjects:
            targets.append(f"derivative/pycortex-db/{subject}")
            targets.append(f"derivative/preprocessed_data/{subject}")

    else:
        log.info("Downloading three stories")
        story_names = ["souls", "alternateithicatom", "avatar"]

        targets.append("derivative/english1000sm.hf5")
        for story_name in story_names:
            targets.append(f"derivative/TextGrids/{story_name}.TextGrid")
            targets.append(f"stimuli/{story_name}.wav")
        for subject in subjects:
            targets.append(f"derivative/pycortex-db/{subject}")
            for story_name in story_names:
                targets.append(
                    f"derivative/preprocessed_data/{subject}/{story_name}.hf5"
                )

    # 3. Download
    for relative_path in targets:
        get(dataset=data_dir, path=Path(data_dir, relative_path))

    # After download update the config data_dir
    if not Path("config.yaml").exists():
        shutil.copy("config.example.yaml", "config.yaml")
        log.info("Created new config file")

    config = load_config()
    config["DATA_DIR"] = data_dir

    with open("config.yaml", "w") as f_out:
        yaml.dump(config, f_out)
    log.info(f"Updated config.yaml to DATA_DIR={data_dir}")