# EasyFace/modelscope/msdatasets/data_loader/data_loader.py
# Copyright (c) Alibaba, Inc. and its affiliates.

from abc import ABC, abstractmethod
from typing import Optional, Union

from datasets import (Dataset, DatasetBuilder, DatasetDict, IterableDataset,
                      IterableDatasetDict)
from datasets import load_dataset as hf_data_loader

from modelscope.hub.api import ModelScopeConfig
from modelscope.msdatasets.auth.auth_config import OssAuthConfig
from modelscope.msdatasets.context.dataset_context_config import \
    DatasetContextConfig
from modelscope.msdatasets.data_files.data_files_manager import \
    DataFilesManager
from modelscope.msdatasets.meta.data_meta_manager import DataMetaManager
from modelscope.utils.constant import DatasetFormations


class BaseDataLoader(ABC):
    """Base dataset loader to load data."""

    def __init__(self, dataset_context_config: DatasetContextConfig):
        self.dataset_context_config = dataset_context_config

    @abstractmethod
    def process(self):
        """The entire processing pipeline for fetching the data."""
        raise NotImplementedError(
            f'No default implementation provided for {BaseDataLoader.__name__}.process.'
        )

    @abstractmethod
    def _authorize(self):
        raise NotImplementedError(
            f'No default implementation provided for {BaseDataLoader.__name__}._authorize.'
        )

    @abstractmethod
    def _build(self):
        raise NotImplementedError(
            f'No default implementation provided for {BaseDataLoader.__name__}._build.'
        )

    @abstractmethod
    def _prepare_and_download(self):
        raise NotImplementedError(
            f'No default implementation provided for {BaseDataLoader.__name__}._prepare_and_download.'
        )

    @abstractmethod
    def _post_process(self):
        raise NotImplementedError(
            f'No default implementation provided for {BaseDataLoader.__name__}._post_process.'
        )
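
# Design note: BaseDataLoader is a template-method style contract. `process` is
# the public entry point, and concrete loaders implement the private hooks it
# orchestrates (see OssDataLoader below for the reference implementation).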


class OssDataLoader(BaseDataLoader):

    def __init__(self, dataset_context_config: DatasetContextConfig):
        super().__init__(dataset_context_config)

        self.data_files_builder: Optional[DataFilesManager] = None
        self.dataset: Optional[Union[Dataset, IterableDataset, DatasetDict,
                                     IterableDatasetDict]] = None
        self.builder: Optional[DatasetBuilder] = None
        self.data_files_manager: Optional[DataFilesManager] = None

    def process(self) -> None:
        """Sequential data-fetching pipeline: authorize -> build -> prepare_and_download -> post_process.
        Each step keeps dataset_context_config updated."""
        self._authorize()
        self._build()
        self._prepare_and_download()
        self._post_process()

    def _authorize(self) -> None:
        """Authorize access to the target dataset.
        Fetch credentials from the cache; sending them to the modelscope-hub
        will be supported in the future."""
        # TODO: obtain credentials from local cache when available.
        cookies = ModelScopeConfig.get_cookies()
        git_token = ModelScopeConfig.get_token()
        user_info = ModelScopeConfig.get_user_info()

        if not self.dataset_context_config.auth_config:
            auth_config = OssAuthConfig(
                cookies=cookies, git_token=git_token, user_info=user_info)
        else:
            auth_config = self.dataset_context_config.auth_config
            auth_config.cookies = cookies
            auth_config.git_token = git_token
            auth_config.user_info = user_info

        self.dataset_context_config.auth_config = auth_config

    def _build(self) -> None:
        """Sequential data-files building process: build_meta -> build_data_files,
        keeping dataset_context_config updated."""
        # Build meta data.
        meta_manager = DataMetaManager(self.dataset_context_config)
        meta_manager.fetch_meta_files()
        meta_manager.parse_dataset_structure()
        self.dataset_context_config = meta_manager.dataset_context_config

        # Build the data-files manager.
        self.data_files_manager = DataFilesManager(
            dataset_context_config=self.dataset_context_config)
        self.builder = self.data_files_manager.get_data_files_builder()

    def _prepare_and_download(self) -> None:
        """Fetch the data files from the modelscope dataset-hub."""
        dataset_py_script = self.dataset_context_config.data_meta_config.dataset_py_script
        dataset_formation = self.dataset_context_config.data_meta_config.dataset_formation
        dataset_name = self.dataset_context_config.dataset_name
        subset_name = self.dataset_context_config.subset_name
        version = self.dataset_context_config.version
        split = self.dataset_context_config.split
        data_dir = self.dataset_context_config.data_dir
        data_files = self.dataset_context_config.data_files
        cache_dir = self.dataset_context_config.cache_root_dir
        download_mode = self.dataset_context_config.download_mode
        input_kwargs = self.dataset_context_config.config_kwargs

        if self.builder is None and not dataset_py_script:
            # `raise` requires an exception instance, not a bare f-string.
            raise FileNotFoundError(
                f'meta-file: {dataset_name}.py not found on the modelscope hub.')

        if dataset_py_script and dataset_formation == DatasetFormations.hf_compatible:
            # A hub-hosted loading script in HF-compatible formation: delegate
            # to datasets.load_dataset.
            self.dataset = hf_data_loader(
                dataset_py_script,
                name=subset_name,
                revision=version,
                split=split,
                data_dir=data_dir,
                data_files=data_files,
                cache_dir=cache_dir,
                download_mode=download_mode.value,
                ignore_verifications=True,
                **input_kwargs)
        else:
            # Otherwise fetch via the builder assembled in _build().
            self.dataset = self.data_files_manager.fetch_data_files(
                self.builder)

    def _post_process(self) -> None:
        ...


class MaxComputeDataLoader(BaseDataLoader):
    """Data loader for the MaxCompute data source."""

    # TODO: MaxCompute data source to be supported.

    def __init__(self, dataset_context_config: DatasetContextConfig):
        super().__init__(dataset_context_config)
        self.dataset = None

    def process(self):
        ...

    def _authorize(self):
        ...

    def _build(self):
        ...

    def _prepare_and_download(self):
        ...

    def _post_process(self):
        ...
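

# --- Usage sketch (illustrative; not part of the original module) ---
# Assuming a DatasetContextConfig has already been assembled upstream (e.g. by
# the MsDataset loading entry point), an OssDataLoader would be driven roughly
# like this; the DatasetContextConfig construction below is hypothetical:
#
#     context_config = DatasetContextConfig(...)  # filled in by the caller
#     loader = OssDataLoader(context_config)
#     loader.process()          # authorize -> build -> prepare_and_download -> post_process
#     dataset = loader.dataset  # Dataset/DatasetDict or their iterable variants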