mirror of
https://gitcode.com/gh_mirrors/eas/EasyFace.git
synced 2026-05-15 03:38:04 +00:00
167 lines
6.4 KiB
Python
167 lines
6.4 KiB
Python
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
|
|
from abc import ABC, abstractmethod
|
|
from typing import Optional, Union
|
|
|
|
from datasets import (Dataset, DatasetBuilder, DatasetDict, IterableDataset,
|
|
IterableDatasetDict)
|
|
from datasets import load_dataset as hf_data_loader
|
|
|
|
from modelscope.hub.api import ModelScopeConfig
|
|
from modelscope.msdatasets.auth.auth_config import OssAuthConfig
|
|
from modelscope.msdatasets.context.dataset_context_config import \
|
|
DatasetContextConfig
|
|
from modelscope.msdatasets.data_files.data_files_manager import \
|
|
DataFilesManager
|
|
from modelscope.msdatasets.meta.data_meta_manager import DataMetaManager
|
|
from modelscope.utils.constant import DatasetFormations
|
|
|
|
|
|
class BaseDataLoader(ABC):
|
|
"""Base dataset loader to load data."""
|
|
def __init__(self, dataset_context_config: DatasetContextConfig):
|
|
self.dataset_context_config = dataset_context_config
|
|
|
|
@abstractmethod
|
|
def process(self):
|
|
"""The entity processing pipeline for fetching the data. """
|
|
raise NotImplementedError(
|
|
f'No default implementation provided for {BaseDataLoader.__name__}.process.'
|
|
)
|
|
|
|
@abstractmethod
|
|
def _authorize(self):
|
|
raise NotImplementedError(
|
|
f'No default implementation provided for {BaseDataLoader.__name__}._authorize.'
|
|
)
|
|
|
|
@abstractmethod
|
|
def _build(self):
|
|
raise NotImplementedError(
|
|
f'No default implementation provided for {BaseDataLoader.__name__}._build.'
|
|
)
|
|
|
|
@abstractmethod
|
|
def _prepare_and_download(self):
|
|
raise NotImplementedError(
|
|
f'No default implementation provided for {BaseDataLoader.__name__}._prepare_and_download.'
|
|
)
|
|
|
|
@abstractmethod
|
|
def _post_process(self):
|
|
raise NotImplementedError(
|
|
f'No default implementation provided for {BaseDataLoader.__name__}._post_process.'
|
|
)
|
|
|
|
|
|
class OssDataLoader(BaseDataLoader):
|
|
def __init__(self, dataset_context_config: DatasetContextConfig):
|
|
super().__init__(dataset_context_config)
|
|
|
|
self.data_files_builder: Optional[DataFilesManager] = None
|
|
self.dataset: Optional[Union[Dataset, IterableDataset, DatasetDict,
|
|
IterableDatasetDict]] = None
|
|
self.builder: Optional[DatasetBuilder] = None
|
|
self.data_files_manager: Optional[DataFilesManager] = None
|
|
|
|
def process(self) -> None:
|
|
""" Sequential data fetching process: authorize -> build -> prepare_and_download -> post_process,
|
|
to keep dataset_context_config updated. """
|
|
|
|
self._authorize()
|
|
self._build()
|
|
self._prepare_and_download()
|
|
self._post_process()
|
|
|
|
def _authorize(self) -> None:
|
|
""" Authorization of target dataset.
|
|
Get credentials from cache and send to the modelscope-hub in the future. """
|
|
# TODO: obtain credentials from loacl cache when available.
|
|
cookies = ModelScopeConfig.get_cookies()
|
|
git_token = ModelScopeConfig.get_token()
|
|
user_info = ModelScopeConfig.get_user_info()
|
|
|
|
if not self.dataset_context_config.auth_config:
|
|
auth_config = OssAuthConfig(cookies=cookies,
|
|
git_token=git_token,
|
|
user_info=user_info)
|
|
else:
|
|
auth_config = self.dataset_context_config.auth_config
|
|
auth_config.cookies = cookies
|
|
auth_config.git_token = git_token
|
|
auth_config.user_info = user_info
|
|
|
|
self.dataset_context_config.auth_config = auth_config
|
|
|
|
def _build(self) -> None:
|
|
""" Sequential data files building process: build_meta -> build_data_files , to keep context_config updated. """
|
|
# Build meta data
|
|
meta_manager = DataMetaManager(self.dataset_context_config)
|
|
meta_manager.fetch_meta_files()
|
|
meta_manager.parse_dataset_structure()
|
|
self.dataset_context_config = meta_manager.dataset_context_config
|
|
|
|
# Build data-files manager
|
|
self.data_files_manager = DataFilesManager(
|
|
dataset_context_config=self.dataset_context_config)
|
|
self.builder = self.data_files_manager.get_data_files_builder()
|
|
|
|
def _prepare_and_download(self) -> None:
|
|
""" Fetch data-files from modelscope dataset-hub. """
|
|
dataset_py_script = self.dataset_context_config.data_meta_config.dataset_py_script
|
|
dataset_formation = self.dataset_context_config.data_meta_config.dataset_formation
|
|
dataset_name = self.dataset_context_config.dataset_name
|
|
subset_name = self.dataset_context_config.subset_name
|
|
version = self.dataset_context_config.version
|
|
split = self.dataset_context_config.split
|
|
data_dir = self.dataset_context_config.data_dir
|
|
data_files = self.dataset_context_config.data_files
|
|
cache_dir = self.dataset_context_config.cache_root_dir
|
|
download_mode = self.dataset_context_config.download_mode
|
|
input_kwargs = self.dataset_context_config.config_kwargs
|
|
|
|
if self.builder is None and not dataset_py_script:
|
|
raise f'meta-file: {dataset_name}.py not found on the modelscope hub.'
|
|
|
|
if dataset_py_script and dataset_formation == DatasetFormations.hf_compatible:
|
|
self.dataset = hf_data_loader(dataset_py_script,
|
|
name=subset_name,
|
|
revision=version,
|
|
split=split,
|
|
data_dir=data_dir,
|
|
data_files=data_files,
|
|
cache_dir=cache_dir,
|
|
download_mode=download_mode.value,
|
|
ignore_verifications=True,
|
|
**input_kwargs)
|
|
else:
|
|
self.dataset = self.data_files_manager.fetch_data_files(
|
|
self.builder)
|
|
|
|
def _post_process(self) -> None:
|
|
...
|
|
|
|
|
|
class MaxComputeDataLoader(BaseDataLoader):
|
|
"""Data loader for MaxCompute data source."""
|
|
|
|
# TODO: MaxCompute data source to be supported .
|
|
def __init__(self, dataset_context_config: DatasetContextConfig):
|
|
super().__init__(dataset_context_config)
|
|
self.dataset = None
|
|
|
|
def process(self):
|
|
...
|
|
|
|
def _authorize(self):
|
|
...
|
|
|
|
def _build(self):
|
|
...
|
|
|
|
def _prepare_and_download(self):
|
|
...
|
|
|
|
def _post_process(self):
|
|
...
|