diff --git a/tensorflow_datasets/community-datasets.toml b/tensorflow_datasets/community-datasets.toml
new file mode 100644
index 00000000000..c60bf6f4c0f
--- /dev/null
+++ b/tensorflow_datasets/community-datasets.toml
@@ -0,0 +1,3 @@
+[Namespaces]
+# You can add your own datasets here to register them in TFDS. See details
+# at: https://www.tensorflow.org/datasets/community
diff --git a/tensorflow_datasets/core/community/__init__.py b/tensorflow_datasets/core/community/__init__.py
new file mode 100644
index 00000000000..f8ef68a5335
--- /dev/null
+++ b/tensorflow_datasets/core/community/__init__.py
@@ -0,0 +1,30 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Community public API."""
+
+from tensorflow_datasets.core.community.register import community_config_path
+from tensorflow_datasets.core.community.register import COMMUNITY_EXPORTED_PATH
+from tensorflow_datasets.core.community.dataset_spec import DatasetSource
+from tensorflow_datasets.core.community.dataset_spec import DatasetSpec
+from tensorflow_datasets.core.community.dataset_spec import GithubSource
+
+__all__ = [
+    'community_config_path',
+    'COMMUNITY_EXPORTED_PATH',
+    'DatasetSource',
+    'DatasetSpec',
+    'GithubSource',
+]
diff --git a/tensorflow_datasets/core/community/dataset_spec.py b/tensorflow_datasets/core/community/dataset_spec.py
new file mode 100644
index 00000000000..321737e4567
--- /dev/null
+++ b/tensorflow_datasets/core/community/dataset_spec.py
@@ -0,0 +1,137 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Community utils."""
+
+import abc
+from typing import ClassVar, Dict
+
+import dataclasses
+from tensorflow_datasets.core import github_api
+from tensorflow_datasets.core import utils
+
+Json = utils.Json
+
+
+class DatasetSource(abc.ABC):
+  """Source indicating the dataset location (abstract class).
+
+  Additional user-defined sources can be registered by subclassing this class.
+
+  Attributes:
+    SCHEME: URI scheme (e.g. `github://`).
+ """ + + # Abstract class attribute + SCHEME: ClassVar[str] + + # Use non-mutable dict to prevent collision if two subclass try to use the + # same scheme + _subclasses: Dict[str, 'DatasetSource'] = utils.NonMutableDict() + + def __init_subclass__(cls, **kwargs): + """Subclasses are automatically registered.""" + super().__init_subclass__(**kwargs) + cls._subclasses[cls.SCHEME] = cls # Subclasses should have a unique SCHEME + + @classmethod + @abc.abstractmethod + def from_json(cls, value: Json) -> 'DatasetSource': + """Factory which will instancite the source from the registered class. + + ``` + source = DatasetSource.from_json({'type': 'github://', ...}) + assert isinstance(source, GithubSource) + ``` + + Args: + value: Json dict containing the constructor information. + + Returns: + The created source instance. + """ + source_type = dict(value).pop('scheme') + subclass = cls._subclasses.get(source_type) + if subclass is None: + raise ValueError( + f'Invalid source type {source_type} of: {value}\n' + f'Supported: {list(cls._subclasses)}' + ) + return subclass.from_json(value) + + @abc.abstractmethod + def to_json(self) -> Json: + """Exports the object to Json. Subclasses should call `super()`.""" + return {'scheme': self.SCHEME} + + +@dataclasses.dataclass +class GithubSource(DatasetSource): + """Dataset loaded from Github. + + Attributes: + path: The github path of the dataset + SCHEME: See parent class + """ + path: github_api.GithubPath + + SCHEME: ClassVar[str] = 'github://' # pylint: disable=invalid-name + + @classmethod + def from_json(cls, value: Json): + return cls(path=github_api.GithubPath(value['path'])) + + def to_json(self) -> Json: + value = super().to_json() + value['path'] = str(self.path) + return value + + +@dataclasses.dataclass(frozen=True) +class DatasetSpec: + """Contains specs required to lazily load a dataset. + + The specs match the `COMMUNITY_EXPORTED_PATH` content (one row == one spec) + + Attributes: + name: dataset name (e.g. `mnist`) + namespace: user/organization namespace (e.g. `mlds`) + source: Location of the dataset (e.g. Github) + """ + name: str + namespace: str + source: DatasetSource + + @classmethod + def from_json(cls, value: Json) -> 'DatasetSpec': + """Load the specs from a Json dict.""" + return cls( + name=value['name'], + namespace=value['namespace'], + source=DatasetSource.from_json(value['source']), + ) + + def to_json(self) -> Json: + """Export the specs as a Json dict.""" + return { + 'name': self.name, + 'namespace': self.namespace, + 'source': self.source.to_json(), + } + + @property + def cannonical_name(self) -> str: + """Returns the `namespace/dataset_name` string.""" + return f'{self.namespace}/{self.name}' diff --git a/tensorflow_datasets/core/community/dataset_spec_test.py b/tensorflow_datasets/core/community/dataset_spec_test.py new file mode 100644 index 00000000000..993f589bec9 --- /dev/null +++ b/tensorflow_datasets/core/community/dataset_spec_test.py @@ -0,0 +1,58 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/tensorflow_datasets/core/community/dataset_spec_test.py b/tensorflow_datasets/core/community/dataset_spec_test.py
new file mode 100644
index 00000000000..993f589bec9
--- /dev/null
+++ b/tensorflow_datasets/core/community/dataset_spec_test.py
@@ -0,0 +1,58 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for tensorflow_datasets.core.community.dataset_spec."""
+
+from tensorflow_datasets.core import github_api
+from tensorflow_datasets.core.community import dataset_spec
+
+
+def test_import_export_json_source():
+  p = github_api.GithubPath('/tensorflow/graphics/tree/path/to/datasets')
+  source = dataset_spec.GithubSource(p)
+
+  json_source = source.to_json()
+  assert json_source == {
+      'scheme': 'github://',
+      'path': '/tensorflow/graphics/tree/path/to/datasets',
+  }
+
+  reconstructed_source = dataset_spec.DatasetSource.from_json(json_source)
+  assert isinstance(reconstructed_source, dataset_spec.GithubSource)
+  assert json_source == reconstructed_source.to_json()
+
+
+def test_import_export_json_spec():
+  p = github_api.GithubPath('/tensorflow/graphics/tree/path/to/datasets')
+  spec = dataset_spec.DatasetSpec(
+      name='mnist',
+      namespace='tensorflow_graphics',
+      source=dataset_spec.GithubSource(p),
+  )
+  assert spec.canonical_name == 'tensorflow_graphics/mnist'
+
+  json_spec = spec.to_json()
+  assert json_spec == {
+      'name': 'mnist',
+      'namespace': 'tensorflow_graphics',
+      'source': {
+          'scheme': 'github://',
+          'path': '/tensorflow/graphics/tree/path/to/datasets',
+      },
+  }
+
+  reconstructed_spec = dataset_spec.DatasetSpec.from_json(json_spec)
+  assert isinstance(reconstructed_spec.source, dataset_spec.GithubSource)
+  assert json_spec == reconstructed_spec.to_json()
diff --git a/tensorflow_datasets/core/community/register.py b/tensorflow_datasets/core/community/register.py
new file mode 100644
index 00000000000..cbeeda00fb3
--- /dev/null
+++ b/tensorflow_datasets/core/community/register.py
@@ -0,0 +1,29 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Community datasets register."""
+
+from tensorflow_datasets.core import utils
+
+
+# Community datasets are parsed from the config files and exported on GCS
+COMMUNITY_EXPORTED_PATH = utils.gcs_path('community-datasets-list.jsonl')
+
+
+def community_config_path() -> str:
+  """Returns the community config path."""
+  # The path is resolved dynamically: the config file is only required by
+  # specific scripts, so it may not always be present.
+  return utils.get_tfds_path('community-datasets.toml')
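For context on how `COMMUNITY_EXPORTED_PATH` is meant to be consumed: each line of the exported `.jsonl` file deserializes back into a `DatasetSpec`. A sketch, assuming the file has already been exported and is readable — the read-back loop itself is not part of this change:

```python
# Sketch: reading the exported registry back into specs. Assumes the JSONL
# file written by the deployment script exists (one spec per line).
import json

import tensorflow as tf
from tensorflow_datasets.core import community

with tf.io.gfile.GFile(str(community.COMMUNITY_EXPORTED_PATH)) as f:
  specs = [
      community.DatasetSpec.from_json(json.loads(line))
      for line in f
      if line.strip()
  ]

for spec in specs:
  print(spec.canonical_name)  # e.g. `nlp/mnist`
```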
diff --git a/tensorflow_datasets/core/github_api/__init__.py b/tensorflow_datasets/core/github_api/__init__.py
new file mode 100644
index 00000000000..ca2f823bf03
--- /dev/null
+++ b/tensorflow_datasets/core/github_api/__init__.py
@@ -0,0 +1,22 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Github util API."""
+
+from tensorflow_datasets.core.github_api.github_path import GithubPath
+
+__all__ = [
+    'GithubPath',
+]
diff --git a/tensorflow_datasets/core/github_api/github_path.py b/tensorflow_datasets/core/github_api/github_path.py
new file mode 100644
index 00000000000..797b2b27621
--- /dev/null
+++ b/tensorflow_datasets/core/github_api/github_path.py
@@ -0,0 +1,327 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Github pathlib-like util."""
+
+import enum
+import functools
+import os
+import pathlib
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
+
+import requests
+
+# TODO(pytype): Should be recursive
+Json = Union[str, int, bool, float, List[Any], Dict[str, Any]]
+
+
+class _PathType(enum.Enum):
+  """Path type (See: https://developer.github.com/v3/git/trees/#tree-object).
+
+  Attributes:
+    FILE: File
+    DIRECTORY: Directory
+    SUBMODULE: Git submodule
+      (https://git-scm.com/book/en/v2/Git-Tools-Submodules)
+  """
+  FILE = enum.auto()
+  DIRECTORY = enum.auto()
+  SUBMODULE = enum.auto()
+
+
+class _PathMetadata:
+  """Class storing the Github metadata for a file/directory.
+
+  Note:
+
+  * `_PathMetadata` objects are cached, so two paths pointing to the same
+    file trigger only a single query.
+  * Attributes are dynamically fetched from the github API only when
+    requested, to avoid unnecessary queries.
+  * Directories also cache entries for their children, reducing the number
+    of queries. For instance, `[f for f in p.iterdir() if f.is_file()]`
+    uses a single query in `iterdir()`, rather than one per `is_file()`.
+
+  Attributes:
+    repo: e.g. `tensorflow/datasets`
+    branch: e.g. `master`
+    subpath: e.g. `core/__init__.py`
+  """
+
+  @staticmethod
+  @functools.lru_cache(maxsize=None)
+  def from_cache(path: str) -> '_PathMetadata':
+    """Factory which caches metadata (to avoid querying the API twice)."""
+    # In the future, the cache might contain the full file content, which
+    # might grow big. We should add a cleanup mechanism (use weakref?).
+    return _PathMetadata(path, private=True)
+
+  def __init__(self, path: str, *, private=False):
+    if not private:
+      raise AssertionError(
+          'Metadata should be created using `_PathMetadata.from_cache`'
+      )
+    repo, branch, subpath = _parse_github_path(path)  # pytype: disable=name-error
+
+    # Read-only attributes
+    self._path: str = path
+    self.repo: str = repo  # e.g. `tensorflow/datasets`
+    self.branch: str = branch  # e.g. `master`
+    self.subpath: str = subpath  # e.g. 'core/__init__.py'
+
+    # Dynamically loaded properties
+    self._exists: Optional[bool] = None
+    self._type: Optional[_PathType] = None  # FILE, DIRECTORY, SUBMODULE
+    self._childs: Optional[List[str]] = None  # ['README.md', 'docs', ...]
+
+  @property
+  def type(self) -> _PathType:
+    """Type of the path (file, dir, submodule)."""
+    if not self._type:
+      self._init_and_cache_content()
+    return self._type
+
+  def _set_type_from_str(self, value: str) -> None:
+    """Sets or validates the file type.
+
+    This is called in `_init_and_cache_content`, either by `self` or by the
+    parent directory.
+
+    If the type is already set, this function makes sure the new type
+    matches.
+
+    Args:
+      value: The github type string (see:
+        https://developer.github.com/v3/repos/contents/ for available values)
+    """
+    str_to_type = {
+        'file': _PathType.FILE,
+        'dir': _PathType.DIRECTORY,
+    }
+    if value not in str_to_type:
+      raise ValueError(f'Unsupported file type: {value} for {self._path}')
+    new_type = str_to_type[value]
+    if self._type and self._type is not new_type:
+      raise AssertionError(
+          f'Cannot overwrite type {self._type} with {new_type} for {self._path}'
+      )
+    self._type = new_type
+
+  def listdir(self) -> List[str]:
+    """Returns the filenames in the directory (e.g. `['.gitignore', 'src']`)."""
+    if self.type != _PathType.DIRECTORY:
+      raise NotADirectoryError(f'{self._path} is not a directory.')
+    # self.type could have been computed by the parent dir, so
+    # `_init_and_cache_content` may not have been called yet.
+    if self._childs is None:
+      self._init_and_cache_content()
+    return self._childs
+
+  def exists(self) -> bool:
+    """Returns True if the file/dir exists."""
+    if self._exists is not None:
+      return self._exists
+    elif self._type:  # If the type has been set, the file/dir exists
+      return True
+    else:
+      try:
+        self._init_and_cache_content()
+        self._exists = True
+      except FileNotFoundError:
+        self._exists = False
+      return self._exists
+
+  def _init_and_cache_content(self) -> None:
+    """Queries github to get the file/directory content.
+
+    See doc at: https://developer.github.com/v3/repos/contents/
+
+    Note:
+
+    * After this function is called, `_type` and `_childs` (for directories)
+      are guaranteed to be initialized.
+    * For directories, it creates a new `_PathMetadata` entry per child
+      (to cache the filetype).
+    """
+    # e.g. 'https://api.github.com/repos/tensorflow/datasets/contents/docs'
+    url = (
+        f'https://api.github.com/repos/{self.repo}/contents/{self.subpath}'
+        f'?ref={self.branch}'
+    )
+    data = self._query_github(url)
+    if isinstance(data, list):  # Directory
+      self._init_directory(data)
+    elif isinstance(data, dict):  # File
+      self._init_file(data)
+    else:
+      raise AssertionError(f'Unknown content: {data}')
+
+  def _init_directory(self, data: Json) -> None:
+    """Sets the dynamic fields of a directory."""
+    self._type = _PathType.DIRECTORY
+    self._childs = [f['name'] for f in data]
+
+    # Create or update the child metadata type
+    for f in data:
+      metadata = _PathMetadata.from_cache(f"{self._path}/{f['name']}")
+      metadata._set_type_from_str(f['type'])  # pylint: disable=protected-access
+
+  def _init_file(self, data: Json) -> None:
+    self._set_type_from_str(data['type'])
+
+  def _query_github(self, url: str) -> Json:
+    """Launches a github API query and returns the result."""
+    # Get the secret API token to avoid the 60 calls/hour limit
+    # To get the current quota or test the token:
+    # curl -H "Authorization: token ${GITHUB_TOKEN}" https://api.github.com/rate_limit  # pylint: disable=line-too-long
+    token = os.environ.get('GITHUB_TOKEN')
+    headers = {}
+    if token:
+      headers['Authorization'] = f'token {token}'
+    resp = requests.get(url, headers=headers)
+    if resp.status_code != 200:
+      raise FileNotFoundError(
+          f'Request failed for {self._path}:\n'
+          f' Request: {url}\n'
+          f' Error: {resp.status_code}\n'
+          f' Reason: {resp.content}',
+      )
+    return resp.json()
+
+  def __repr__(self) -> str:
+    return f'{type(self).__name__}({self._path})'
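Because `from_cache` is backed by `functools.lru_cache`, every path string maps to a single metadata object, which is what lets independent path instances share query results. An illustrative sketch of the behaviour this buys (mirroring the `_metadata is` assertion in the tests below):

```python
# Illustrative: two independently-created paths pointing to the same
# location share one `_PathMetadata`, so GitHub is queried at most once.
from tensorflow_datasets.core.github_api import github_path

a = github_path.GithubPath('/tensorflow/datasets/tree/master/docs')
b = github_path.GithubPath.from_repo('tensorflow/datasets') / 'docs'

assert str(a) == str(b) == '/tensorflow/datasets/tree/master/docs'
assert a._metadata is b._metadata  # Same cached entry, no duplicate queries
```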
+
+
+class GithubPath(pathlib.PurePosixPath):
+  """`pathlib.Path`-like object for manipulating Github paths.
+
+  Example:
+
+  ```
+  path = GithubPath.from_repo('tensorflow/datasets')
+  path = path / 'docs' / 'catalog'
+  assert path.is_dir()
+  datasets = [
+      p.name for p in path.iterdir() if p.match('*.md')
+  ]
+
+  path = GithubPath('/tensorflow/datasets/tree/master/docs/README.md')
+  assert path.subpath == 'docs/README.md'
+  assert path.repo == 'tensorflow/datasets'
+  assert path.branch == 'master'
+  ```
+  """
+  _metadata: _PathMetadata  # Additional file metadata
+
+  @classmethod
+  def from_repo(cls, repo: str, branch: str = 'master') -> 'GithubPath':
+    """Factory to create a GithubPath from a repo name.
+
+    Args:
+      repo: Repo name (e.g. `tensorflow/datasets`)
+      branch: Branch name (e.g. `master`, 'v1.2.0', '0d240e8b85c'). Defaults
+        to master.
+
+    Returns:
+      github_path: The repository root dir at the given branch
+    """
+    return cls(f'/{repo}/tree/{branch}')
+
+  def _init(self, *args, **kwargs):
+    """Constructor."""
+    # Currently, the best way of subclassing `pathlib` objects is to
+    # overload `_init` (see: https://bugs.python.org/issue41109)
+    # Future Python versions may have a cleaner Path extension system:
+    # https://discuss.python.org/t/make-pathlib-extensible/3428/24
+    super()._init(*args, **kwargs)  # pytype: disable=attribute-error
+    # The metadata object manages the cache and dynamically queries the
+    # Github API as needed.
+    self._metadata = _PathMetadata.from_cache(str(self))
+
+  @property
+  def subpath(self) -> str:
+    """The inner path (e.g. `core/__init__.py`)."""
+    return self._metadata.subpath
+
+  @property
+  def repo(self) -> str:
+    """The repository identifier (e.g. `tensorflow/datasets`)."""
+    return self._metadata.repo
+
+  @property
+  def branch(self) -> str:
+    """The branch (e.g. `master`, `v2`, `43bbad116df`, ...)."""
+    return self._metadata.branch
+
+  def as_raw_url(self) -> str:
+    """Returns the raw content url (https://raw.githubusercontent.com)."""
+    return (
+        'https://raw.githubusercontent.com/'
+        f'{self.repo}/{self.branch}/{self.subpath}'
+    )
+
+  def iterdir(self) -> Iterator['GithubPath']:
+    """Yields the sub-paths."""
+    for filename in self._metadata.listdir():
+      yield self / filename
+
+  def is_dir(self) -> bool:
+    """Returns True if the path is a directory or submodule."""
+    return self._metadata.type in (_PathType.DIRECTORY, _PathType.SUBMODULE)
+
+  def is_file(self) -> bool:
+    """Returns True if the path is a file."""
+    return self._metadata.type is _PathType.FILE
+
+  def exists(self) -> bool:
+    """Returns True if the path exists."""
+    return self._metadata.exists()
+
+
+def _parse_github_path(path: str) -> Tuple[str, str, str]:
+  """Parses the absolute github path.
+
+  Args:
+    path: The full github path.
+
+  Returns:
+    repo: The repository identifier.
+    branch: Repository branch.
+    subpath: The inner path.
+
+  Raises:
+    ValueError: If the path is invalid
+  """
+  if path.endswith('/'):
+    raise ValueError(
+        f'Invalid github path: {path}. Trailing `/` not supported.'
+    )
+  parts = pathlib.PurePosixPath(path).parts
+  if len(parts) < 5:
+    raise ValueError(
+        f'Invalid github path: {path}. Expected format: '
+        '`/<owner>/<name>/tree/<branch>[/<sub-path>]`.'
+    )
+
+  # '/', 'tensorflow', 'datasets', 'tree', 'master', ...
+  root, owner, repo, tree, branch, *subpath = parts
+  if root != '/' or tree != 'tree':
+    raise ValueError(
+        f'Invalid github path: {path}. Expected format: '
+        '`/<owner>/<name>/tree/<branch>[/<sub-path>]`. Note that `/blob/` '
+        'isn\'t accepted. Only `/tree/`.'
+    )
+
+  return f'{owner}/{repo}', branch, '/'.join(subpath)
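`as_raw_url` is the bridge between the pathlib-style API and actual file content. A hedged sketch of downloading a file through it — `as_raw_url` comes from this change, while fetching the content with `requests` is illustrative (the change itself never downloads file bodies):

```python
# Sketch: fetching file content via the raw-content URL.
import requests

from tensorflow_datasets.core import github_api

path = github_api.GithubPath.from_repo('tensorflow/datasets', 'v3.1.0')
readme = path / 'README.md'

url = readme.as_raw_url()
# https://raw.githubusercontent.com/tensorflow/datasets/v3.1.0/README.md
resp = requests.get(url)
resp.raise_for_status()
content = resp.text  # Raw file content (raw host, not the rate-limited REST API)
```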
diff --git a/tensorflow_datasets/core/github_api/github_path_test.py b/tensorflow_datasets/core/github_api/github_path_test.py
new file mode 100644
index 00000000000..73bd0229f92
--- /dev/null
+++ b/tensorflow_datasets/core/github_api/github_path_test.py
@@ -0,0 +1,190 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Github API util tests."""
+
+import contextlib
+from unittest import mock
+
+import pytest
+
+from tensorflow_datasets.core.github_api import github_path
+
+
+_SKIP_NON_HERMETIC = False
+
+# Non-hermetic tests are explicitly marked, and skipped if
+# `_SKIP_NON_HERMETIC` is True.
+non_hermetic_test = pytest.mark.skipif(
+    _SKIP_NON_HERMETIC, reason='Non-hermetic test skipped.',
+)
+
+_original_query_github = github_path._PathMetadata._query_github
+
+
+@pytest.fixture(scope='module', autouse=True)
+def assert_no_api_call():
+  """Globally disables github API calls."""
+  with mock.patch.object(
+      github_path._PathMetadata,
+      '_query_github',
+      side_effect=AssertionError('Forbidden API call'),
+  ):
+    yield
+
+
+@contextlib.contextmanager
+def enable_api_call():
+  """Contextmanager which locally re-enables API calls."""
+  with mock.patch.object(
+      github_path._PathMetadata, '_query_github', _original_query_github
+  ):
+    yield
+
+
+def test_parse_github_path():
+  url = '/tensorflow/datasets/tree/master/docs/README.md'
+  repo, branch, path = github_path._parse_github_path(url)
+  assert repo == 'tensorflow/datasets'
+  assert branch == 'master'
+  assert path == 'docs/README.md'
+
+  url = '/tensorflow/datasets/tree/master'
+  repo, branch, path = github_path._parse_github_path(url)
+  assert repo == 'tensorflow/datasets'
+  assert branch == 'master'
+  assert path == ''  # pylint: disable=g-explicit-bool-comparison
+
+
+def test_invalid_github_path():
+
+  with pytest.raises(ValueError, match='Invalid github path'):
+    github_path.GithubPath()
+
+  with pytest.raises(ValueError, match='Invalid github path'):
+    github_path.GithubPath('')
+
+  with pytest.raises(ValueError, match='Invalid github path'):
+    github_path.GithubPath('/not/a/path')
+
+  with pytest.raises(ValueError, match='Invalid github path'):
+    github_path.GithubPath('/tensorflow/tree/master/docs/README.md')
+
+  # `blob` isn't accepted for consistency between paths.
+  with pytest.raises(ValueError, match='Invalid github path'):
+    github_path.GithubPath('/tensorflow/datasets/blob/master/docs/README.md')
+
+  p = github_path.GithubPath('/tensorflow/datasets/tree/master/docs/README.md')
+  p = p.parent  # /docs
+  p = p.parent  # /
+  with pytest.raises(ValueError, match='Invalid github path'):
+    p.parent  # pylint: disable=pointless-statement
+
+
+def test_github_path_purepath():
+  """Tests that pathlib methods work as expected."""
+  p = github_path.GithubPath('/tensorflow/datasets/tree/master/')
+  sub_p = p / 'some_folder'
+  assert isinstance(sub_p, github_path.GithubPath)
+  assert str(p) == '/tensorflow/datasets/tree/master'
+  assert p == github_path.GithubPath.from_repo('tensorflow/datasets')
+
+
+def test_github_path_as_url():
+  p = github_path.GithubPath.from_repo('tensorflow/datasets', 'v3.1.0')
+  p /= 'README.md'
+  expected = 'https://raw.githubusercontent.com/tensorflow/datasets/v3.1.0/README.md'
+  assert p.as_raw_url() == expected
+
+
+@non_hermetic_test
+def test_github_api_listdir():
+  """Tests querying the github API."""
+  # PurePath ops do not trigger API calls
+  p = github_path.GithubPath.from_repo('tensorflow/datasets', 'v3.1.0')
+  p = p / 'tensorflow_datasets' / 'testing'
+
+  with enable_api_call():
+    sub_dirs = sorted(p.iterdir())
+
+  # The `listdir` call caches the filetype of all children
+  all_dir_names = [d.name for d in sub_dirs if d.is_dir()]
+  all_file_names = [d.name for d in sub_dirs if d.is_file()]
+  all_names = [d.name for d in sub_dirs]
+
+  with pytest.raises(NotADirectoryError):
+    list((p / '__init__.py').iterdir())
+
+  assert all_names == [
+      '__init__.py',
+      'dataset_builder_testing.py',
+      'dataset_builder_testing_test.py',
+      'fake_data_generation',
+      'fake_data_utils.py',
+      'generate_archives.sh',
+      'metadata',
+      'mocking.py',
+      'mocking_test.py',
+      'test_case.py',
+      'test_data',
+      'test_utils.py',
+      'test_utils_test.py',
+  ]
+  assert all_dir_names == [
+      'fake_data_generation',
+      'metadata',
+      'test_data',
+  ]
+  assert all_file_names == [
+      '__init__.py',
+      'dataset_builder_testing.py',
+      'dataset_builder_testing_test.py',
+      'fake_data_utils.py',
+      'generate_archives.sh',
+      'mocking.py',
+      'mocking_test.py',
+      'test_case.py',
+      'test_utils.py',
+      'test_utils_test.py',
+  ]
+
+
+@non_hermetic_test
+def test_github_api_exists():
+  """Tests querying the github API."""
+  p = github_path.GithubPath.from_repo('tensorflow/datasets', 'v3.1.0')
+  with enable_api_call():
+    assert p.exists()
+    assert not (p / 'unknown_dir').exists()
+
+  readme = p / 'README.md'
+  core = p / 'tensorflow_datasets' / 'core'
+  with enable_api_call():
+    assert readme.is_file()
+    assert core.is_dir()
+
+  # Data should have been cached (no API calls required)
+  assert not readme.is_dir()
+  assert not core.is_file()
+  assert readme.exists()
+  assert core.exists()
+  # Recreating a new Path reuses the cache
+  assert (core.parent.parent / 'README.md').is_file()
+  assert (core.parent.parent / 'README.md')._metadata is readme._metadata
+
+
+def test_assert_no_api_call():
+  with pytest.raises(AssertionError, match='Forbidden API call'):
+    github_path.GithubPath.from_repo('tensorflow/datasets', 'v1.0.0').exists()
diff --git a/tensorflow_datasets/core/utils/__init__.py b/tensorflow_datasets/core/utils/__init__.py
index d3ad53125e4..37bbbc70372 100644
--- a/tensorflow_datasets/core/utils/__init__.py
+++ b/tensorflow_datasets/core/utils/__init__.py
@@ -22,6 +22,7 @@
 from tensorflow_datasets.core.utils.py_utils import *
 from tensorflow_datasets.core.utils.tf_utils import *
 from tensorflow_datasets.core.utils.tqdm_utils import *
+from tensorflow_datasets.core.utils.type_utils import *
 from tensorflow_datasets.core.utils.version import Experiment
 from tensorflow_datasets.core.utils.version import Version
 # pylint: enable=wildcard-import
diff --git a/tensorflow_datasets/core/utils/type_utils.py b/tensorflow_datasets/core/utils/type_utils.py
new file mode 100644
index 00000000000..532d5f05699
--- /dev/null
+++ b/tensorflow_datasets/core/utils/type_utils.py
@@ -0,0 +1,27 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Type utils."""
+
+from typing import Any, Dict, List, Union
+
+__all__ = [
+    'Json',
+    'JsonValue',
+]
+
+# TODO(pytype): Should use a recursive type
+JsonValue = Union[str, int, float, bool, Dict[str, Any], List[Any]]
+Json = Dict[str, JsonValue]
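Since `type_utils` is re-exported through `tensorflow_datasets.core.utils` (via the wildcard import above), code elsewhere can annotate JSON-shaped data directly with the alias. A small illustrative sketch; the `serialize`/`deserialize` helpers below are hypothetical:

```python
# Illustrative only: the aliases give readable signatures for JSON-shaped
# data, matching what the `to_json()`/`from_json()` methods exchange.
import json

from tensorflow_datasets.core import utils


def serialize(value: utils.Json) -> str:
  """Encodes a JSON-shaped dict (e.g. a `DatasetSpec.to_json()` result)."""
  return json.dumps(value)


def deserialize(raw: str) -> utils.Json:
  return json.loads(raw)


assert deserialize(serialize({'name': 'mnist'})) == {'name': 'mnist'}
```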
diff --git a/tensorflow_datasets/scripts/deployment/export_community_datasets.py b/tensorflow_datasets/scripts/deployment/export_community_datasets.py
new file mode 100644
index 00000000000..d5652263a14
--- /dev/null
+++ b/tensorflow_datasets/scripts/deployment/export_community_datasets.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Script which parses the registered repositories and saves found datasets."""
+
+import itertools
+import json
+import pathlib
+from typing import List
+
+from absl import app
+import tensorflow as tf
+
+from tensorflow_datasets.core import community
+from tensorflow_datasets.core import github_api
+import toml
+
+
+def _is_dataset_path(ds_path: github_api.GithubPath) -> bool:
+  """Returns True if the given path corresponds to a dataset.
+
+  Currently a simple heuristic is used. This function checks that the path
+  has the following structure:
+
+  ```
+  <ds_name>/
+      <ds_name>.py
+  ```
+
+  Args:
+    ds_path: Path of the dataset module
+
+  Returns:
+    True if the path matches the expected file structure
+  """
+  return ds_path.is_dir() and (ds_path / f'{ds_path.name}.py').exists()
+
+
+def _list_namespace_ds_specs(
+    namespace: str,
+    path: str,
+) -> List[community.DatasetSpec]:
+  """Returns the dataset specs found in a specific directory.
+
+  The directory should have the following structure:
+
+  ```
+  <path>/
+      <dataset0>/
+      <dataset1>/
+      ...
+  ```
+
+  Additional files or folders which are not detected as datasets will be
+  ignored (e.g. `__init__.py`).
+
+  Args:
+    namespace: Namespace of the datasets
+    path: The directory path containing the datasets.
+
+  Returns:
+    ds_specs: The dataset specs found in the directory (sorted for
+      determinism).
+
+  Raises:
+    FileNotFoundError: If the path cannot be reached.
+  """
+  path = github_api.GithubPath(path)
+  if not path.exists():
+    # Should be fault-tolerant in the future
+    raise FileNotFoundError(f'Could not find datasets at {path}')
+  all_specs = [
+      community.DatasetSpec(  # pylint: disable=g-complex-comprehension
+          name=ds_path.name,
+          namespace=namespace,
+          source=community.GithubSource(ds_path),
+      ) for ds_path in path.iterdir() if _is_dataset_path(ds_path)
+  ]
+  return sorted(all_specs, key=lambda spec: spec.canonical_name)
+
+
+def _find_community_ds_specs(
+    config_path: pathlib.Path,
+) -> List[community.DatasetSpec]:
+  """Finds all namespaces/datasets from the config.
+
+  The config should contain the instructions in the following format:
+
+  ```
+  [Namespaces]
+  <namespace0> = '/<owner>/<repo>/tree/<branch>[/<path>]'
+  <namespace1> = '/<owner>/<repo>/tree/<branch>[/<path>]'
+  ```
+
+  Args:
+    config_path: Path to the config file containing the lookup instructions.
+
+  Returns:
+    ds_specs: list of all found datasets.
+  """
+  config = toml.load(config_path)
+  all_specs = itertools.chain.from_iterable(
+      _list_namespace_ds_specs(namespace, path)
+      for namespace, path in config['Namespaces'].items()
+  )
+  return sorted(all_specs, key=lambda spec: spec.canonical_name)
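For reference, here is what `_find_community_ds_specs` sees after `toml.load`. A small sketch; the two namespace entries are hypothetical examples:

```python
# Illustrative: the parsed `[Namespaces]` table maps namespace -> repo path.
import toml

config = toml.loads("""
[Namespaces]
tensorflow_graphics = '/tensorflow/graphics/tree/master/tensorflow_graphics/datasets'
nlp = '/huggingface/datasets/tree/master/datasets'
""")
assert config['Namespaces'] == {
    'tensorflow_graphics':
        '/tensorflow/graphics/tree/master/tensorflow_graphics/datasets',
    'nlp': '/huggingface/datasets/tree/master/datasets',
}
```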
+
+
+def _save_community_ds_specs(
+    file_path: str, ds_specs: List[community.DatasetSpec]
+) -> None:
+  """Saves all loaded dataset specs.
+
+  The saved file will have the following `.jsonl` format (one spec per line):
+
+  ```
+  {"name": "dataset0", "namespace": "namespace0", "source": {...}}
+  {"name": "dataset1", "namespace": "namespace0", "source": {...}}
+  ...
+  ```
+
+  Args:
+    file_path: `.jsonl` destination to which the dataset specs are saved
+    ds_specs: Dataset specs to save
+  """
+  # TODO(tfds): Replace GFile by a pathlib-like abstraction for GCS.
+  with tf.io.gfile.GFile(file_path, 'w') as f:
+    for spec in ds_specs:
+      f.write(json.dumps(spec.to_json()))
+      f.write('\n')
+
+
+def export_community_datasets(in_path: pathlib.Path, out_path: str) -> None:
+  """Exports community datasets.
+
+  Args:
+    in_path: Config path containing the namespaces and dataset lookup
+      instructions.
+    out_path: File to which all detected datasets are saved. Previous content
+      is erased.
+  """
+  ds_specs = _find_community_ds_specs(in_path)
+  _save_community_ds_specs(out_path, ds_specs)
+
+
+def main(_):
+  config_path = pathlib.Path(community.community_config_path())
+  exported_path = community.COMMUNITY_EXPORTED_PATH
+  export_community_datasets(in_path=config_path, out_path=exported_path)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/tensorflow_datasets/scripts/deployment/export_community_datasets_test.py b/tensorflow_datasets/scripts/deployment/export_community_datasets_test.py
new file mode 100644
index 00000000000..96eeebe4a7e
--- /dev/null
+++ b/tensorflow_datasets/scripts/deployment/export_community_datasets_test.py
@@ -0,0 +1,86 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for external datasets."""
+
+import pathlib
+import string
+import textwrap
+from typing import List
+
+from unittest import mock
+
+from tensorflow_datasets.core import github_api
+from tensorflow_datasets.scripts.deployment import export_community_datasets
+
+
+def _write_dataset_files(
+    root_path: pathlib.Path, namespace: str, datasets: List[str]
+) -> str:
+  """Writes the repo content containing the datasets."""
+  repo_path = root_path / namespace
+  # Create all datasets
+  for ds_name in datasets:
+    ds_path = repo_path / ds_name / f'{ds_name}.py'
+    ds_path.parent.mkdir(parents=True)  # Create the containing dir
+    ds_path.touch()  # Create the file
+
+  # Additional noisy files should be ignored
+  (repo_path / '__init__.py').touch()
+  (repo_path / 'empty_dir').mkdir()
+  return str(repo_path)
+
+
+def test_export_community_datasets(tmp_path):
+
+  # Create the community dataset repositories
+  tfg_path = _write_dataset_files(
+      tmp_path, namespace='tensorflow_graphics', datasets=['cifar']
+  )
+  nlp_path = _write_dataset_files(
+      tmp_path, namespace='nlp', datasets=['mnist', 'robotnet']
+  )
+
+  # Write a dummy `community-datasets.toml`
+  in_path = tmp_path / 'config.toml'
+  in_path.write_text(
+      textwrap.dedent(
+          f"""\
+          [Namespaces]
+          tensorflow_graphics = '{tfg_path}'
+          nlp = '{nlp_path}'
+          """
+      )
+  )
+
+  # Load the registered datasets and export the list.
+  # We patch `GithubPath` with `pathlib.Path`, as the two have the same API.
+  out_path = tmp_path / 'out.jsonl'
+  with mock.patch.object(github_api, 'GithubPath', pathlib.Path):
+    export_community_datasets.export_community_datasets(in_path, str(out_path))
+
+  # Ensure the datasets were correctly exported
+  expected_output = textwrap.dedent(
+      """\
+      {"name": "mnist", "namespace": "nlp", "source": {"scheme": "github://", "path": "${nlp_path}/mnist"}}
+      {"name": "robotnet", "namespace": "nlp", "source": {"scheme": "github://", "path": "${nlp_path}/robotnet"}}
+      {"name": "cifar", "namespace": "tensorflow_graphics", "source": {"scheme": "github://", "path": "${tfg_path}/cifar"}}
+      """
+  )
+  expected_output = string.Template(expected_output).substitute(
+      tfg_path=tfg_path,
+      nlp_path=nlp_path,
+  )
+  assert out_path.read_text() == expected_output
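As a usage note, the deployment entry point simply wires the two paths together. Running the exporter locally against a custom config would look roughly like this — the paths are hypothetical, and writing to the real GCS destination would require credentials:

```python
# Hypothetical local invocation of the exporter with custom in/out paths.
import pathlib

from tensorflow_datasets.scripts.deployment import export_community_datasets

config = pathlib.Path('/tmp/my-community-datasets.toml')  # [Namespaces] table
out = '/tmp/community-datasets-list.jsonl'
export_community_datasets.export_community_datasets(in_path=config, out_path=out)
```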