From 35bbe09bc390e0c84ea15a864f08b6873de9484e Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 17 Feb 2022 11:40:07 -0500 Subject: [PATCH 1/5] wrote quick overview page --- docs/source/conf.py | 2 + docs/source/index.rst | 14 +++--- docs/source/quick-overview.rst | 78 ++++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 5 deletions(-) create mode 100644 docs/source/quick-overview.rst diff --git a/docs/source/conf.py b/docs/source/conf.py index e89e2656..3171f142 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -45,6 +45,8 @@ "sphinx.ext.intersphinx", "sphinx.ext.extlinks", "sphinx.ext.napoleon", + "IPython.sphinxext.ipython_console_highlighting", + "IPython.sphinxext.ipython_directive", ] extlinks = { diff --git a/docs/source/index.rst b/docs/source/index.rst index fa160410..4ba28904 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,17 +1,21 @@ Datatree ======== -**Datatree is a WIP implementation of a tree-like hierarchical data structure for xarray.** +**Datatree is a prototype implementation of a tree-like hierarchical data structure for xarray.** .. toctree:: :maxdepth: 2 :caption: Documentation Contents - installation - tutorial - api - contributing + Installation + Quick Overview + Tutorial + API Reference + How do I ... + Contributing Guide + Development Roadmap + GitHub repository Feedback -------- diff --git a/docs/source/quick-overview.rst b/docs/source/quick-overview.rst new file mode 100644 index 00000000..8d7ea8f0 --- /dev/null +++ b/docs/source/quick-overview.rst @@ -0,0 +1,78 @@ +############## +Quick overview +############## + +DataTrees +--------- + +:py:class:`DataTree` is a tree-like container of ``DataArray`` objects, organised into multiple mutually alignable groups. +You can think of it like a (recursive) ``dict`` of ``Dataset`` objects. + +Let's first make some example xarray datasets (following on from xarray's +`quick overview `_ page): + +.. 
ipython:: python + + import numpy as np + import xarray as xr + + data = xr.DataArray(np.random.randn(2, 3), dims=("x", "y"), coords={"x": [10, 20]}) + ds = xr.Dataset(dict(foo=data, bar=("x", [1, 2]), baz=np.pi)) + ds + + ds2 = ds.interp(coords={'x': [10, 12, 14, 16, 18, 20]}) + ds2 + + ds3 = xr.Dataset(dict(people=["alice", "bob"], heights=("people", [1.57, 1.82])), coords={"species": "human"}) + ds3 + +Now we'll put this data into a multi-group tree: + +.. ipython:: python + + from datatree import DataTree + + dt = DataTree.from_dict({'root/simulation/coarse': ds, 'root/simulation/fine': ds2, 'root': ds3}) + print(dt) + +This creates a datatree with various groups. We have one root group (named ``root``), containing information about individual people. +The root group then has one subgroup ``simulation``, which contains no data itself but does contain another two subgroups, +named ``fine`` and ``coarse``. + +The (sub-)sub-groups ``fine`` and ``coarse`` contain two very similar datasets. +They both have an ``"x"`` dimension, but the dimension is of different lengths in each group, which makes the data in each group unalignable. +In ``root`` we placed some completely unrelated information, showing how we can use a tree to store heterogeneous data. + +The constraints on each group are therefore the same as the constraint on dataarrays within a single dataset. + +We created the sub-groups using a filesystem-like syntax, and accessing groups works the same way. +We can access individual dataarrays in a similar fashion + +.. ipython:: python + + dt['simulation/coarse/foo'] + +and we can also pull out the data in a particular group as a ``Dataset`` object using ``.ds``: + +.. ipython:: python + + dt['simulation/coarse'].ds + +Operations map over subtrees, so we can take a mean over the ``x`` dimension of both the ``fine`` and ``coarse`` groups just by + +.. 
ipython:: python + + avg = dt['simulation'].mean(dim="x") + print(avg) + +Here the ``"x"`` dimension used is always the one local to that sub-group. + +You can do almost everything you can do with ``Dataset`` objects with ``DataTree`` objects +(including indexing and arithmetic), as operations will be mapped over every sub-group in the tree. +This allows you to work with multiple groups of non-alignable variables at once. + +.. note:: + + If all of your variables are mutually alignable + (i.e. they live on the same grid, such that every common dimension name maps to the same length), + then you probably don't need :py:class:`DataTree`, and should consider just sticking with ``xarray.Dataset``. \ No newline at end of file From b76bf9fa50b1f97fd35487a36e40dab2d1aebed3 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 17 Feb 2022 11:56:23 -0500 Subject: [PATCH 2/5] extremely basic intallation instructions --- docs/source/installation.rst | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index c4e4c7fc..e16d2d57 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -2,4 +2,21 @@ Installation ============ -Coming soon! +Datatree is not yet available on PyPI or via conda, so for now you will have to install it from source. + +``git clone https://github.com/TomNicholas/datatree.git`` + +``pip install -e ./datatree/`` + +The main branch will be kept up-to-date, so if you clone main and run the test suite with ``pytest datatree`` and get no failures, +then you have the most up-to-date version. + +You will need xarray and `anytree <https://github.com/c0fec0de/anytree>`_ +as dependencies, with netcdf4, zarr, and h5netcdf as optional dependencies to allow file I/O. + +.. note:: + + Datatree is very much still in the early stages of development. There may be functions that are present but whose + internals are not yet implemented, or significant changes to the API in future. 
+ That said, if you try it out and find some behaviour that looks like a bug to you, please report it on the + `issue tracker `_! \ No newline at end of file From 7f7785c6ad2ab2645b10255dc20d098e8f02bd45 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 17 Feb 2022 11:56:48 -0500 Subject: [PATCH 3/5] version 0.0.1 --- docs/source/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 3171f142..5a9c0403 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -78,9 +78,9 @@ # built documents. # # The short X.Y version. -version = "0.0.0" # datatree.__version__ +version = "0.0.1" # datatree.__version__ # The full version, including alpha/beta/rc tags. -release = "0.0.0" # datatree.__version__ +release = "0.0.1" # datatree.__version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From d3a15f85afc4a6e600adfa9812bc64089e3ed5d0 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 17 Feb 2022 11:57:13 -0500 Subject: [PATCH 4/5] updated with .from_dict constructor change --- docs/source/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index f0a56cc0..5398aff8 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -11,7 +11,6 @@ DataTree :toctree: generated/ DataTree - DataNode Attributes ---------- @@ -51,6 +50,7 @@ Methods .. 
autosummary:: :toctree: generated/ + DataTree.from_dict DataTree.load DataTree.compute DataTree.persist From a741bbbf40f4529933af70191327109abf944547 Mon Sep 17 00:00:00 2001 From: Thomas Nicholas Date: Thu, 17 Feb 2022 12:00:27 -0500 Subject: [PATCH 5/5] linting --- docs/source/installation.rst | 2 +- docs/source/quick-overview.rst | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index e16d2d57..e2cfeae1 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -19,4 +19,4 @@ as dependencies, with netcdf4, zarr, and h5netcdf as optional dependencies to al Datatree is very much still in the early stages of development. There may be functions that are present but whose internals are not yet implemented, or significant changes to the API in future. That said, if you try it out and find some behaviour that looks like a bug to you, please report it on the - `issue tracker `_! \ No newline at end of file + `issue tracker `_! 
diff --git a/docs/source/quick-overview.rst b/docs/source/quick-overview.rst index 8d7ea8f0..b5ea1d1f 100644 --- a/docs/source/quick-overview.rst +++ b/docs/source/quick-overview.rst @@ -20,10 +20,13 @@ Let's first make some example xarray datasets (following on from xarray's ds = xr.Dataset(dict(foo=data, bar=("x", [1, 2]), baz=np.pi)) ds - ds2 = ds.interp(coords={'x': [10, 12, 14, 16, 18, 20]}) + ds2 = ds.interp(coords={"x": [10, 12, 14, 16, 18, 20]}) ds2 - ds3 = xr.Dataset(dict(people=["alice", "bob"], heights=("people", [1.57, 1.82])), coords={"species": "human"}) + ds3 = xr.Dataset( + dict(people=["alice", "bob"], heights=("people", [1.57, 1.82])), + coords={"species": "human"}, + ) ds3 Now we'll put this data into a multi-group tree: @@ -32,7 +35,9 @@ Now we'll put this data into a multi-group tree: from datatree import DataTree - dt = DataTree.from_dict({'root/simulation/coarse': ds, 'root/simulation/fine': ds2, 'root': ds3}) + dt = DataTree.from_dict( + {"root/simulation/coarse": ds, "root/simulation/fine": ds2, "root": ds3} + ) print(dt) This creates a datatree with various groups. We have one root group (named ``root``), containing information about individual people. @@ -50,19 +55,19 @@ We can access individual dataarrays in a similar fashion .. ipython:: python - dt['simulation/coarse/foo'] + dt["simulation/coarse/foo"] and we can also pull out the data in a particular group as a ``Dataset`` object using ``.ds``: .. ipython:: python - dt['simulation/coarse'].ds + dt["simulation/coarse"].ds Operations map over subtrees, so we can take a mean over the ``x`` dimension of both the ``fine`` and ``coarse`` groups just by .. ipython:: python - avg = dt['simulation'].mean(dim="x") + avg = dt["simulation"].mean(dim="x") print(avg) Here the ``"x"`` dimension used is always the one local to that sub-group. @@ -75,4 +80,4 @@ This allows you to work with multiple groups of non-alignable variables at once. If all of your variables are mutually alignable (i.e. 
they live on the same grid, such that every common dimension name maps to the same length), - then you probably don't need :py:class:`DataTree`, and should consider just sticking with ``xarray.Dataset``. \ No newline at end of file + then you probably don't need :py:class:`DataTree`, and should consider just sticking with ``xarray.Dataset``.