{ "cells": [ { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "# Working with dask and xarray" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "from pyhdf.SD import SD\n", "import numpy\n", "import pystare\n", "import xarray\n", "import dask.distributed\n", "import dask.dataframe\n", "import datetime" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2022-09-15T05:04:05.790079Z", "iopub.status.busy": "2022-09-15T05:04:05.789842Z", "iopub.status.idle": "2022-09-15T05:04:07.005682Z", "shell.execute_reply": "2022-09-15T05:04:07.004909Z" }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "client = dask.distributed.Client(n_workers=4)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2022-09-15T05:04:07.008305Z", "iopub.status.busy": "2022-09-15T05:04:07.008034Z", "iopub.status.idle": "2022-09-15T05:04:07.019864Z", "shell.execute_reply": "2022-09-15T05:04:07.019368Z" }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "file_path = '../tests/data/granules/MOD05_L2.A2019336.0000.061.2019336211522.hdf'\n", "hdf = SD(file_path)\n", "lon = hdf.select('Longitude').get().astype(numpy.double)\n", "lat = hdf.select('Latitude').get().astype(numpy.double)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2022-09-15T05:04:07.022532Z", "iopub.status.busy": "2022-09-15T05:04:07.022195Z", "iopub.status.idle": "2022-09-15T05:04:08.014397Z", "shell.execute_reply": "2022-09-15T05:04:08.013807Z" }, "pycharm": { "name": "#%%\n" }, "is_executing": true }, "outputs": [], "source": [ "start = datetime.datetime.now()\n", "stare = pystare.from_latlon_2d(lat=lat, \n", " lon=lon, \n", " adapt_level=True)\n", "print(datetime.datetime.now()-start)" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Dask" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2022-09-15T05:04:08.017246Z", "iopub.status.busy": "2022-09-15T05:04:08.017013Z", "iopub.status.idle": "2022-09-15T05:04:08.034405Z", "shell.execute_reply": "2022-09-15T05:04:08.033890Z" }, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Array Chunk
Bytes 1.67 MiB 1.67 MiB
Shape (2, 406, 270) (2, 406, 270)
Count 1 Graph Layer 1 Chunks
Type float64 numpy.ndarray
\n", "
\n", " \n", "\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", "\n", " \n", " \n", "\n", " \n", " 270\n", " 406\n", " 2\n", "\n", "
" ], "text/plain": [ "dask.array" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coords = numpy.array([lat, lon])\n", "coords_d = dask.array.from_array(coords, chunks=(2,500,1354))\n", "coords_d" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2022-09-15T05:04:08.036832Z", "iopub.status.busy": "2022-09-15T05:04:08.036531Z", "iopub.status.idle": "2022-09-15T05:04:08.042313Z", "shell.execute_reply": "2022-09-15T05:04:08.041005Z" }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "def stare(coords):\n", " return pystare.from_latlon_2d(coords[0], \n", " coords[1], adapt_level=True)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2022-09-15T05:04:08.045583Z", "iopub.status.busy": "2022-09-15T05:04:08.045029Z", "iopub.status.idle": "2022-09-15T05:04:09.280746Z", "shell.execute_reply": "2022-09-15T05:04:09.279852Z" }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "s_d = coords_d.map_blocks(stare, drop_axis=[0], chunks=(100, 1354), dtype='int64')\n", "s_d = s_d.compute()" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Xarray Ufunc" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2022-09-15T05:04:09.283424Z", "iopub.status.busy": "2022-09-15T05:04:09.283081Z", "iopub.status.idle": "2022-09-15T05:04:09.289045Z", "shell.execute_reply": "2022-09-15T05:04:09.288459Z" }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "lat_x = xarray.DataArray(lat, dims=['x', 'y']).chunk({'x': 500})\n", "lon_x = xarray.DataArray(lon, dims=['x', 'y']).chunk({'x': 500})" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2022-09-15T05:04:09.291343Z", "iopub.status.busy": "2022-09-15T05:04:09.291104Z", "iopub.status.idle": "2022-09-15T05:04:10.334858Z", "shell.execute_reply": "2022-09-15T05:04:10.334369Z" }, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0:00:01.039442\n" ] } ], "source": [ "start = datetime.datetime.now()\n", "s_d = xarray.apply_ufunc(pystare.from_latlon_2d, \n", " lat_x,\n", " lon_x,\n", " dask='parallelized',\n", " output_dtypes=[numpy.int64])\n", "\n", "sids = numpy.array(s_d)\n", "print(datetime.datetime.now()-start)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2022-09-15T05:04:10.337169Z", "iopub.status.busy": "2022-09-15T05:04:10.336915Z", "iopub.status.idle": "2022-09-15T05:04:10.342896Z", "shell.execute_reply": "2022-09-15T05:04:10.341916Z" }, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/plain": [ "array([[4298473764500464809, 4298458168380511209, 4297394569014717897,\n", " ..., 3604325910693007273, 3604468594879342953,\n", " 3604495833162833193],\n", " [4298462872969244297, 4298459225563237225, 4297297422977447753,\n", " ..., 3604330264741384009, 3604471380516185641,\n", " 3604465738696115433],\n", " [4298462873435275369, 4298459227962358473, 4297297429637206121,\n", " ..., 3604322952727773225, 3604471381825883401,\n", " 3604465733841987657],\n", " ...,\n", " [3652144132972193481, 3650323462937407913, 3650325177740030185,\n", " ..., 3727730728598789545, 3727841631302055049,\n", " 3727831398613792009],\n", " [3652144129926505097, 3650323400334252041, 3650325178786309321,\n", " ..., 3727730732960989609, 3727841627078009577,\n", " 3727831398032615625],\n", " [3652167198498770729, 3652159322973158121, 3650318911383240361,\n", " ..., 3727838256925064969, 3727843063731949801,\n", " 3727853163225616425]])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sids" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Write Sidecar" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2022-09-15T05:04:10.345386Z", "iopub.status.busy": "2022-09-15T05:04:10.344987Z", "iopub.status.idle": "2022-09-15T05:04:10.351917Z", "shell.execute_reply": "2022-09-15T05:04:10.351389Z" }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "import netCDF4\n", "rootgrp = netCDF4.Dataset('test.nc', \"w\", format=\"NETCDF4\")\n", "\n", "rootgrp.close()" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Dask DataFrame" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2022-09-15T05:04:10.354289Z", "iopub.status.busy": "2022-09-15T05:04:10.353915Z", "iopub.status.idle": "2022-09-15T05:04:10.362364Z", "shell.execute_reply": "2022-09-15T05:04:10.361797Z" }, "pycharm": { "name": "#%%\n" }, "scrolled": true }, "outputs": [], "source": [ "band1 = hdf.select('Water_Vapor_Infrared').get().astype(numpy.double)\n", "lat = hdf.select('Latitude').get().astype(numpy.double)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2022-09-15T05:04:10.364834Z", "iopub.status.busy": "2022-09-15T05:04:10.364561Z", "iopub.status.idle": "2022-09-15T05:04:10.760465Z", "shell.execute_reply": "2022-09-15T05:04:10.759846Z" }, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "
Dask DataFrame Structure:
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
band1
npartitions=4
3604081108103418377float64
3618019075911588075...
3661533202196794217...
3736859491218877322...
4298544093115426153...
\n", "
\n", "
Dask Name: sort_index, 7 graph layers
" ], "text/plain": [ "Dask DataFrame Structure:\n", " band1\n", "npartitions=4 \n", "3604081108103418377 float64\n", "3618019075911588075 ...\n", "3661533202196794217 ...\n", "3736859491218877322 ...\n", "4298544093115426153 ...\n", "Dask Name: sort_index, 7 graph layers" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas\n", "df = pandas.DataFrame({'stare': sids.flatten(), 'band1': band1.flatten()})\n", "ddf = dask.dataframe.from_pandas(df, npartitions=4)\n", "ddf.set_index('stare')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" } }, "nbformat": 4, "nbformat_minor": 4 }