STAREPandas Multiprocessing#

[1]:
import dask
import numpy
import starepandas
import geopandas
import pystare
import pandas
import matplotlib.pyplot as plt
import datetime
import time
[2]:
us = geopandas.read_file('../tests/data/tl_2017_us_state.gpkg')
us = us.to_crs('EPSG:4326')
len(us)
[2]:
56
[3]:
us.head()
us = us[0:6]

Parallel SID lookup#

[4]:
def sids_from_geoseries(series, level, convex=False, force_ccw=True, n_workers=1):
    sids = []
    for geom in series:
        sids_row = starepandas.sids_from_shapely(geom=geom, level=level, convex=convex, force_ccw=force_ccw)
        sids.append(sids_row)
    sids = numpy.array(sids, dtype='object')  # Has to be object to suppress VisibleDeprecationWarning
    return sids
[5]:
n_workers = 2

ddf = dask.dataframe.from_pandas(us.geometry, npartitions=n_workers)
meta = {'sids': 'uint64'}

res = ddf.map_partitions(lambda df: numpy.array(sids_from_geoseries(df, level=7, convex=False, force_ccw=False, n_workers=1), dtype='object'),
                         meta=meta)

sids = res.compute(scheduler='processes')

High level#

[6]:
stare = starepandas.sids_from_geoseries(us.geometry,
                                        level=7,
                                        convex=False,
                                        force_ccw=True,
                                        n_partitions=4)

Parallel Trixel lookup#

Manual / low level#

[7]:
n_cores = 4

us['stare'] = stare
ddf = dask.dataframe.from_pandas(us['stare'], npartitions=n_cores)
meta = {'trixels': 'object'}

res = ddf.map_partitions(lambda df: numpy.array(starepandas.trixels_from_stareseries(df)), meta=meta)
trixels = res.compute(scheduler='processes')

High level#

[8]:
stare = pandas.Series(stare)
trixels = starepandas.trixels_from_stareseries(stare, n_partitions=4)

Parallel instantiation#

[9]:
us = starepandas.STAREDataFrame(us,
                                add_sids=True,
                                level=7,
                                add_trixels=True,
                                n_partitions=4)
[10]:
fig, ax = plt.subplots(figsize=(7,7), dpi=100)
ax.grid(True)

us.plot(ax=ax, trixels=True, column='NAME')
[10]:
<AxesSubplot:>
../../_images/examples_notebooks_parallel_16_1.png

Visualization#

[11]:
us
[11]:
REGION DIVISION STATEFP STATENS GEOID STUSPS NAME LSAD MTFCC FUNCSTAT ALAND AWATER INTPTLAT INTPTLON geometry stare sids trixels
0 3 5 54 01779805 54 WV West Virginia 00 G4000 A 62265662566 489840834 +38.6472854 -080.6183274 POLYGON ((-81.74725 39.09538, -81.74635 39.096... [3137320090416971783, 3137355274789060615, 313... [3137320090416971783, 3137355274789060615, 313... MULTIPOLYGON (((-79.56820 39.46980, -79.99541 ...
1 3 5 12 00294478 12 FL Florida 00 G4000 A 138911437206 31398800291 +28.4574302 -082.4091478 MULTIPOLYGON (((-82.98748 24.62538, -82.98748 ... [3119129770047045639, 3119446429395845126, 312... [3119129770047045639, 3119446429395845126, 312... MULTIPOLYGON (((-82.52250 24.32624, -82.98181 ...
2 2 3 17 01779784 17 IL Illinois 00 G4000 A 143784114293 6211277447 +40.1028754 -089.1526108 POLYGON ((-91.18529 40.63780, -91.17510 40.643... [3072299370796810246, 3072440108285165574, 307... [3072299370796810246, 3072440108285165574, 307... MULTIPOLYGON (((-88.73033 38.14162, -88.05766 ...
3 2 4 27 00662849 27 MN Minnesota 00 G4000 A 206229176104 18944967530 +46.3158148 -094.1996628 POLYGON ((-96.78438 46.63050, -96.78434 46.630... [3077084445400891397, 3077647395354312710, 307... [3077084445400891397, 3077647395354312710, 307... MULTIPOLYGON (((-92.56837 47.54242, -96.82683 ...
4 3 5 24 01714934 24 MD Maryland 00 G4000 A 25150696145 6980371026 +38.9466584 -076.6744939 POLYGON ((-77.45881 39.22027, -77.45866 39.220... [3137320090416971783, 3164341688181194759, 316... [3137320090416971783, 3164341688181194759, 316... MULTIPOLYGON (((-79.56820 39.46980, -79.99541 ...
5 1 1 44 01219835 44 RI Rhode Island 00 G4000 A 2677997539 1323452846 +41.5974187 -071.5272723 MULTIPOLYGON (((-71.67881 41.15891, -71.67626 ... [3163708369483595783, 3163391710134796295, 316... [3163708369483595783, 3163391710134796295, 316... MULTIPOLYGON (((-71.71678 40.71857, -71.16875 ...
[12]:
fig, ax = plt.subplots(figsize=(7,7), dpi=100)
ax.grid(True)

hawaii = us[us['NAME']=='Maryland']
hawaii.plot(ax=ax, trixels=True)
hawaii
[12]:
REGION DIVISION STATEFP STATENS GEOID STUSPS NAME LSAD MTFCC FUNCSTAT ALAND AWATER INTPTLAT INTPTLON geometry stare sids trixels
4 3 5 24 01714934 24 MD Maryland 00 G4000 A 25150696145 6980371026 +38.9466584 -076.6744939 POLYGON ((-77.45881 39.22027, -77.45866 39.220... [3137320090416971783, 3164341688181194759, 316... [3137320090416971783, 3164341688181194759, 316... MULTIPOLYGON (((-79.56820 39.46980, -79.99541 ...
../../_images/examples_notebooks_parallel_19_1.png

Parallel dissolve#

[13]:
us_regions = us.stare_dissolve(by='REGION', aggfunc='sum', num_workers=4, geom=False)
[14]:
trixels = us_regions.make_trixels(n_partitions=4)
us_regions.set_trixels(trixels, inplace=True)
[15]:
fig, ax = plt.subplots(figsize=(7,7), dpi=100)
ax.grid(True)

ax.set_xlim(-170, -50)
ax.set_ylim(0, 80)

us_regions.reset_index().plot(ax=ax, trixels=True, column='REGION')
[15]:
<AxesSubplot:>
../../_images/examples_notebooks_parallel_23_1.png

Parallel Intersects#

Cover#

[16]:
us = starepandas.STAREDataFrame(us, add_sids=True,
                                level=6, add_trixels=True, n_partitions=4)
[17]:
filepath = '../tests/data/granules/VNP03DNB.A2020219.0742.001.2020219124651.nc'
vnp03 = starepandas.io.granules.VNP03DNB(filepath)
vnp03.read_sidecar_cover()
vnp03_cover = vnp03.stare_cover
vnp03_cover_df = starepandas.STAREDataFrame({'stare': vnp03.stare_cover}, sids='stare', add_trixels=True)
[18]:
fig, ax = plt.subplots(figsize=(7,7), dpi=100)
ax.grid(True)

ax.set_xlim(-130, -60)
ax.set_ylim(15, 50)

us.plot(ax=ax, column='NAME', trixels=True, linewidth=0.5)
vnp03_cover_df.plot(ax=ax, trixels=True, color='k', linewidth=0.5)
[18]:
<AxesSubplot:>
../../_images/examples_notebooks_parallel_28_1.png
[19]:
def series_intersects(other, series, method=1, n_workers=1):
    """
    Returns a bool series of length len(series).
    True for every row in which row intersects other.
    """
    other = numpy.array([other]).flatten()

    if n_workers > len(series):
        # Cannot have more partitions than rows
        n_workers = len(series)

    if n_workers == 1:
        if series.dtype in [numpy.dtype('uint64'), numpy.dtype('int64'), pandas.UInt64Dtype(), pandas.Int64Dtype()]:
            # We have a series of sids; don't need to iterate. Can send the whole array to pystare/
            intersects = pystare.intersects(other, series, method)
        else:
            intersects = []
            for sids in series:
                if len(list(sids)) < len(other):
                    # If we do method 1, larger item first is faster
                    intersects.append(pystare.intersects(other, sids, method).any())
                else:
                    intersects.append(pystare.intersects(sids, other, method).any())
            intersects = numpy.array(intersects)
    else:
        ddf = dask.dataframe.from_pandas(series, npartitions=n_workers)
        meta = {'intersects': 'bool'}
        res = ddf.map_partitions(lambda df: series_intersects(other, df, method, 1), meta=meta)
        intersects = res.compute(scheduler='processes')
    return intersects
[20]:
a = numpy.array(vnp03_cover)
[21]:
start = datetime.datetime.now()
intersects = series_intersects(a, us['stare'] , 1, 1)
print(datetime.datetime.now() - start)
0:00:00.001699
[22]:
start = datetime.datetime.now()
intersects = series_intersects(a, us['stare'], 1, 5)
print(datetime.datetime.now() - start)
0:00:00.699152
[23]:
intersects
[23]:
array([ True,  True,  True, False,  True, False])
[24]:
print(list(us[intersects]['NAME']))
['West Virginia', 'Florida', 'Illinois', 'Maryland']

iFOVs#

[25]:
import copy
filepath = '../tests/data/granules/VNP03DNB.A2020219.0742.001.2020219124651.nc'
vnp03 = starepandas.read_granule(filepath, sidecar=True, xy=True)
vnp03 = copy.copy(vnp03[vnp03.y>1500])
[26]:
state_sids = us[us['NAME']=='Florida']['stare'].iloc[0]
[27]:
start = datetime.datetime.now()
intersects = series_intersects(state_sids, vnp03['sids'])
print(datetime.datetime.now() - start)
0:00:09.251401
[28]:
start = datetime.datetime.now()
intersects = series_intersects(state_sids, vnp03['sids'], 1, 60)
print(datetime.datetime.now() - start)
0:00:06.608146
[29]:
intersects.sum()/vnp03.index.size
[29]:
0.04182729704280899
[30]:
vnp03[intersects]
[30]:
sids x y moon_illumination_fraction land_water_mask quality_flag
6102337 3397086378837675182 2273 1501 92.849998 0 0
6102338 3397086381026421070 2274 1501 92.849998 0 0
6102339 3397086404710208846 2275 1501 92.849998 0 0
6102340 3397086415242594286 2276 1501 92.849998 0 0
6102341 3397086441098915054 2277 1501 92.849998 0 0
... ... ... ... ... ... ...
9058015 3118997947546240654 3423 2228 92.879997 7 0
9058016 3118997851493397806 3424 2228 92.879997 7 0
9058017 3118997861386527694 3425 2228 92.879997 7 0
9058018 3118997831926651342 3426 2228 92.879997 7 0
9058019 3118901074887783182 3427 2228 92.879997 7 0

294246 rows × 6 columns

High level#

[31]:
intersects = vnp03.stare_intersects(other=state_sids, method=1, n_partitions=4)
[32]:
vnp03.info()
<class 'starepandas.staredataframe.STAREDataFrame'>
Int64Index: 7034784 entries, 6100064 to 13134847
Data columns (total 6 columns):
 #   Column                      Dtype
---  ------                      -----
 0   sids                        Int64
 1   x                           uint16
 2   y                           uint16
 3   moon_illumination_fraction  float32
 4   land_water_mask             UInt8
 5   quality_flag                UInt8
dtypes: Int64(1), UInt8(2), float32(1), uint16(2)
memory usage: 194.6 MB
[ ]: