Note
This page was generated from examples/notebooks/parallel.ipynb.
STAREPandas Multiprocessing#
[1]:
import dask
import numpy
import starepandas
import geopandas
import pystare
import pandas
import matplotlib.pyplot as plt
import datetime
[2]:
# Read the US state boundaries and reproject them to EPSG:4326 in one step.
us_all = geopandas.read_file('../tests/data/tl_2017_us_state.gpkg').to_crs('EPSG:4326')
[3]:
# Work on the first 25 rows only. Take an explicit copy so that columns added
# later (e.g. ``us['sids'] = ...``) are written to an independent frame rather
# than to a slice of ``us_all``, which triggers pandas' SettingWithCopyWarning.
us = us_all[0:25].copy()
us
[3]:
| | REGION | DIVISION | STATEFP | STATENS | GEOID | STUSPS | NAME | LSAD | MTFCC | FUNCSTAT | ALAND | AWATER | INTPTLAT | INTPTLON | geometry |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 5 | 54 | 01779805 | 54 | WV | West Virginia | 00 | G4000 | A | 62265662566 | 489840834 | +38.6472854 | -080.6183274 | POLYGON ((-81.74725 39.09538, -81.74635 39.096... |
| 1 | 3 | 5 | 12 | 00294478 | 12 | FL | Florida | 00 | G4000 | A | 138911437206 | 31398800291 | +28.4574302 | -082.4091478 | MULTIPOLYGON (((-82.98748 24.62538, -82.98748 ... |
| 2 | 2 | 3 | 17 | 01779784 | 17 | IL | Illinois | 00 | G4000 | A | 143784114293 | 6211277447 | +40.1028754 | -089.1526108 | POLYGON ((-91.18529 40.63780, -91.17510 40.643... |
| 3 | 2 | 4 | 27 | 00662849 | 27 | MN | Minnesota | 00 | G4000 | A | 206229176104 | 18944967530 | +46.3158148 | -094.1996628 | POLYGON ((-96.78438 46.63050, -96.78434 46.630... |
| 4 | 3 | 5 | 24 | 01714934 | 24 | MD | Maryland | 00 | G4000 | A | 25150696145 | 6980371026 | +38.9466584 | -076.6744939 | POLYGON ((-77.45881 39.22027, -77.45866 39.220... |
| 5 | 1 | 1 | 44 | 01219835 | 44 | RI | Rhode Island | 00 | G4000 | A | 2677997539 | 1323452846 | +41.5974187 | -071.5272723 | MULTIPOLYGON (((-71.67881 41.15891, -71.67626 ... |
| 6 | 4 | 8 | 16 | 01779783 | 16 | ID | Idaho | 00 | G4000 | A | 214048160737 | 2393355752 | +44.3484222 | -114.5588538 | POLYGON ((-116.89971 44.84061, -116.89967 44.8... |
| 7 | 1 | 1 | 33 | 01779794 | 33 | NH | New Hampshire | 00 | G4000 | A | 23187445452 | 1028643155 | +43.6726907 | -071.5843145 | POLYGON ((-72.32990 43.60021, -72.32984 43.600... |
| 8 | 3 | 5 | 37 | 01027616 | 37 | NC | North Carolina | 00 | G4000 | A | 125919712692 | 13470113896 | +35.5397100 | -079.1308636 | POLYGON ((-82.41674 36.07283, -82.41660 36.073... |
| 9 | 1 | 1 | 50 | 01779802 | 50 | VT | Vermont | 00 | G4000 | A | 23873457570 | 1031134839 | +44.0604795 | -072.6733274 | POLYGON ((-73.31328 44.26413, -73.31274 44.265... |
| 10 | 1 | 1 | 09 | 01779780 | 09 | CT | Connecticut | 00 | G4000 | A | 12542619303 | 1815495323 | +41.5798637 | -072.7466572 | POLYGON ((-73.51808 41.66672, -73.51807 41.666... |
| 11 | 3 | 5 | 10 | 01779781 | 10 | DE | Delaware | 00 | G4000 | A | 5047241079 | 1398670234 | +38.9986239 | -075.4416920 | POLYGON ((-75.76007 39.29682, -75.76010 39.297... |
| 12 | 4 | 8 | 35 | 00897535 | 35 | NM | New Mexico | 00 | G4000 | A | 314191415563 | 733669653 | +34.4346843 | -106.1316181 | POLYGON ((-106.00632 36.99527, -106.00531 36.9... |
| 13 | 4 | 9 | 06 | 01779778 | 06 | CA | California | 00 | G4000 | A | 403483182192 | 20484637928 | +37.1551773 | -119.5434183 | MULTIPOLYGON (((-119.63607 33.28071, -119.6347... |
| 14 | 1 | 2 | 34 | 01779795 | 34 | NJ | New Jersey | 00 | G4000 | A | 19049723313 | 3542963551 | +40.1072744 | -074.6652012 | POLYGON ((-75.18960 40.59178, -75.18977 40.592... |
| 15 | 2 | 3 | 55 | 01779806 | 55 | WI | Wisconsin | 00 | G4000 | A | 140275464079 | 29359527964 | +44.6309071 | -089.7093916 | POLYGON ((-92.88707 45.64415, -92.88671 45.644... |
| 16 | 4 | 9 | 41 | 01155107 | 41 | OR | Oregon | 00 | G4000 | A | 248604328809 | 6195045325 | +43.9717125 | -120.6229578 | POLYGON ((-124.06545 45.78305, -124.06206 45.7... |
| 17 | 2 | 4 | 31 | 01779792 | 31 | NE | Nebraska | 00 | G4000 | A | 198957965731 | 1370523694 | +41.5433014 | -099.8118616 | POLYGON ((-104.05264 42.00172, -104.05263 42.0... |
| 18 | 1 | 2 | 42 | 01779798 | 42 | PA | Pennsylvania | 00 | G4000 | A | 115881477379 | 3397554419 | +40.9024957 | -077.8334514 | POLYGON ((-80.51935 41.84956, -80.51938 41.850... |
| 19 | 4 | 9 | 53 | 01779804 | 53 | WA | Washington | 00 | G4000 | A | 172111800165 | 12560067439 | +47.4073238 | -120.5757999 | POLYGON ((-123.24792 48.28456, -123.24779 48.2... |
| 20 | 3 | 7 | 22 | 01629543 | 22 | LA | Louisiana | 00 | G4000 | A | 111904803121 | 23746413153 | +30.8634368 | -091.7987173 | POLYGON ((-92.06910 33.00816, -92.06904 33.008... |
| 21 | 3 | 5 | 13 | 01705317 | 13 | GA | Georgia | 00 | G4000 | A | 149177524294 | 4733385577 | +32.6295789 | -083.4235109 | POLYGON ((-85.38658 33.90172, -85.38659 33.901... |
| 22 | 3 | 6 | 01 | 01779775 | 01 | AL | Alabama | 00 | G4000 | A | 131174431216 | 4592944701 | +32.7396323 | -086.8434593 | POLYGON ((-88.13999 34.58170, -88.13997 34.581... |
| 23 | 4 | 8 | 49 | 01455989 | 49 | UT | Utah | 00 | G4000 | A | 212884846341 | 7000199770 | +39.3349925 | -111.6563326 | POLYGON ((-114.04703 39.90610, -114.04702 39.9... |
| 24 | 2 | 3 | 39 | 01085497 | 39 | OH | Ohio | 00 | G4000 | A | 105833282399 | 10264451012 | +40.4149297 | -082.7119975 | POLYGON ((-84.80325 40.98939, -84.80324 40.991... |
Parallel SID lookup#
[4]:
def sids_from_geoseries(series, level, convex=False, force_ccw=True, n_workers=1):
    """Look up the STARE index values (SIDs) of every geometry in ``series``.

    This is the serial worker; the notebook maps it over dask partitions to
    parallelize the lookup.

    Parameters
    ----------
    series : iterable of geometries
        The geometries to convert, e.g. a GeoSeries or one partition of it.
    level : int
        Forwarded to ``starepandas.sids_from_shapely``.
    convex : bool, default False
        Forwarded to ``starepandas.sids_from_shapely``.
    force_ccw : bool, default True
        Forwarded to ``starepandas.sids_from_shapely``.
    n_workers : int, default 1
        Accepted for signature compatibility only; this function always runs
        serially and does not read it.

    Returns
    -------
    numpy.ndarray
        1-D object array with exactly one entry per geometry, each entry being
        the value returned by ``starepandas.sids_from_shapely`` for that row.
    """
    rows = [
        starepandas.sids_from_shapely(geom=geom, level=level, convex=convex, force_ccw=force_ccw)
        for geom in series
    ]
    # Fill a pre-allocated object array element by element.
    # numpy.array(rows, dtype='object') would silently produce a 2-D array
    # whenever all rows happen to have the same length, breaking the
    # "one entry per geometry" contract.
    sids = numpy.empty(len(rows), dtype='object')
    for i, row in enumerate(rows):
        sids[i] = row
    return sids
[5]:
# Split the geometry column into n_workers dask partitions and run the serial
# sids_from_geoseries() helper on each partition, using the 'processes' scheduler.
n_workers = 2
ddf = dask.dataframe.from_pandas(us.geometry, npartitions=n_workers)
# NOTE(review): meta declares 'uint64' while each partition returns an
# object-dtype array -- confirm which dtype dask should be told about.
meta = {'sids': 'uint64'}
res = ddf.map_partitions(lambda df: numpy.array(sids_from_geoseries(df, level=7, convex=False, force_ccw=True, n_workers=1), dtype='object'),
meta=meta)
sids = res.compute(scheduler='processes')
High level#
[6]:
# Same lookup through the library's own entry point, split over 4 partitions.
stare = starepandas.sids_from_geoseries(
    us.geometry, level=7, convex=False, force_ccw=True, n_partitions=4
)
Parallel Trixel lookup#
Manual / low level#
[7]:
# Attach the SIDs to the frame, then convert each partition of the 'sids'
# column to trixels with starepandas.trixels_from_stareseries().
n_cores = 4
us['sids'] = stare
ddf = dask.dataframe.from_pandas(us['sids'], npartitions=n_cores)
meta = {'trixels': 'object'}
res = ddf.map_partitions(lambda df: numpy.array(starepandas.trixels_from_stareseries(df)), meta=meta)
trixels = res.compute(scheduler='processes')
/home/griessbaum/.virtualenvs/starepandas/lib/python3.10/site-packages/geopandas/geodataframe.py:1443: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
super().__setitem__(key, value)
High level#
[8]:
# Wrap the SIDs in a pandas Series and let starepandas partition the trixel lookup itself.
stare = pandas.Series(stare)
trixels = starepandas.trixels_from_stareseries(stare, n_partitions=4)
Parallel instantiation#
[9]:
# Build the STAREDataFrame in one call: SIDs (level 7) and trixels are both
# requested from the constructor, with n_partitions=4.
us = starepandas.STAREDataFrame(
    us, add_sids=True, level=7, add_trixels=True, n_partitions=4
)
[10]:
# Plot the trixels of all selected states, colored by state name.
fig, ax = plt.subplots(figsize=(7,7), dpi=100)
ax.grid(True)
us.plot(ax=ax, trixels=True, column='NAME')
[10]:
<Axes: >
Visualization#
[11]:
# Plot the trixels of a single state and display its row.
fig, ax = plt.subplots(figsize=(7,7), dpi=100)
ax.grid(True)
# The variable is named after the state that is actually selected (Maryland).
maryland = us[us['NAME']=='Maryland']
maryland.plot(ax=ax, trixels=True)
maryland
[11]:
| | REGION | DIVISION | STATEFP | STATENS | GEOID | STUSPS | NAME | LSAD | MTFCC | FUNCSTAT | ALAND | AWATER | INTPTLAT | INTPTLON | geometry | sids | trixels |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 3 | 5 | 24 | 01714934 | 24 | MD | Maryland | 00 | G4000 | A | 25150696145 | 6980371026 | +38.9466584 | -076.6744939 | POLYGON ((-77.45881 39.22027, -77.45866 39.220... | [3137320090416971783, 3164341688181194759, 316... | MULTIPOLYGON (((-79.56820 39.46980, -79.99541 ... |
Parallel dissolve#
[12]:
# Dissolve the states by their 'REGION' column with 4 workers.
# NOTE(review): geom=False presumably skips dissolving the geometry column -- confirm against stare_dissolve().
us_regions = us.stare_dissolve(by='REGION', aggfunc='sum', num_workers=4, geom=False)
[13]:
# Build the trixels for the dissolved regions (4 partitions) and store them on the frame in place.
trixels = us_regions.make_trixels(n_partitions=4)
us_regions.set_trixels(trixels, inplace=True)
[14]:
# Plot the dissolved regions' trixels, colored by region, with fixed axis limits.
fig, ax = plt.subplots(figsize=(7,7), dpi=100)
ax.grid(True)
ax.set_xlim(-170, -50)
ax.set_ylim(0, 80)
# reset_index() turns the 'REGION' index back into a column so it can be used for coloring.
us_regions.reset_index().plot(ax=ax, trixels=True, column='REGION')
[14]:
<Axes: >
Parallel Intersects#
Cover#
[15]:
# Rebuild the STAREDataFrame with level=6 (the earlier cell used level=7) for the intersection tests.
us = starepandas.STAREDataFrame(us, add_sids=True,
level=6, add_trixels=True, n_partitions=4)
[16]:
# Open the VNP03DNB granule, read the cover from its sidecar, and wrap the
# cover SIDs in a STAREDataFrame (with trixels) so they can be plotted.
filepath = '../tests/data/granules/viirs/VNP03DNB.A2022308.1930.002.2022309041547.nc'
vnp03 = starepandas.io.granules.VNP03DNB(filepath)
vnp03.read_sidecar_cover()
vnp03_cover = vnp03.stare_cover
vnp03_cover_df = starepandas.STAREDataFrame(
    {'sids': vnp03_cover}, sids='sids', add_trixels=True
)
[17]:
# Overlay the granule cover (black) on the state trixels (colored by name).
fig, ax = plt.subplots(figsize=(7,7), dpi=100)
ax.grid(True)
ax.set_xlim(-130, -60)
ax.set_ylim(15, 50)
us.plot(ax=ax, column='NAME', trixels=True, linewidth=0.5)
vnp03_cover_df.plot(ax=ax, trixels=True, color='k', linewidth=0.5)
[17]:
<Axes: >
[18]:
def series_intersects(other, series, method=1, n_workers=1):
    """Test every row of ``series`` for intersection with ``other``.

    Parameters
    ----------
    other : int or array-like of int
        SID or collection of SIDs to test against; flattened to a 1-D array.
    series : pandas.Series
        Either a series of single SIDs (integer dtype) or a series whose rows
        are collections of SIDs (object dtype).
    method : int, default 1
        Forwarded unchanged to ``pystare.intersects``.
    n_workers : int, default 1
        Number of dask partitions/processes. Clamped to the range
        ``[1, len(series)]``; with 1 the lookup runs serially in-process.

    Returns
    -------
    array-like of bool
        One entry per row of ``series``; True where the row intersects
        ``other``. An empty ``series`` yields an empty bool array.
    """
    other = numpy.array([other]).flatten()

    # Cannot have more partitions than rows, and never fewer than one.
    # Without the lower bound an empty series would request zero partitions.
    n_workers = max(1, min(n_workers, len(series)))

    if n_workers > 1:
        # ``import dask`` alone does not guarantee the dataframe submodule is loaded.
        import dask.dataframe

        ddf = dask.dataframe.from_pandas(series, npartitions=n_workers)
        meta = {'intersects': 'bool'}
        # Each partition is handled by the serial path (n_workers=1).
        res = ddf.map_partitions(lambda df: series_intersects(other, df, method, 1), meta=meta)
        return res.compute(scheduler='processes')

    sid_dtypes = (numpy.dtype('uint64'), numpy.dtype('int64'), pandas.UInt64Dtype(), pandas.Int64Dtype())
    if series.dtype in sid_dtypes:
        # We have a series of sids; no need to iterate. Send the whole array to pystare.
        return pystare.intersects(other, series, method)

    intersects = []
    for sids in series:
        # If we do method 1, larger item first is faster
        if len(sids) < len(other):
            first, second = other, sids
        else:
            first, second = sids, other
        intersects.append(pystare.intersects(first, second, method).any())
    # Explicit dtype keeps the empty result boolean as well.
    return numpy.array(intersects, dtype=bool)
[19]:
# Convert the granule cover SIDs to a numpy array for the intersection tests below.
a = numpy.array(vnp03_cover)
[20]:
# Time the serial path (n_workers=1): which states intersect the granule cover?
start = datetime.datetime.now()
intersects = series_intersects(a, us['sids'] , 1, 1)
print(datetime.datetime.now() - start)
0:00:00.005978
[21]:
# Time the same test with 5 workers (dask 'processes' scheduler).
start = datetime.datetime.now()
intersects = series_intersects(a, us['sids'], 1, 5)
print(datetime.datetime.now() - start)
0:00:01.347456
[22]:
# Display the boolean mask returned by the previous call.
intersects
[22]:
array([ True, False, True, True, True, False, True, False, True,
False, False, False, True, True, False, True, True, True,
True, True, False, False, False, True, True])
[23]:
# Use the mask to list the names of the states that intersect the cover.
print(list(us[intersects]['NAME']))
['West Virginia', 'Illinois', 'Minnesota', 'Maryland', 'Idaho', 'North Carolina', 'New Mexico', 'California', 'Wisconsin', 'Oregon', 'Nebraska', 'Pennsylvania', 'Washington', 'Utah', 'Ohio']
iFOVs#
[42]:
# Read the granule as a dataframe, passing sidecar=True and xy=True to read_granule().
import copy
filepath = '../tests/data/granules/viirs/VNP03DNB.A2022308.1930.002.2022309041547.nc'
vnp03 = starepandas.read_granule(filepath, sidecar=True, xy=True)
# Keep only rows with y < 1500; copy so the subset is independent of the full frame.
vnp03 = copy.copy(vnp03[vnp03.y<1500])
[55]:
# Take the SIDs of the single row whose NAME is 'Utah' and display them.
state_sids = us[us['NAME']=='Utah']['sids'].iloc[0]
state_sids
[55]:
array([3341811660997263366, 3341952398485618694, 3342093135973974022,
3342374610950684678, 3342515348439040006, 3342656085927395334,
3342937560904105990, 3343078298392461318, 3343219035880816646,
3343359773369171974, 3343500510857527302, 3343641248345882630,
3343781985834237958, 3349130010391740422, 3349270747880095750,
3349411485368451078, 3349552222856806406, 3349833697833517062,
3349974435321872390, 3350115172810227718, 3350396647786938374,
3350537385275293702])
[56]:
# Time the serial test (default n_workers=1) of every granule row against the Utah SIDs.
start = datetime.datetime.now()
intersects = series_intersects(state_sids, vnp03['sids'])
print(datetime.datetime.now() - start)
0:00:16.396078
[50]:
# Time the same test split over 60 partitions.
start = datetime.datetime.now()
intersects = series_intersects(state_sids, vnp03['sids'], 1, 60)
print(datetime.datetime.now() - start)
0:00:11.133490
[57]:
# Fraction of granule rows flagged as intersecting: number of True entries divided by the row count.
intersects.sum()/vnp03.index.size
[57]:
0.13730413385826773
[59]:
# Select the granule rows for which the mask is True.
vnp03[intersects]
[59]:
| | sids | x | y | moon_illumination_fraction | land_water_mask | quality_flag |
|---|---|---|---|---|---|---|
| 1194175 | 3341882033058642606 | 3423 | 293 | 85.869995 | 1 | 0 |
| 1194176 | 3341882036842615342 | 3424 | 293 | 85.869995 | 1 | 0 |
| 1194177 | 3342163511619860078 | 3425 | 293 | 85.869995 | 1 | 0 |
| 1194178 | 3342163534744678030 | 3426 | 293 | 85.869995 | 1 | 0 |
| 1194179 | 3342163530200315950 | 3427 | 293 | 85.869995 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| 6095648 | 3349284902051811182 | 3712 | 1499 | 85.809998 | 1 | 0 |
| 6095649 | 3349284897711474958 | 3713 | 1499 | 85.809998 | 1 | 0 |
| 6095650 | 3349284827668942798 | 3714 | 1499 | 85.809998 | 1 | 0 |
| 6095651 | 3349284830587971054 | 3715 | 1499 | 85.809998 | 1 | 0 |
| 6095652 | 3349284807188521198 | 3716 | 1499 | 85.809998 | 1 | 0 |
837006 rows × 6 columns
High level#
[39]:
# Same intersection test through the dataframe's own method, with n_partitions=4.
intersects = vnp03.stare_intersects(other=state_sids, method=1, n_partitions=4)
[ ]: