{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"# Manual Join"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"execution": {
"iopub.execute_input": "2022-09-15T05:07:40.166270Z",
"iopub.status.busy": "2022-09-15T05:07:40.165762Z",
"iopub.status.idle": "2022-09-15T05:07:41.803358Z",
"shell.execute_reply": "2022-09-15T05:07:41.801917Z"
},
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import starepandas\n",
"import geopandas\n",
"import pandas"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"execution": {
"iopub.execute_input": "2022-09-15T05:07:41.808861Z",
"iopub.status.busy": "2022-09-15T05:07:41.808187Z",
"iopub.status.idle": "2022-09-15T05:07:41.832162Z",
"shell.execute_reply": "2022-09-15T05:07:41.830968Z"
},
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"cities = ['Buenos Aires', 'Brasilia', 'Santiago', \n",
" 'Bogota', 'Caracas', 'Sao Paulo', 'Bridgetown']\n",
"\n",
"latitudes = [-34.58, -15.78, -33.45, 4.60, 10.48, -23.55, 13.1]\n",
"longitudes = [-58.66, -47.91, -70.66, -74.08, -66.86, -46.63, -59.62]\n",
"data = {'City': cities, \n",
" 'Latitude': latitudes, 'Longitude': longitudes}\n",
"\n",
"cities = starepandas.STAREDataFrame(data)\n",
"stare = starepandas.sids_from_xy(cities.Longitude, cities.Latitude, level=27)\n",
"geom = geopandas.points_from_xy(cities.Longitude, cities.Latitude, crs='EPSG:4326')\n",
"cities.set_sids(stare, inplace=True)\n",
"cities.set_geometry(geom, inplace=True)\n",
"cities.add_trixels(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"execution": {
"iopub.execute_input": "2022-09-15T05:07:41.835341Z",
"iopub.status.busy": "2022-09-15T05:07:41.834938Z",
"iopub.status.idle": "2022-09-15T05:07:47.390205Z",
"shell.execute_reply": "2022-09-15T05:07:47.389556Z"
},
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"countries = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))\n",
"countries = countries.sort_values(by='name')\n",
"countries['geom_simple'] = countries.simplify(0.002)\n",
"countries.set_geometry('geom_simple', inplace=True)\n",
"samerica = countries[countries.continent=='South America']\n",
"\n",
"sids = starepandas.sids_from_gdf(samerica, level=9, force_ccw=True)\n",
"samerica = starepandas.STAREDataFrame(samerica, sids=sids)\n",
"trixels = samerica.make_trixels()\n",
"samerica.set_trixels(trixels, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"execution": {
"iopub.execute_input": "2022-09-15T05:07:47.392753Z",
"iopub.status.busy": "2022-09-15T05:07:47.392556Z",
"iopub.status.idle": "2022-09-15T05:07:47.408242Z",
"shell.execute_reply": "2022-09-15T05:07:47.407795Z"
},
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" _key_left | \n",
" _key_right | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 9 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 29 | \n",
"
\n",
" \n",
" | 2 | \n",
" 5 | \n",
" 29 | \n",
"
\n",
" \n",
" | 3 | \n",
" 2 | \n",
" 10 | \n",
"
\n",
" \n",
" | 4 | \n",
" 3 | \n",
" 32 | \n",
"
\n",
" \n",
" | 5 | \n",
" 4 | \n",
" 40 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" _key_left _key_right\n",
"0 0 9\n",
"1 1 29\n",
"2 5 29\n",
"3 2 10\n",
"4 3 32\n",
"5 4 40"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"left_df = cities\n",
"right_df = samerica\n",
"\n",
"left_key = []\n",
"right_key = []\n",
"for i, row in right_df.iterrows(): \n",
" k = left_df.index[left_df.intersects(row.geometry)]\n",
" left_key.extend(list(k))\n",
" right_key.extend([i]*len(k))\n",
"\n",
"indices = pandas.DataFrame({'_key_left': left_key, \n",
" '_key_right':right_key})\n",
"indices"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2022-09-15T05:07:47.410592Z",
"iopub.status.busy": "2022-09-15T05:07:47.410314Z",
"iopub.status.idle": "2022-09-15T05:07:47.422430Z",
"shell.execute_reply": "2022-09-15T05:07:47.422005Z"
},
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" _key_left | \n",
" _key_right | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 9 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 29 | \n",
"
\n",
" \n",
" | 2 | \n",
" 5 | \n",
" 29 | \n",
"
\n",
" \n",
" | 3 | \n",
" 2 | \n",
" 10 | \n",
"
\n",
" \n",
" | 4 | \n",
" 3 | \n",
" 32 | \n",
"
\n",
" \n",
" | 5 | \n",
" 4 | \n",
" 40 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" _key_left _key_right\n",
"0 0 9\n",
"1 1 29\n",
"2 5 29\n",
"3 2 10\n",
"4 3 32\n",
"5 4 40"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"left_key = []\n",
"right_key = []\n",
"\n",
"for i, row in right_df.iterrows(): \n",
" k = left_df.index[left_df.stare_intersects(row.sids)]\n",
" left_key.extend(list(k))\n",
" right_key.extend([i]*len(k))\n",
"\n",
"indices = pandas.DataFrame({'_key_left': left_key, \n",
" '_key_right': right_key})\n",
"\n",
"indices"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"execution": {
"iopub.execute_input": "2022-09-15T05:07:47.424523Z",
"iopub.status.busy": "2022-09-15T05:07:47.424287Z",
"iopub.status.idle": "2022-09-15T05:07:47.432888Z",
"shell.execute_reply": "2022-09-15T05:07:47.432011Z"
},
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Inner join\n",
"joined = left_df\n",
"joined = joined.merge(indices, left_index=True, right_index=True)\n",
"joined = joined.merge(right_df, left_on='_key_right', right_index=True)\n",
"joined = joined.set_index('_key_left')\n",
"joined = joined.drop([\"_key_right\"], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"execution": {
"iopub.execute_input": "2022-09-15T05:07:47.435369Z",
"iopub.status.busy": "2022-09-15T05:07:47.435099Z",
"iopub.status.idle": "2022-09-15T05:07:47.445857Z",
"shell.execute_reply": "2022-09-15T05:07:47.445322Z"
},
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Left Join\n",
"index_left = 'index_left'\n",
"left_df.index = left_df.index.rename(index_left)\n",
"left_df = left_df.reset_index()\n",
"\n",
"joined = left_df\n",
"joined = joined.merge(indices, left_index=True, right_index=True, how=\"left\")\n",
"joined = joined.merge(right_df.drop(right_df.geometry.name, axis=1),\n",
" how=\"left\",\n",
" left_on=\"_key_right\",\n",
" right_index=True,\n",
" suffixes=(\"_left\", \"_right\"))\n",
"joined = joined.set_index(index_left)\n",
"joined = joined.drop([\"_key_right\"], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}