Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: support writing the geoarrow-based encodings of GeoParquet #3275

Merged
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ New features and improvements:
the specified columns (#3101).
- Added support to ``read_file`` for the ``mask`` keyword for the pyogrio engine (#3062).
- Added support to ``read_file`` for the ``columns`` keyword for the fiona engine (#3133).
- Added support to ``read_parquet`` for reading files using the GeoArrow-based native geometry encoding of GeoParquet 1.1 (#3253).
- Added support to ``to_parquet`` and ``read_parquet`` for writing and reading files
using the GeoArrow-based native geometry encoding of GeoParquet 1.1 (#3253, #3275).
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
- Add `sort` keyword to `clip` method for GeoSeries and GeoDataFrame to allow optional
preservation of the original order of observations. (#3233)
- Added `show_bbox`, `drop_id` and `to_wgs84` arguments to allow further customization of
Expand Down
10 changes: 8 additions & 2 deletions geopandas/geodataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1228,7 +1228,7 @@
"""
from geopandas.io.geoarrow import ArrowTable, geopandas_to_arrow

table = geopandas_to_arrow(
table, _ = geopandas_to_arrow(
self,
index=index,
geometry_encoding=geometry_encoding,
Expand All @@ -1242,8 +1242,9 @@
path,
index=None,
compression="snappy",
schema_version=None,
geometry_encoding="WKB",

Check warning on line 1245 in geopandas/geodataframe.py

View check run for this annotation

Codecov / codecov/patch

geopandas/geodataframe.py#L1245

Added line #L1245 was not covered by tests
write_covering_bbox=False,
schema_version=None,
**kwargs,
):
"""Write a GeoDataFrame to the Parquet format.
Expand All @@ -1265,6 +1266,10 @@
output except `RangeIndex` which is stored as metadata only.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
geometry_encoding : {'WKB', 'geoarrow'}, default 'WKB'
The encoding to use for the geometry columns. Defaults to "WKB"
for maximum interoperability. Specify "geoarrow" to use one of the
native GeoArrow-based single-geometry type encodings.
schema_version : {'0.1.0', '0.4.0', '1.0.0', '1.1.0', None}
GeoParquet specification version; if not provided, defaults to '1.1.0'
when a GeoArrow-based ``geometry_encoding`` is used, and otherwise to
the library's default version.
Expand Down Expand Up @@ -1303,6 +1308,7 @@
self,
path,
compression=compression,
geometry_encoding=geometry_encoding,
index=index,
schema_version=schema_version,
write_covering_bbox=write_covering_bbox,
Expand Down
51 changes: 39 additions & 12 deletions geopandas/io/arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"multipolygon",
]
SUPPORTED_ENCODINGS = ["WKB"] + GEOARROW_ENCODINGS

# reference: https://github.com/opengeospatial/geoparquet

# Metadata structure:
Expand Down Expand Up @@ -110,7 +111,9 @@
return sorted([_geometry_type_names[idx] for idx in geometry_types])


def _create_metadata(df, schema_version=None, write_covering_bbox=False):
def _create_metadata(
df, schema_version=None, geometry_encoding=None, write_covering_bbox=False
):
"""Create and encode geo metadata dict.

Parameters
Expand All @@ -128,8 +131,13 @@
-------
dict
"""

schema_version = schema_version or METADATA_VERSION
if schema_version is None:
if geometry_encoding and any(

Check warning on line 135 in geopandas/io/arrow.py

View check run for this annotation

Codecov / codecov/patch

geopandas/io/arrow.py#L134-L135

Added lines #L134 - L135 were not covered by tests
encoding != "WKB" for encoding in geometry_encoding.values()
):
schema_version = "1.1.0"

Check warning on line 138 in geopandas/io/arrow.py

View check run for this annotation

Codecov / codecov/patch

geopandas/io/arrow.py#L138

Added line #L138 was not covered by tests
else:
schema_version = METADATA_VERSION

Check warning on line 140 in geopandas/io/arrow.py

View check run for this annotation

Codecov / codecov/patch

geopandas/io/arrow.py#L140

Added line #L140 was not covered by tests

if schema_version not in SUPPORTED_VERSIONS:
raise ValueError(
Expand Down Expand Up @@ -158,7 +166,7 @@
_remove_id_from_member_of_ensembles(crs)

column_metadata[col] = {
"encoding": "WKB",
"encoding": geometry_encoding[col],
"crs": crs,
geometry_types_name: geometry_types,
}
Expand All @@ -181,7 +189,7 @@
return {
"primary_column": df._geometry_column_name,
"columns": column_metadata,
"version": schema_version or METADATA_VERSION,
"version": schema_version,
"creator": {"library": "geopandas", "version": geopandas.__version__},
}

Expand Down Expand Up @@ -315,7 +323,13 @@
raise ValueError("Metadata for bbox column is malformed.")


def _geopandas_to_arrow(df, index=None, schema_version=None, write_covering_bbox=None):
def _geopandas_to_arrow(
df,
index=None,
geometry_encoding="WKB",
schema_version=None,
write_covering_bbox=None,
):
"""
Helper function with main, shared logic for to_parquet/to_feather.
"""
Expand All @@ -325,13 +339,20 @@

_validate_dataframe(df)

# create geo metadata before altering incoming data frame
geo_metadata = _create_metadata(
df, schema_version=schema_version, write_covering_bbox=write_covering_bbox
)
if schema_version is not None:
if geometry_encoding != "WKB" and schema_version != "1.1.0":
raise ValueError(

Check warning on line 344 in geopandas/io/arrow.py

View check run for this annotation

Codecov / codecov/patch

geopandas/io/arrow.py#L342-L344

Added lines #L342 - L344 were not covered by tests
"'geoarrow' encoding is only supported with schema version >= 1.1.0"
)

table = geopandas_to_arrow(
df, geometry_encoding="WKB", index=index, interleaved=True
table, geometry_encoding_dict = geopandas_to_arrow(

Check warning on line 348 in geopandas/io/arrow.py

View check run for this annotation

Codecov / codecov/patch

geopandas/io/arrow.py#L348

Added line #L348 was not covered by tests
df, geometry_encoding=geometry_encoding, index=index, interleaved=False
)
geo_metadata = _create_metadata(

Check warning on line 351 in geopandas/io/arrow.py

View check run for this annotation

Codecov / codecov/patch

geopandas/io/arrow.py#L351

Added line #L351 was not covered by tests
df,
schema_version=schema_version,
geometry_encoding=geometry_encoding_dict,
write_covering_bbox=write_covering_bbox,
)

if write_covering_bbox:
Expand All @@ -355,6 +376,7 @@
path,
index=None,
compression="snappy",
geometry_encoding="WKB",
schema_version=None,
write_covering_bbox=False,
**kwargs,
Expand Down Expand Up @@ -383,6 +405,10 @@
output except `RangeIndex` which is stored as metadata only.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
geometry_encoding : {'WKB', 'geoarrow'}, default 'WKB'
The encoding to use for the geometry columns. Defaults to "WKB"
for maximum interoperability. Specify "geoarrow" to use one of the
native GeoArrow-based single-geometry type encodings.
schema_version : {'0.1.0', '0.4.0', '1.0.0', '1.1.0', None}
GeoParquet specification version; if not provided, defaults to '1.1.0'
when a GeoArrow-based ``geometry_encoding`` is used, and otherwise to
the library's default version.
Expand Down Expand Up @@ -412,6 +438,7 @@
table = _geopandas_to_arrow(
df,
index=index,
geometry_encoding=geometry_encoding,
schema_version=schema_version,
write_covering_bbox=write_covering_bbox,
)
Expand Down
10 changes: 9 additions & 1 deletion geopandas/io/geoarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@

table = pa.Table.from_pandas(df_attr, preserve_index=index)

geometry_encoding_dict = {}

Check warning on line 124 in geopandas/io/geoarrow.py

View check run for this annotation

Codecov / codecov/patch

geopandas/io/geoarrow.py#L124

Added line #L124 was not covered by tests

if geometry_encoding.lower() == "geoarrow":
if Version(pa.__version__) < Version("10.0.0"):
raise ValueError("Converting to 'geoarrow' requires pyarrow >= 10.0.")
Expand All @@ -135,6 +137,11 @@
interleaved=interleaved,
)
table = table.set_column(i, field, geom_arr)
geometry_encoding_dict[col] = (

Check warning on line 140 in geopandas/io/geoarrow.py

View check run for this annotation

Codecov / codecov/patch

geopandas/io/geoarrow.py#L140

Added line #L140 was not covered by tests
field.metadata[b"ARROW:extension:name"]
.decode()
.removeprefix("geoarrow.")
)

elif geometry_encoding.lower() == "wkb":
# Encode all geometry columns to WKB
Expand All @@ -143,12 +150,13 @@
np.asarray(df[col].array), field_name=col, crs=df[col].crs
)
table = table.set_column(i, field, wkb_arr)
geometry_encoding_dict[col] = "WKB"

Check warning on line 153 in geopandas/io/geoarrow.py

View check run for this annotation

Codecov / codecov/patch

geopandas/io/geoarrow.py#L153

Added line #L153 was not covered by tests

else:
raise ValueError(
f"Expected geometry encoding 'WKB' or 'geoarrow' got {geometry_encoding}"
)
return table
return table, geometry_encoding_dict

Check warning on line 159 in geopandas/io/geoarrow.py

View check run for this annotation

Codecov / codecov/patch

geopandas/io/geoarrow.py#L159

Added line #L159 was not covered by tests


def construct_wkb_array(
Expand Down
39 changes: 34 additions & 5 deletions geopandas/io/tests/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def file_format(request):

def test_create_metadata(naturalearth_lowres):
df = read_file(naturalearth_lowres)
metadata = _create_metadata(df)
metadata = _create_metadata(df, geometry_encoding={"geometry": "WKB"})

assert isinstance(metadata, dict)
assert metadata["version"] == METADATA_VERSION
Expand All @@ -89,6 +89,10 @@ def test_create_metadata(naturalearth_lowres):
assert metadata["creator"]["library"] == "geopandas"
assert metadata["creator"]["version"] == geopandas.__version__

# specifying non-WKB encoding sets default schema to 1.1.0
metadata = _create_metadata(df, geometry_encoding={"geometry": "point"})
assert metadata["version"] == "1.1.0"


def test_create_metadata_with_z_geometries():
geometry_types = [
Expand Down Expand Up @@ -133,18 +137,18 @@ def test_create_metadata_with_z_geometries():
],
},
)
metadata = _create_metadata(df)
metadata = _create_metadata(df, geometry_encoding={"geometry": "WKB"})
assert sorted(metadata["columns"]["geometry"]["geometry_types"]) == sorted(
geometry_types
)
# only 3D geometries
metadata = _create_metadata(df.iloc[1::2])
metadata = _create_metadata(df.iloc[1::2], geometry_encoding={"geometry": "WKB"})
assert all(
geom_type.endswith(" Z")
for geom_type in metadata["columns"]["geometry"]["geometry_types"]
)

metadata = _create_metadata(df.iloc[5:7])
metadata = _create_metadata(df.iloc[5:7], geometry_encoding={"geometry": "WKB"})
assert metadata["columns"]["geometry"]["geometry_types"] == [
"MultiPolygon",
"Polygon Z",
Expand All @@ -169,11 +173,17 @@ def test_crs_metadata_datum_ensemble():
assert pyproj.CRS(crs_json) == crs


def test_write_metadata_invalid_spec_version():
def test_write_metadata_invalid_spec_version(tmp_path):
gdf = geopandas.GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="EPSG:4326")
with pytest.raises(ValueError, match="schema_version must be one of"):
_create_metadata(gdf, schema_version="invalid")

with pytest.raises(
ValueError,
match="'geoarrow' encoding is only supported with schema version >= 1.1.0",
):
gdf.to_parquet(tmp_path, schema_version="1.0.0", geometry_encoding="geoarrow")


def test_encode_metadata():
metadata = {"a": "b"}
Expand Down Expand Up @@ -340,6 +350,7 @@ def test_to_parquet_does_not_pass_engine_along(mock_to_parquet):
df,
"",
compression="snappy",
geometry_encoding="WKB",
index=None,
schema_version=None,
write_covering_bbox=False,
Expand Down Expand Up @@ -997,6 +1008,24 @@ def test_read_parquet_geoarrow(geometry_type):
assert_geodataframe_equal(result, expected, check_crs=True)


@pytest.mark.parametrize(
    "geometry_type",
    ["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
)
def test_geoarrow_roundtrip(tmp_path, geometry_type):
    """Write a WKB-encoded fixture with the 'geoarrow' encoding and read it back.

    For every single-geometry type, the frame read back from the
    GeoArrow-encoded file must equal the frame that was written,
    including its CRS.
    """
    # Reference data shipped with the test suite, stored as WKB.
    fixture_name = f"data-{geometry_type}-encoding_wkb.parquet"
    source_path = DATA_PATH / "arrow" / "geoparquet" / fixture_name
    df = geopandas.read_parquet(source_path)

    # Re-encode the geometries natively on write, then load the result again.
    output_path = tmp_path / "test.parquet"
    df.to_parquet(output_path, geometry_encoding="geoarrow")
    result = geopandas.read_parquet(output_path)

    assert_geodataframe_equal(result, df, check_crs=True)


def test_to_parquet_bbox_structure_and_metadata(tmpdir, naturalearth_lowres):
# check metadata being written for covering.
from pyarrow import parquet
Expand Down