Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite MVT to take constant instead of linear memory #737

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 88 additions & 34 deletions src/dso_api/dynamic_api/views/mvt.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
"""Mapbox Vector Tiles (MVT) views of geographic datasets."""

import itertools
import logging
import time

import mercantile
from django.contrib.gis.db.models import functions
from django.core.exceptions import FieldDoesNotExist, PermissionDenied
from django.http import Http404
from django.db import connection
from django.http import Http404, HttpResponse, StreamingHttpResponse
from django.urls.base import reverse
from django.views import View
from django.views.generic import TemplateView
from schematools.contrib.django.models import Dataset
from schematools.exceptions import SchemaObjectNotFound
from schematools.types import DatasetTableSchema
from vectortiles.postgis.views import MVTView
from vectortiles.postgis.functions import AsMVTGeom, MakeEnvelope

from ..datasets import get_active_datasets
from ..permissions import CheckPermissionsMixin
Expand Down Expand Up @@ -93,9 +96,15 @@ def get_context_data(self, **kwargs):
return context


class DatasetMVTView(CheckPermissionsMixin, MVTView):
class DatasetMVTView(CheckPermissionsMixin, View):
"""An MVT view for a single dataset."""

_CONTENT_TYPE = "application/vnd.mapbox-vector-tile"
_EXTENT = 4096 # Default extent for MVT.
# Name of temporary MVT row that we add to our queryset.
# No field name starting with an underscore ever occurs in our datasets.
_MVT_ROW = "_geom_prepared_for_mvt"

def setup(self, request, *args, **kwargs):
super().setup(request, *args, **kwargs)
from ..urls import router
Expand All @@ -108,58 +117,103 @@ def setup(self, request, *args, **kwargs):
except KeyError:
raise Http404(f"Invalid table: {dataset_name}.{table_name}") from None

schema: DatasetTableSchema = model.table_schema()
try:
self._main_geo = schema.main_geometry_field.python_name
except SchemaObjectNotFound as e:
raise FieldDoesNotExist(f"No field named '{schema.main_geometry}'") from e

self.model = model
self.check_permissions(request, [self.model])

def get(self, request, *args, **kwargs):
kwargs.pop("dataset_name")
kwargs.pop("table_name")
self.z = kwargs["z"]

t0 = time.perf_counter_ns()
result = super().get(request, *args, **kwargs)
logging.info(
"retrieved tile for %s (%d bytes) in %.3fs",
request.path,
len(result.content),
(time.perf_counter_ns() - t0) * 1e-9,
)
return result
x, y, z = kwargs["x"], kwargs["y"], kwargs["z"]

def get_queryset(self):
return self.model.objects.all()
tile = self._stream_tile(x, y, z)
try:
chunk = next(tile)
except StopIteration:
return HttpResponse(content=b"", content_type=self._CONTENT_TYPE, status=204)
tile = itertools.chain((chunk,), tile)

@property
def vector_tile_fields(self) -> tuple:
return StreamingHttpResponse(
streaming_content=tile, content_type=self._CONTENT_TYPE, status=200
)

def _stream_tile(self, x: int, y: int, z: int):
qs = self.model.objects.all()
bbox = MakeEnvelope(*mercantile.xy_bounds(x, y, z), 3857)
qs = qs.filter(
**{
f"{self._main_geo}__intersects": bbox,
}
)
# Add MVT row and restrict to the fields we want in the response.
qs = qs.annotate(
**{self._MVT_ROW: AsMVTGeom(functions.Transform(self._main_geo, 3857), bbox)}
)
qs = qs.values(self._MVT_ROW, *self._property_fields(z))

sql, params = qs.query.sql_with_params()
with connection.cursor() as cursor:
# This hairy query generates the MVT tile using ST_AsMVT, then breaks it up into rows
# that we can stream to the client. We do this because the tile is a potentially very
# large bytea in PostgreSQL, which psycopg would otherwise consume in its entirety
# before passing it on to us. Since psycopg also needs to keep the hex-encoded
# PostgreSQL wire format representation of the tile in memory while decoding it, it
# needs 3*n memory to decode an n-byte tile, and tiles can be up to hundreds of
# megabytes.
#
# To make matters worse, what psycopg returns is a memoryview, and Django accepts
# memoryviews just fine but casts them to bytes objects, meaning the entire tile gets
# copied again. That happens after the hex version has been decoded, but it still
# means a slow client can keep our memory use at 2*n for the duration of the request.
CHUNK_SIZE = 8192
cursor.execute( # nosec
f"""
WITH mvt AS (
SELECT ST_AsMVT(_sub.*, %s, %s, %s) FROM ({sql}) as _sub
)
SELECT * FROM (
/* This is SQL for range(1, len(x), CHUNK_SIZE), sort of. */
WITH RECURSIVE chunk(i) AS (
VALUES (1)
UNION ALL
SELECT chunk.i+{CHUNK_SIZE} FROM chunk, mvt
WHERE i < octet_length(mvt.ST_AsMVT)
),
sorted AS (SELECT * FROM chunk ORDER BY i)
SELECT substring(mvt.ST_AsMVT FROM sorted.i FOR {CHUNK_SIZE}) FROM mvt, sorted
) AS chunked_mvt WHERE octet_length(substring) > 0
""",
params=["default", self._EXTENT, self._MVT_ROW, *params],
)
for chunk in cursor:
yield chunk[0]

def _property_fields(self, z) -> tuple:
"""Returns fields to include in the tile as MVT properties alongside the geometry."""
# If we are zoomed far out (low z), only fetch the geometry field for a smaller payload.
# The cutoff is arbitrary. Play around with
# https://www.maptiler.com/google-maps-coordinates-tile-bounds-projection/#14/4.92/52.37
# to get a feel for the MVT zoom levels and how much detail a single tile should contain.
if self.z < 15:
if z < 15:
return ()

geom_name = self.vector_tile_geom_name
user_scopes = self.request.user_scopes
return tuple(
f.name
for f in self.model._meta.get_fields()
if f.name != geom_name and user_scopes.has_field_access(self.model.get_field_schema(f))
if f.name != self._main_geo
and user_scopes.has_field_access(self.model.get_field_schema(f))
)

@property
def vector_tile_geom_name(self) -> str:
schema: DatasetTableSchema = self.model.table_schema()
try:
return schema.main_geometry_field.python_name
except SchemaObjectNotFound as e:
raise FieldDoesNotExist(f"No field named '{schema.main_geometry}'") from e

def check_permissions(self, request, models) -> None:
"""Override CheckPermissionsMixin to add extra checks"""
super().check_permissions(request, models)

# Check whether the geometry field can be accessed, otherwise reading MVT is pointless.
model_field = self.model._meta.get_field(self.vector_tile_geom_name)
model_field = self.model._meta.get_field(self._main_geo)
field_schema = self.model.get_field_schema(model_field)
if not self.request.user_scopes.has_field_access(field_schema):
raise PermissionDenied()
27 changes: 22 additions & 5 deletions src/tests/test_dynamic_api/test_views_mvt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import mapbox_vector_tile
import pytest
from django.contrib.gis.geos import Point
from django.http.response import HttpResponse, HttpResponseBase, StreamingHttpResponse
from django.utils.timezone import make_aware

CONTENT_TYPE = "application/vnd.mapbox-vector-tile"
Expand Down Expand Up @@ -130,7 +131,7 @@ def test_mvt_content(api_client, afval_container_model, afval_cluster, filled_ro
assert response.status_code == 200
assert response["Content-Type"] == CONTENT_TYPE

vt = mapbox_vector_tile.decode(response.content)
vt = decode_mvt(response)

assert vt == {
"default": {
Expand Down Expand Up @@ -160,11 +161,10 @@ def test_mvt_content(api_client, afval_container_model, afval_cluster, filled_ro
# Try again at a higher zoom level. We should get the same features, but with no properties.
url = "/v1/mvt/afvalwegingen/containers/14/8415/5384.pbf"
response = api_client.get(url)
# MVT view returns 204 when the tile is empty.
assert response.status_code == 200
assert response["Content-Type"] == CONTENT_TYPE

vt = mapbox_vector_tile.decode(response.content)
vt = decode_mvt(response)

assert vt == {
"default": {
Expand All @@ -181,6 +181,13 @@ def test_mvt_content(api_client, afval_container_model, afval_cluster, filled_ro
}
}

# MVT view returns 204 when the tile is empty.
url = "/v1/mvt/afvalwegingen/containers/14/0/0.pbf"
response = api_client.get(url)
assert response.status_code == 204
assert response["Content-Type"] == CONTENT_TYPE
assert response.content == b""


@pytest.mark.django_db
def test_mvt_forbidden(api_client, geometry_auth_thing, fetch_auth_token, filled_router):
Expand Down Expand Up @@ -226,7 +233,7 @@ def test_mvt_model_auth(api_client, geometry_auth_model, fetch_auth_token, fille
token = fetch_auth_token(["TEST/GEO", "TEST/META"])
response = api_client.get(url, HTTP_AUTHORIZATION=f"Bearer {token}")
assert response.status_code == 200
assert mapbox_vector_tile.decode(response.content) == content
assert decode_mvt(response) == content

# With only the GEO scope, we still get a 200 response
# but we lose access to the metadata field.
Expand All @@ -235,4 +242,14 @@ def test_mvt_model_auth(api_client, geometry_auth_model, fetch_auth_token, fille
assert response.status_code == 200

del content["default"]["features"][0]["properties"]["metadata"]
assert mapbox_vector_tile.decode(response.content) == content
assert decode_mvt(response) == content


def decode_mvt(response: HttpResponseBase) -> bytes:
if isinstance(response, HttpResponse):
content = response.content
elif isinstance(response, StreamingHttpResponse):
content = b"".join(response.streaming_content)
else:
raise TypeError(f"unexpected {type(response)}")
return mapbox_vector_tile.decode(content)