Skip to content

Commit

Permalink
Merge pull request #508 from bento-platform/features/katsu-config
Browse files Browse the repository at this point in the history
feat: project and dataset scoped discovery config
  • Loading branch information
v-rocheleau authored Jul 16, 2024
2 parents 2111abd + 7ea3925 commit 5848709
Show file tree
Hide file tree
Showing 38 changed files with 1,251 additions and 450 deletions.
24 changes: 24 additions & 0 deletions chord_metadata_service/chord/migrations/0008_v8_0_0.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Generated by Django 4.2.11 on 2024-04-24 14:10

import chord_metadata_service.restapi.validators
from django.db import migrations, models


class Migration(migrations.Migration):
    # Adds a nullable, blank-able `discovery` JSON field to both the `dataset`
    # and `project` models of the `chord` app.

    # Runs after the previous `chord` migration.
    dependencies = [
        ('chord', '0007_v7_0_0'),
    ]

    operations = [
        # dataset.discovery: validated by JsonSchemaValidator against the schema
        # literal inlined below (presumably a snapshot of DISCOVERY_SCHEMA taken
        # when the migration was generated -- confirm before editing by hand).
        migrations.AddField(
            model_name='dataset',
            name='discovery',
            field=models.JSONField(blank=True, help_text='Discovery configuration', null=True, validators=[chord_metadata_service.restapi.validators.JsonSchemaValidator({'$id': '/chord_metadata_service/discovery/discovery', 'additionalProperties': False, 'description': 'Discovery configuration for public fields/search', 'properties': {'fields': {'$id': '/chord_metadata_service/discovery/discovery_named_fields', 'additionalProperties': False, 'description': 'Intermediate schema, enforces field schema with flexible names.', 'patternProperties': {'^.*$': {'$id': '/chord_metadata_service/discovery/discovery_field', 'additionalProperties': False, 'description': 'Field configuration', 'properties': {'config': {'properties': {'bin_size': {'type': 'number'}, 'bins': {'items': {'type': 'number'}, 'type': 'array'}, 'enum': {'oneOf': [{'items': {'type': 'string'}, 'type': 'array'}, {'type': 'null'}]}, 'maximum': {'type': 'number'}, 'minimum': {'type': 'number'}, 'taper_left': {'type': 'number'}, 'taper_right': {'type': 'number'}, 'units': {'type': 'string'}}, 'type': 'object'}, 'datatype': {'enum': ['number', 'string', 'date'], 'type': 'string'}, 'description': {'type': 'string'}, 'group_by': {'type': 'string'}, 'group_by_value': {'type': 'string'}, 'mapping': {'type': 'string'}, 'mapping_for_search_filter': {'type': 'string'}, 'title': {'type': 'string'}, 'value_mapping': {'type': 'string'}}, 'type': 'object'}}, 'type': 'object'}, 'overview': {'description': 'List of overview sections', 'items': {'$id': '/chord_metadata_service/discovery/discovery_overview', 'additionalProperties': False, 'description': 'An overview section containing charts', 'properties': {'charts': {'items': {'$id': '/chord_metadata_service/discovery/discovery_overview_chart', 'additionalProperties': False, 'description': 'Associates a field name with a chart type for overview display', 'properties': {'chart_type': {'enum': ['bar', 'pie'], 'type': 'string'}, 'field': {'type': 'string'}}, 'type': 'object'}, 'type':
            'array'}, 'section_title': {'type': 'string'}}, 'type': 'object'}, 'type': 'array'}, 'rules': {'properties': {'count_threshold': {'type': 'integer'}, 'max_query_parameters': {'type': 'integer'}}, 'type': 'object'}, 'search': {'items': {'$id': '/chord_metadata_service/discovery/discovery_search', 'additionalProperties': False, 'description': 'Groups search fields by section.', 'properties': {'fields': {'items': {'type': 'string'}, 'type': 'array'}, 'section_title': {'type': 'string'}}, 'type': 'object'}, 'type': 'array'}}, 'type': 'object'}, formats=None)]),
        ),
        # project.discovery: same field options and the same inlined schema
        # literal as the dataset field above.
        migrations.AddField(
            model_name='project',
            name='discovery',
            field=models.JSONField(blank=True, help_text='Discovery configuration', null=True, validators=[chord_metadata_service.restapi.validators.JsonSchemaValidator({'$id': '/chord_metadata_service/discovery/discovery', 'additionalProperties': False, 'description': 'Discovery configuration for public fields/search', 'properties': {'fields': {'$id': '/chord_metadata_service/discovery/discovery_named_fields', 'additionalProperties': False, 'description': 'Intermediate schema, enforces field schema with flexible names.', 'patternProperties': {'^.*$': {'$id': '/chord_metadata_service/discovery/discovery_field', 'additionalProperties': False, 'description': 'Field configuration', 'properties': {'config': {'properties': {'bin_size': {'type': 'number'}, 'bins': {'items': {'type': 'number'}, 'type': 'array'}, 'enum': {'oneOf': [{'items': {'type': 'string'}, 'type': 'array'}, {'type': 'null'}]}, 'maximum': {'type': 'number'}, 'minimum': {'type': 'number'}, 'taper_left': {'type': 'number'}, 'taper_right': {'type': 'number'}, 'units': {'type': 'string'}}, 'type': 'object'}, 'datatype': {'enum': ['number', 'string', 'date'], 'type': 'string'}, 'description': {'type': 'string'}, 'group_by': {'type': 'string'}, 'group_by_value': {'type': 'string'}, 'mapping': {'type': 'string'}, 'mapping_for_search_filter': {'type': 'string'}, 'title': {'type': 'string'}, 'value_mapping': {'type': 'string'}}, 'type': 'object'}}, 'type': 'object'}, 'overview': {'description': 'List of overview sections', 'items': {'$id': '/chord_metadata_service/discovery/discovery_overview', 'additionalProperties': False, 'description': 'An overview section containing charts', 'properties': {'charts': {'items': {'$id': '/chord_metadata_service/discovery/discovery_overview_chart', 'additionalProperties': False, 'description': 'Associates a field name with a chart type for overview display', 'properties': {'chart_type': {'enum': ['bar', 'pie'], 'type': 'string'}, 'field': {'type': 'string'}}, 'type': 'object'}, 'type':
            'array'}, 'section_title': {'type': 'string'}}, 'type': 'object'}, 'type': 'array'}, 'rules': {'properties': {'count_threshold': {'type': 'integer'}, 'max_query_parameters': {'type': 'integer'}}, 'type': 'object'}, 'search': {'items': {'$id': '/chord_metadata_service/discovery/discovery_search', 'additionalProperties': False, 'description': 'Groups search fields by section.', 'properties': {'fields': {'items': {'type': 'string'}, 'type': 'array'}, 'section_title': {'type': 'string'}}, 'type': 'object'}, 'type': 'array'}}, 'type': 'object'}, formats=None)]),
        ),
    ]
8 changes: 7 additions & 1 deletion chord_metadata_service/chord/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
from chord_metadata_service.patients.models import Individual
from chord_metadata_service.phenopackets.models import Biosample, Phenopacket
from chord_metadata_service.resources.models import Resource
from ..restapi.models import SchemaType
from chord_metadata_service.restapi.validators import JsonSchemaValidator
from chord_metadata_service.restapi.models import SchemaType
from chord_metadata_service.discovery.schemas import DISCOVERY_SCHEMA


__all__ = ["Project", "Dataset", "ProjectJsonSchema"]
Expand Down Expand Up @@ -34,6 +36,8 @@ class Project(models.Model):

created = models.DateTimeField(auto_now_add=True)
updated = models.DateTimeField(auto_now=True)
discovery = models.JSONField(blank=True, null=True, help_text="Discovery configuration",
validators=[JsonSchemaValidator(DISCOVERY_SCHEMA)])

def __str__(self):
return f"{self.title} (ID: {self.identifier})"
Expand Down Expand Up @@ -149,6 +153,8 @@ def resources(self):
extra_properties = models.JSONField(blank=True, null=True,
help_text="Extra properties that do not fit in the previous "
"specified attributes.")
discovery = models.JSONField(blank=True, null=True, help_text="Discovery configuration",
validators=[JsonSchemaValidator(DISCOVERY_SCHEMA)])

# -------------------------------------------------------------------------

Expand Down
6 changes: 6 additions & 0 deletions chord_metadata_service/chord/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class DatasetSerializer(GenericSerializer):
"linked_field_sets",
"dats_file",
"project",
"discovery",
)

# noinspection PyMethodMayBeStatic
Expand Down Expand Up @@ -146,6 +147,11 @@ class Meta:

class ProjectSerializer(serializers.ModelSerializer):
# Don't inherit GenericSerializer to not pop empty fields
always_include = (
"title",
"description",
"discovery",
)

datasets = DatasetSerializer(read_only=True, many=True, exclude_when_nested=["project"])
project_schemas = ProjectJsonSchemaSerializer(read_only=True, many=True)
Expand Down
16 changes: 12 additions & 4 deletions chord_metadata_service/chord/views_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,20 @@
OUTPUT_FORMAT_BENTO_SEARCH_RESULT = "bento_search_result"


async def experiment_dataset_summary(_request: DrfRequest, dataset):
return await dt_experiment_summary(Experiment.objects.filter(dataset=dataset), low_counts_censored=False)
async def experiment_dataset_summary(request: DrfRequest, dataset: Dataset):
    """
    Summarize the experiments that belong to `dataset`.

    The experiments are selected with a `dataset=dataset` filter and handed to
    `dt_experiment_summary` with no discovery config and with low-count
    censorship disabled. `request` is accepted but not read here.

    :param request: The incoming DRF request (unused in this function).
    :param dataset: The dataset whose experiments are summarized.
    :return: Whatever `dt_experiment_summary` returns for that queryset.
    """
    dataset_experiments = Experiment.objects.filter(dataset=dataset)
    summary = await dt_experiment_summary(dataset_experiments, discovery=None, low_counts_censored=False)
    return summary


async def phenopacket_dataset_summary(_request: DrfRequest, dataset: Dataset):
return await dt_phenopacket_summary(Phenopacket.objects.filter(dataset=dataset), low_counts_censored=False)
async def phenopacket_dataset_summary(request: DrfRequest, dataset: Dataset):
    """
    Summarize the phenopackets that belong to `dataset`.

    The phenopackets are selected with a `dataset=dataset` filter and handed to
    `dt_phenopacket_summary` with no discovery config and with low-count
    censorship disabled. `request` is accepted but not read here.

    :param request: The incoming DRF request (unused in this function).
    :param dataset: The dataset whose phenopackets are summarized.
    :return: Whatever `dt_phenopacket_summary` returns for that queryset.
    """
    dataset_phenopackets = Phenopacket.objects.filter(dataset=dataset)
    summary = await dt_phenopacket_summary(dataset_phenopackets, discovery=None, low_counts_censored=False)
    return summary


# TODO: CHORD-standardized logging
Expand Down
97 changes: 67 additions & 30 deletions chord_metadata_service/discovery/api_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,18 @@
from rest_framework.request import Request as DrfRequest
from rest_framework.response import Response

from chord_metadata_service.discovery.censorship import RULES_NO_PERMISSIONS
from chord_metadata_service.discovery.exceptions import DiscoveryConfigException
from chord_metadata_service.discovery.utils import get_request_discovery

from . import responses as dres
from .types import BinWithValue
from ..chord import models as cm
from ..logger import logger

from .fields import get_field_options, get_range_stats, get_categorical_stats, get_date_stats
from .model_lookups import PUBLIC_MODEL_NAMES_TO_MODEL
from .model_lookups import PUBLIC_MODEL_NAMES_TO_MODEL, PUBLIC_MODEL_NAMES_TO_SCOPE_FILTERS
from .schemas import DISCOVERY_SCHEMA


@extend_schema(
Expand All @@ -33,31 +38,30 @@
)
@api_view(["GET"])
@permission_classes([AllowAny])
async def public_search_fields(_request: DrfRequest):
async def public_search_fields(request: DrfRequest):
"""
get:
Return public search fields with their configuration
"""

# TODO: should be project-scoped

config_public = settings.CONFIG_PUBLIC
try:
discovery = await get_request_discovery(request)
except DiscoveryConfigException as e:
return Response(e.message, status=status.HTTP_404_NOT_FOUND)

if not config_public:
if not discovery:
return Response(dres.NO_PUBLIC_FIELDS_CONFIGURED, status=status.HTTP_404_NOT_FOUND)

field_conf = config_public["fields"]

# Note: the array is wrapped in a dictionary structure to help with JSON
# processing by some services.

async def _get_field_response(field) -> dict | None:
field_props = field_conf[field]
field_props = discovery.get("fields", {}).get(field, {})

return {
**field_props,
"id": field,
"options": await get_field_options(field_props, low_counts_censored=True),
"options": await get_field_options(field, discovery=discovery, low_counts_censored=True),
}

async def _get_section_response(section) -> dict:
Expand All @@ -67,7 +71,7 @@ async def _get_section_response(section) -> dict:
}

return Response({
"sections": await asyncio.gather(*map(_get_section_response, config_public["search"])),
"sections": await asyncio.gather(*map(_get_section_response, discovery["search"])),
})


Expand All @@ -90,25 +94,43 @@ async def _counts_for_model_name(mn: str) -> tuple[str, int]:
)
@api_view(["GET"]) # Don't use BentoAllowAny, we want to be more careful of cases here.
@permission_classes([AllowAny])
async def public_overview(_request: DrfRequest):
async def public_overview(request: DrfRequest):
"""
get:
Overview of all public data in the database
"""

config_public = settings.CONFIG_PUBLIC
try:
discovery = await get_request_discovery(request)
except DiscoveryConfigException as e:
return Response(e.message, status=status.HTTP_404_NOT_FOUND)
dataset_id = request.query_params.get("dataset", None)
project_id = request.query_params.get("project", None)

if not config_public:
if not discovery:
return Response(dres.NO_PUBLIC_DATA_AVAILABLE, status=status.HTTP_404_NOT_FOUND)

# TODO: public overviews SHOULD be project-scoped at least.
async def _counts_for_scoped_model_name(mn: str) -> tuple[str, int]:
if dataset_id:
scope = "dataset"
value = dataset_id
elif project_id and not dataset_id:
scope = "project"
value = project_id
elif not project_id and not dataset_id:
return await _counts_for_model_name(mn)
filter_query = PUBLIC_MODEL_NAMES_TO_SCOPE_FILTERS[mn][scope]["filter"]
prefetch = PUBLIC_MODEL_NAMES_TO_SCOPE_FILTERS[mn][scope]["prefetch_related"]
return mn, await PUBLIC_MODEL_NAMES_TO_MODEL[mn].objects.prefetch_related(*prefetch).filter(
**{filter_query: value}
).acount()

# Predefined counts
counts = dict(await asyncio.gather(*map(_counts_for_model_name, PUBLIC_MODEL_NAMES_TO_MODEL)))
counts = dict(await asyncio.gather(*map(_counts_for_scoped_model_name, PUBLIC_MODEL_NAMES_TO_MODEL)))

# Get the rules config - because we used get_config_public_and_field_set_permissions with no arguments, it'll choose
# these values based on if we have access to ALL public fields or not.
rules_config = config_public["rules"]
rules_config = discovery["rules"]
count_threshold = rules_config["count_threshold"]

# Set counts to 0 if they're under the count threshold, and we don't have full data access permissions for the
Expand All @@ -119,42 +141,40 @@ async def public_overview(_request: DrfRequest):
counts[public_model_name] = 0

response = {
"layout": config_public["overview"],
"layout": discovery["overview"],
"fields": {},
"counts": {
"individuals": counts["individual"],
"biosamples": counts["biosample"],
"experiments": counts["experiment"],
},
# TODO: remove these in favour of public_rules endpoint
"max_query_parameters": rules_config["max_query_parameters"],
"count_threshold": count_threshold,
}

# Parse the public config to gather data for each field defined in the overview

fields = [chart["field"] for section in config_public["overview"] for chart in section["charts"]]
field_conf = config_public["fields"]
fields = [chart["field"] for section in discovery["overview"] for chart in section["charts"]]
field_conf = discovery["fields"]

async def _get_field_response(field_id: str, field_props: dict) -> dict:
async def _get_field_response(field: str) -> dict:
field_props = field_conf.get(field, {"datatype": None})
stats: list[BinWithValue] | None
if field_props["datatype"] == "string":
stats = await get_categorical_stats(field_props, low_counts_censored=True)
stats = await get_categorical_stats(field, discovery, project_id, dataset_id, low_counts_censored=True)
elif field_props["datatype"] == "number":
stats = await get_range_stats(field_props, low_counts_censored=True)
stats = await get_range_stats(field, discovery, project_id, dataset_id, low_counts_censored=True)
elif field_props["datatype"] == "date":
stats = await get_date_stats(field_props, low_counts_censored=True)
stats = await get_date_stats(field, discovery, project_id, dataset_id, low_counts_censored=True)
else:
raise NotImplementedError()

return {
**field_props,
"id": field_id,
"id": field,
**({"data": stats} if stats is not None else {}),
}

# Parallel async collection of field responses for public overview
field_responses = await asyncio.gather(*(_get_field_response(field, field_conf[field]) for field in fields))
field_responses = await asyncio.gather(*(_get_field_response(field) for field in fields))

for field, field_res in zip(fields, field_responses):
response["fields"][field] = field_res
Expand Down Expand Up @@ -184,9 +204,26 @@ async def public_dataset(_request: DrfRequest):
"dimensions", "primary_publications", "citations",
"produced_by", "creators", "licenses",
"acknowledges", "keywords", "version", "dats_file",
"extra_properties", "identifier"
"extra_properties", "identifier", "discovery"
)

return Response({
"datasets": datasets
})


@api_view(["GET"])
@permission_classes([AllowAny])
async def discovery_schema(_request: DrfRequest):
    """
    get:
    Return the discovery configuration JSON schema (DISCOVERY_SCHEMA)
    """
    # The request carries no parameters of interest; the schema is a static constant.
    schema_response = Response(DISCOVERY_SCHEMA)
    return schema_response


@api_view(["GET"])
@permission_classes([AllowAny])
async def public_rules(request: DrfRequest):
    """
    get:
    Return the censorship rules of the discovery config resolved for this request
    """
    try:
        scoped_config = await get_request_discovery(request)
    except DiscoveryConfigException as exc:
        # Config resolution failed: reply with the exception's message payload.
        return Response(exc.message, status=status.HTTP_404_NOT_FOUND)

    # Without a config, or without a "rules" section in it, use the
    # no-permissions defaults.
    if scoped_config and "rules" in scoped_config:
        payload = scoped_config["rules"]
    else:
        payload = RULES_NO_PERMISSIONS

    return Response(payload, status=status.HTTP_200_OK)
18 changes: 9 additions & 9 deletions chord_metadata_service/discovery/censorship.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import sys

from django.conf import settings
from chord_metadata_service.discovery.types import DiscoveryConfig

__all__ = [
"RULES_NO_PERMISSIONS",
Expand All @@ -16,27 +16,27 @@
}


def get_threshold(discovery: DiscoveryConfig, low_counts_censored: bool) -> int:
    """
    Gets the maximum count threshold for hiding censored data (i.e., rounding to 0).

    :param discovery: The scoped discovery config; may be empty/None.
    :param low_counts_censored: Whether low counts must be censored at all.
    :return: 0 when censorship is off; otherwise the configured
             `rules.count_threshold`, falling back to the RULES_NO_PERMISSIONS
             value when the config, its "rules" section, or the key is missing.
    """
    if not low_counts_censored:
        return 0

    fallback = RULES_NO_PERMISSIONS["count_threshold"]
    if not discovery:
        return fallback

    # The discovery schema does not require "rules" (nor its keys), so a plain
    # discovery["rules"]["count_threshold"] lookup can raise KeyError; use the
    # no-permissions default instead, as the public rules endpoint does.
    rules = discovery.get("rules") or {}
    return rules.get("count_threshold", fallback)


def thresholded_count(c: int, low_counts_censored: bool) -> int:
return 0 if c <= get_threshold(low_counts_censored) else c
def thresholded_count(c: int, discovery: DiscoveryConfig, low_counts_censored: bool) -> int:
    """
    Censor a count: return 0 when `c` is at or below the threshold given by
    `get_threshold(discovery, low_counts_censored)`, otherwise return `c` unchanged.
    """
    threshold = get_threshold(discovery, low_counts_censored)
    if c <= threshold:
        return 0
    return c


def get_max_query_parameters(discovery: DiscoveryConfig, low_counts_censored: bool) -> int:
    """
    Gets the maximum number of query parameters allowed for censored discovery.

    :param discovery: The scoped discovery config; may be empty/None.
    :param low_counts_censored: Whether censorship applies to this query.
    :return: `sys.maxsize` when censorship is off; otherwise the configured
             `rules.max_query_parameters`, falling back to the
             RULES_NO_PERMISSIONS value when the config, its "rules" section,
             or the key is missing.
    """
    if not low_counts_censored:
        return sys.maxsize

    fallback = RULES_NO_PERMISSIONS["max_query_parameters"]
    if not discovery:
        return fallback

    # The discovery schema does not require "rules" (nor its keys), so a plain
    # discovery["rules"]["max_query_parameters"] lookup can raise KeyError; use
    # the no-permissions default instead, as the public rules endpoint does.
    rules = discovery.get("rules") or {}
    return rules.get("max_query_parameters", fallback)
21 changes: 21 additions & 0 deletions chord_metadata_service/discovery/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
__all__ = [
"DiscoveryConfigException"
]


class DiscoveryConfigException(Exception):

def __init__(self, dataset_id: str | None = None, project_id: str | None = None, *args: object) -> None:
self.dataset_id = dataset_id
self.project_id = project_id

message = "Error retrieving {0} scoped discovery config: {0} {1} does not exist."
if dataset_id and project_id:
message = message.format("project-dataset", f"({project_id}, {dataset_id}) pair")
elif dataset_id:
message = message.format("dataset", dataset_id)
elif project_id:
message = message.format("project", project_id)
self.message = {"message": message}

super().__init__(*args)
Loading

0 comments on commit 5848709

Please sign in to comment.