Skip to content

Commit

Permalink
Added Data Quality Metrics aspect to emit data quality metrics metada…
Browse files Browse the repository at this point in the history
…ta into Datahub
  • Loading branch information
naresh-angala committed Oct 25, 2024
1 parent e569fbc commit c95e365
Show file tree
Hide file tree
Showing 6 changed files with 186 additions and 0 deletions.
25 changes: 25 additions & 0 deletions metadata-models/src/main/pegasus/com/linkedin/DataQuality.pdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
namespace com.linkedin.dataforge.dataquality

import com.linkedin.common.ChangeAuditStamps

/**
* Data Quality aspect. This aspect is attached to the Dataset entity.
* It carries optional dataset-level quality metrics and optional schema-field-level
* quality metrics, and includes the fields of ChangeAuditStamps.
*/
@Aspect = {
"name": "dataQuality"
}
record DataQuality includes ChangeAuditStamps {

/**
* Dataset-level quality metrics, expressed as a DataQualityDimensionInfo.
* Optional.
*/
datasetDimensionInfo: optional DataQualityDimensionInfo

/**
* Schema-field-level quality metrics, expressed as an array of SchemaFieldQualityDimensionInfo.
* Optional.
*/
schemaFieldDimensionInfos: optional array[SchemaFieldQualityDimensionInfo]

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
namespace com.linkedin.dataforge.dataquality

/**
* Record to capture Dataset/Field level Data Quality Metrics:
* a list of dimension scores plus optional details about how they were produced.
*/
record DataQualityDimensionInfo {

/**
* Quality dimensions that were measured, as an array of DimensionScore.
* Required if this record is initialized.
*/
dimensions: array[DimensionScore]

/**
* Name of the tool that has been used to generate the quality metrics.
* Optional field.
*/
toolName: optional string

/**
* Number of data records that have been used to capture the quality metrics.
* Optional field.
*/
recordCount: optional long

/**
* Comment or any other information about the result.
* Optional field.
*/
note: optional string

}
46 changes: 46 additions & 0 deletions metadata-models/src/main/pegasus/com/linkedin/DimensionScore.pdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
namespace com.linkedin.dataforge.dataquality

import com.linkedin.common.Urn

/**
* Record to capture the score of a single data quality dimension.
*/
record DimensionScore {
// NOTE(review): the relationship below targets the entity type "dimensionNameType";
// that entity is not declared in the visible part of this change - confirm it is registered.
/**
* URN identifying the predefined dimension this score belongs to.
* Required field if this record is initialized.
*/
@Relationship = {
"name": "dimensionNameType",
"entityTypes": [ "dimensionNameType" ]
}
dimensionUrn: Urn

/**
* Current score for this dimension, carried as a string.
* Required field if this record is initialized.
*/
currentScore: string

/**
* Historical weighted score for this dimension, carried as a string.
* Optional field. Value will be populated, if available.
*/
historicalWeightedScore: optional string

/**
* Score type for this score: either PERCENTAGE or NUMERICAL_VALUE.
* Required field if this record is initialized.
*/
scoreType: enum ScoreType {
PERCENTAGE
NUMERICAL_VALUE
}

/**
* Free-form note for this dimension.
* Optional field.
*/
note: optional string

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
namespace com.linkedin.dataforge.dataquality

import com.linkedin.common.AuditStamp

/**
* Information about a Dimension Type: its display name, an optional description,
* and audit stamps for creation and last modification.
*/
@Aspect = {
"name": "dimensionTypeInfo"
}
record DimensionTypeInfo {

/**
* Display name of the Dimension Type.
* Indexed for search as a WORD_GRAM field with autocomplete enabled.
*/
@Searchable = {
"fieldType": "WORD_GRAM",
"enableAutocomplete": true,
"boostScore": 10.0
}
name: string

/**
* Description of the Dimension Type.
* Optional field.
*/
description: optional string

/**
* Audit stamp capturing the time and actor who created the Dimension Type.
* The time is indexed as "createdAt" and the actor as "createdBy".
*/
@Searchable = {
"/time": {
"fieldType": "DATETIME",
"fieldName": "createdAt"
},
"/actor": {
"fieldType": "URN",
"fieldName": "createdBy"
}
}
created: AuditStamp

/**
* Audit stamp capturing the time and actor who last modified the Dimension Type.
* The time is indexed as "lastModifiedAt" and the actor as "lastModifiedBy".
*/
@Searchable = {
"/time": {
"fieldType": "DATETIME",
"fieldName": "lastModifiedAt"
},
"/actor": {
"fieldType": "URN",
"fieldName": "lastModifiedBy"
}
}
lastModified: AuditStamp
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
namespace com.linkedin.dataforge.dataquality

import com.linkedin.common.ChangeAuditStamps
import com.linkedin.common.Urn

/**
* Record to capture Data Quality Metrics for a single schema field.
* Pairs a schema field URN with its DataQualityDimensionInfo and includes the
* fields of ChangeAuditStamps.
*/
record SchemaFieldQualityDimensionInfo includes ChangeAuditStamps {
/**
* URN of the schema field these metrics belong to.
* Declares an "IsAssociatedWith" relationship to the schemaField entity type.
* Required field, if this record is initialized.
*/
@Relationship = {
"name": "IsAssociatedWith",
"entityTypes": [ "schemaField" ]
}
schemaFieldURN: Urn

/**
* Quality dimension information captured for the schema field.
* Required field, if this record is initialized.
*/
schemaFieldDimensionInfo: DataQualityDimensionInfo
}
1 change: 1 addition & 0 deletions metadata-models/src/main/resources/entity-registry.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ entities:
- structuredProperties
- forms
- partitionsSummary
- dataQuality
- name: dataHubPolicy
doc: DataHub Policies represent access policies granted to users or groups on metadata operations like edit, view etc.
category: internal
Expand Down

0 comments on commit c95e365

Please sign in to comment.