From 7f4c9cade86d0af10983f5ed20a611695bfeddb1 Mon Sep 17 00:00:00 2001 From: Chris Li Date: Mon, 8 Nov 2021 14:05:48 -0800 Subject: [PATCH 1/5] Add source authentication --- docs/how-to/source-authentication.md | 42 +++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/docs/how-to/source-authentication.md b/docs/how-to/source-authentication.md index 6d7eac4..0a441f3 100644 --- a/docs/how-to/source-authentication.md +++ b/docs/how-to/source-authentication.md @@ -20,12 +20,33 @@ See [Variable](../concepts/variables.md) for more details about variables. Generally, when the source is dynamic(as above), it is recommended to start with a static value. After testing with the static URI, variables can be devised to make the URI dynamic. +Please follow [Authentication Methods](../concepts/authentication-method.md) for authentication configuration details. + ## HTTP Syntax `ms.source.uri=https://host-name/path?url-parameters` For HTTP connections, `ms.source.uri` accepts a domain or host name, optional path segments, and optional URL -parameters. All of them can be dynamic, i.e., they can contain DIL variables enclosed with double brackets `{{` and `}}`. +parameters. All of them can be dynamic, i.e., they can contain DIL variables enclosed with double brackets `{{` and `}}`. + +For basic authentication, use the following: + - `source.conn.username` + - `source.conn.password` + - `ms.authentication` + +For token based authentication: + - `ms.authentication` + +**Note**: Basic authentication can also be configured as token authentication by concatenating username and password, separated +by a colon (`:`). + +For OAuth2.0 authentication: + - `ms.authentication` + - `ms.secondary.input` + +For form based authentication: + - `ms.parameters` + - `ms.http.request.headers={"Content-Type": "application/x-www-form-urlencoded"}` ## S3 Syntax @@ -34,6 +55,10 @@ parameters. 
All of them can be dynamic, i.e., they can contain DIL variables enc S3 syntax is similar like HTTP syntax, except the `ms.source.uri` has the bucket name as part of host name, and instead of URL path, it should have optionally a prefix string. +For authentication, use the following: +- `source.conn.username=access-key` +- `source.conn.password=secret-key` + ## JDBC Syntax `ms.source.uri=jdbc:database-type://host-name:port/database-name?configurations` @@ -42,6 +67,10 @@ The database type can be `mysql` or `sqlserver`. Configurations are name value pairs, separated by `&` such as `useSSL=true&enabledTLSProtocols=TLSv1.2`. +For authentication, use the following: +- `source.conn.username` +- `source.conn.password` + ## SFTP Syntax `ms.source.uri=path` @@ -49,5 +78,16 @@ Configurations are name value pairs, separated by `&` such as `useSSL=true&enabl For SFTP, the host name is specified in `source.conn.host`, and the root path is specified in `ms.source.uri`. +For authentication, use the following: +- `source.conn.username` +- `source.conn.password` + +or use the following if private key authentication is required: +- `source.conn.private.key` + +## Variables + +To make any part of the source URI dynamic, add [variables](../concepts/variables.md) as needed. At runtime, +variables will be replaced with actual values, hence the source URI can get different values in different work units. 
[Back to Summary](summary.md#config-source-and-authentication) \ No newline at end of file From 1b839eef14b75027055a9439d30787219f19e65b Mon Sep 17 00:00:00 2001 From: Chris Li Date: Mon, 8 Nov 2021 14:53:09 -0800 Subject: [PATCH 2/5] Add job.commit.policy to monitored parameter list --- .../linkedin/cdi/configuration/PropertyCollection.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cdi-core/src/main/java/com/linkedin/cdi/configuration/PropertyCollection.java b/cdi-core/src/main/java/com/linkedin/cdi/configuration/PropertyCollection.java index 165e69e..a8f50e7 100644 --- a/cdi-core/src/main/java/com/linkedin/cdi/configuration/PropertyCollection.java +++ b/cdi-core/src/main/java/com/linkedin/cdi/configuration/PropertyCollection.java @@ -292,6 +292,9 @@ protected Integer getValidNonblankWithDefault(State state) { BooleanProperties MSTAGE_WORK_UNIT_PARTIAL_PARTITION = new BooleanProperties("ms.work.unit.partial.partition", Boolean.TRUE); StringProperties MSTAGE_WORK_UNIT_PARTITION = new StringProperties("ms.work.unit.partition", "none"); + StringProperties CONVERTER_AVRO_DATE_FORMAT = new StringProperties("converter.avro.date.format"); + StringProperties CONVERTER_AVRO_TIME_FORMAT = new StringProperties("converter.avro.time.format"); + StringProperties CONVERTER_AVRO_TIMESTAMP_FORMAT = new StringProperties("converter.avro.timestamp.format"); StringProperties CONVERTER_CLASSES = new StringProperties("converter.classes"); StringProperties DATA_PUBLISHER_FINAL_DIR = new StringProperties("data.publisher.final.dir"); StringProperties DATASET_URN = new StringProperties("dataset.urn"); @@ -321,6 +324,7 @@ protected String getValidNonblankWithDefault(State state) { } }; + StringProperties JOB_COMMIT_POLICY = new StringProperties("job.commit.policy"); StringProperties JOB_DIR = new StringProperties("job.dir"); StringProperties JOB_NAME = new StringProperties("job.name"); StringProperties SOURCE_CLASS = new StringProperties("source.class"); @@ -405,6 +409,9 @@ 
protected String getValidNonblankWithDefault(State state) { MSTAGE_WORK_UNIT_PARALLELISM_MAX, MSTAGE_WORK_UNIT_PARTIAL_PARTITION, MSTAGE_WORK_UNIT_PARTITION, + CONVERTER_AVRO_DATE_FORMAT, + CONVERTER_AVRO_TIME_FORMAT, + CONVERTER_AVRO_TIMESTAMP_FORMAT, CONVERTER_CLASSES, DATA_PUBLISHER_FINAL_DIR, DATASET_URN, @@ -414,6 +421,7 @@ protected String getValidNonblankWithDefault(State state) { EXTRACT_NAMESPACE, EXTRACT_TABLE_NAME, EXTRACT_TABLE_TYPE, + JOB_COMMIT_POLICY, JOB_DIR, JOB_NAME, SOURCE_CLASS, From b30deae416305aaae03d2015818fc17039d5c297 Mon Sep 17 00:00:00 2001 From: Chris Li Date: Mon, 8 Nov 2021 14:55:57 -0800 Subject: [PATCH 3/5] Add dataset.name to deprecated property list --- .../java/com/linkedin/cdi/configuration/PropertyCollection.java | 1 + 1 file changed, 1 insertion(+) diff --git a/cdi-core/src/main/java/com/linkedin/cdi/configuration/PropertyCollection.java b/cdi-core/src/main/java/com/linkedin/cdi/configuration/PropertyCollection.java index a8f50e7..7918a5d 100644 --- a/cdi-core/src/main/java/com/linkedin/cdi/configuration/PropertyCollection.java +++ b/cdi-core/src/main/java/com/linkedin/cdi/configuration/PropertyCollection.java @@ -441,6 +441,7 @@ protected String getValidNonblankWithDefault(State state) { ); Map> deprecatedProperties = new ImmutableMap.Builder>() + .put("dataset.name", EXTRACT_TABLE_NAME) .put("ms.csv.column.header", MSTAGE_CSV) .put("ms.csv.column.header.index", MSTAGE_CSV) .put("ms.csv.column.projection", MSTAGE_CSV) From ac770b936db66a38bedf7fcca9a10d3570580953 Mon Sep 17 00:00:00 2001 From: Chris Li Date: Mon, 8 Nov 2021 15:56:38 -0800 Subject: [PATCH 4/5] Add job.commit.policy description --- docs/parameters/converter.avro.date.format.md | 0 .../converter.avro.datetime.format.md | 0 docs/parameters/converter.avro.time.format.md | 0 docs/parameters/job.commit.policy.md | 21 +++++++++++++++++++ docs/parameters/summary.md | 4 ++++ 5 files changed, 25 insertions(+) create mode 100644 
docs/parameters/converter.avro.date.format.md create mode 100644 docs/parameters/converter.avro.datetime.format.md create mode 100644 docs/parameters/converter.avro.time.format.md create mode 100644 docs/parameters/job.commit.policy.md diff --git a/docs/parameters/converter.avro.date.format.md b/docs/parameters/converter.avro.date.format.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/parameters/converter.avro.datetime.format.md b/docs/parameters/converter.avro.datetime.format.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/parameters/converter.avro.time.format.md b/docs/parameters/converter.avro.time.format.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/parameters/job.commit.policy.md b/docs/parameters/job.commit.policy.md new file mode 100644 index 0000000..e1f662c --- /dev/null +++ b/docs/parameters/job.commit.policy.md @@ -0,0 +1,21 @@ +# job.commit.policy + +**Tags**: +[gobblin](categories.md#gobblin-properties) + +**Type**: string + +**Default value**: full + +**Related**: + +## Description + +`job.commit.policy` specifies how to job state will be committed when some of its tasks failed. Valid values are +"full", and "successful". + +- full: Commit output data of a job if and only if all of its tasks successfully complete. +- partial: Deprecated, the replacement is "SUCCESSFUL" +- successful: Commit output data of tasks that successfully complete. + +[back to summary](summary.md#essential-gobblin-core-properties) diff --git a/docs/parameters/summary.md b/docs/parameters/summary.md index 1731fc9..53052a8 100644 --- a/docs/parameters/summary.md +++ b/docs/parameters/summary.md @@ -467,6 +467,10 @@ for a complete list of Gobblin core properties, please refer to Gobblin document is a required parameter if the extractor is anything other than the FileDumpExtractor. Writers and some converters don't work without it. 
+## [job.commmit.policy](job.commit.policy.md) + +`job.commit.policy` specifies how to job state will be committed when some of its tasks failed. Valid values are +"full", and "successful". ## [source.class](source.class.md) ## [converter.class](converter.class.md) From 8efbb01c08509d48660f8984c78bb69024390015 Mon Sep 17 00:00:00 2001 From: Chris Li Date: Mon, 8 Nov 2021 16:14:30 -0800 Subject: [PATCH 5/5] Add converter avro date time format properties descritpion --- docs/parameters/categories.md | 4 ++++ docs/parameters/converter.avro.date.format.md | 22 +++++++++++++++++++ .../converter.avro.datetime.format.md | 0 docs/parameters/converter.avro.time.format.md | 22 +++++++++++++++++++ .../converter.avro.timestamp.format.md | 22 +++++++++++++++++++ docs/parameters/job.commit.policy.md | 6 ++--- docs/parameters/summary.md | 19 ++++++++++++++-- 7 files changed, 89 insertions(+), 6 deletions(-) delete mode 100644 docs/parameters/converter.avro.datetime.format.md create mode 100644 docs/parameters/converter.avro.timestamp.format.md diff --git a/docs/parameters/categories.md b/docs/parameters/categories.md index c2aef29..63473ec 100644 --- a/docs/parameters/categories.md +++ b/docs/parameters/categories.md @@ -131,6 +131,10 @@ The following are related to watermarks and work units: The following properties are inherited from Gobblin and enhanced with explicitly validation rules. 
+- [converter.avro.date.format](converter.avro.date.format.md) +- [converter.avro.time.format](converter.avro.time.format.md) +- [converter.avro.timestamp.format](converter.avro.timestamp.format.md) - [extract.table.name](extract.table.name.md) +- [job.commit.policy](job.commit.policy.md) - [source.class](source.class.md) - [converter.class](converter.class.md) \ No newline at end of file diff --git a/docs/parameters/converter.avro.date.format.md b/docs/parameters/converter.avro.date.format.md index e69de29..382a2d6 100644 --- a/docs/parameters/converter.avro.date.format.md +++ b/docs/parameters/converter.avro.date.format.md @@ -0,0 +1,22 @@ +# converter.avro.date.format + +**Tags**: +[gobblin](categories.md#gobblin-properties) + +**Type**: string + +**Default value**: none + +**Related**: + +## Description + +`converter.avro.date.format` indicates how date values are formatted in the user data. This property +is used by the JSON to AVRO converter in converting fields of type "date". + +This property accepts multiple formats, separated by comma (,), if date values come in with several forms. + +For example: +- `converter.avro.date.format=MM/dd/yyyy HH:mm,dd-MMM-yyyy HH:mm:ss` + +[back to summary](summary.md#essential-gobblin-core-properties) \ No newline at end of file diff --git a/docs/parameters/converter.avro.datetime.format.md b/docs/parameters/converter.avro.datetime.format.md deleted file mode 100644 index e69de29..0000000 diff --git a/docs/parameters/converter.avro.time.format.md b/docs/parameters/converter.avro.time.format.md index e69de29..bbf69cd 100644 --- a/docs/parameters/converter.avro.time.format.md +++ b/docs/parameters/converter.avro.time.format.md @@ -0,0 +1,22 @@ +# converter.avro.time.format + +**Tags**: +[gobblin](categories.md#gobblin-properties) + +**Type**: string + +**Default value**: none + +**Related**: + +## Description + +`converter.avro.time.format` indicates how time values are formatted in the user data. 
This property +is used by the JSON to AVRO converter in converting fields of type "time". + +This property accepts multiple formats, separated by comma (,), if time values come in with several forms. + +For example: +- `converter.avro.time.format=HH:mm:ss,HH:mm:ss.000'Z'` + +[back to summary](summary.md#essential-gobblin-core-properties) \ No newline at end of file diff --git a/docs/parameters/converter.avro.timestamp.format.md b/docs/parameters/converter.avro.timestamp.format.md new file mode 100644 index 0000000..41ce3ef --- /dev/null +++ b/docs/parameters/converter.avro.timestamp.format.md @@ -0,0 +1,22 @@ +# converter.avro.timestamp.format + +**Tags**: +[gobblin](categories.md#gobblin-properties) + +**Type**: string + +**Default value**: none + +**Related**: + +## Description + +`converter.avro.timestamp.format` indicates how timestamp values are formatted in the user data. This property +is used by the JSON to AVRO converter in converting fields of type "timestamp". + +This property accepts multiple formats, separated by comma (,), if timestamp values come in with several forms. + +For example: +- `converter.avro.timestamp.format=MM/dd/yyyy HH:mm,dd-MMM-yyyy HH:mm:ss` + +[back to summary](summary.md#essential-gobblin-core-properties) \ No newline at end of file diff --git a/docs/parameters/job.commit.policy.md b/docs/parameters/job.commit.policy.md index e1f662c..d919fb2 100644 --- a/docs/parameters/job.commit.policy.md +++ b/docs/parameters/job.commit.policy.md @@ -11,11 +11,9 @@ ## Description -`job.commit.policy` specifies how to job state will be committed when some of its tasks failed. Valid values are -"full", and "successful". - +`job.commit.policy` specifies how the job state will be committed when some of its tasks failed. Valid values are: - full: Commit output data of a job if and only if all of its tasks successfully complete. 
-- partial: Deprecated, the replacement is "SUCCESSFUL" - successful: Commit output data of tasks that successfully complete. +- partial: Deprecated, the replacement is "successful" [back to summary](summary.md#essential-gobblin-core-properties) diff --git a/docs/parameters/summary.md b/docs/parameters/summary.md index 53052a8..9c56210 100644 --- a/docs/parameters/summary.md +++ b/docs/parameters/summary.md @@ -461,6 +461,21 @@ a work unit. Partitioning, therefore, allows parallel processing. The following are Gobblin core properties that are essential to job configuration. This is only a short list, for a complete list of Gobblin core properties, please refer to Gobblin documentation. +## [converter.avro.date.format](converter.avro.date.format.md) + +`converter.avro.date.format` indicates how date values are formatted in the user data. This property +is used by the JSON to AVRO converter in converting fields of type "date". + +## [converter.avro.time.format](converter.avro.time.format.md) + +`converter.avro.time.format` indicates how time values are formatted in the user data. This property +is used by the JSON to AVRO converter in converting fields of type "time". + +## [converter.avro.timestamp.format](converter.avro.timestamp.format.md) + +`converter.avro.timestamp.format` indicates how timestamp values are formatted in the user data. This property +is used by the JSON to AVRO converter in converting fields of type "timestamp". + ## [extract.table.name](extract.table.name.md) `extract.table.name` specifies the target table name, not the source table name. This @@ -469,8 +484,8 @@ Writers and some converters don't work without it. ## [job.commmit.policy](job.commit.policy.md) -`job.commit.policy` specifies how to job state will be committed when some of its tasks failed. Valid values are -"full", and "successful". +`job.commit.policy` specifies how the job state will be committed when some of its tasks failed. Valid values are +"full" or "successful". 
## [source.class](source.class.md) ## [converter.class](converter.class.md)