From 79d11ea709786a7b51f12723388d17e77d5f6e0f Mon Sep 17 00:00:00 2001 From: DDDDD12138 <43703884+DDDDD12138@users.noreply.github.com> Date: Wed, 18 Dec 2024 09:08:06 +0800 Subject: [PATCH] feat: add parameters for JinaReaderTool (#11613) --- .../builtin/jina/tools/jina_reader.py | 13 +++++ .../builtin/jina/tools/jina_reader.yaml | 55 +++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/api/core/tools/provider/builtin/jina/tools/jina_reader.py b/api/core/tools/provider/builtin/jina/tools/jina_reader.py index 0dd55c65291783..756b7272248146 100644 --- a/api/core/tools/provider/builtin/jina/tools/jina_reader.py +++ b/api/core/tools/provider/builtin/jina/tools/jina_reader.py @@ -43,6 +43,13 @@ def _invoke( if wait_for_selector is not None and wait_for_selector != "": headers["X-Wait-For-Selector"] = wait_for_selector + remove_selector = tool_parameters.get("remove_selector") + if remove_selector is not None and remove_selector != "": + headers["X-Remove-Selector"] = remove_selector + + if tool_parameters.get("retain_images", False): + headers["X-Retain-Images"] = "true" + if tool_parameters.get("image_caption", False): headers["X-With-Generated-Alt"] = "true" @@ -59,6 +66,12 @@ def _invoke( if tool_parameters.get("no_cache", False): headers["X-No-Cache"] = "true" + if tool_parameters.get("with_iframe", False): + headers["X-With-Iframe"] = "true" + + if tool_parameters.get("with_shadow_dom", False): + headers["X-With-Shadow-Dom"] = "true" + max_retries = tool_parameters.get("max_retries", 3) response = ssrf_proxy.get( str(URL(self._jina_reader_endpoint + url)), diff --git a/api/core/tools/provider/builtin/jina/tools/jina_reader.yaml b/api/core/tools/provider/builtin/jina/tools/jina_reader.yaml index 589bc3433d9478..012a8c7688cb57 100644 --- a/api/core/tools/provider/builtin/jina/tools/jina_reader.yaml +++ b/api/core/tools/provider/builtin/jina/tools/jina_reader.yaml @@ -67,6 +67,33 @@ parameters: pt_BR: css selector para aguardar elementos específicos llm_description: css selector of the target element to wait for form: form + - name: remove_selector + type: string + required: false + label: + en_US: Excluded Selector + zh_Hans: 排除选择器 + pt_BR: Seletor Excluído + human_description: + en_US: css selector for remove for specific elements + zh_Hans: css 选择器用于排除特定元素 + pt_BR: seletor CSS para remover elementos específicos + llm_description: css selector of the target element to remove for + form: form + - name: retain_images + type: boolean + required: false + default: false + label: + en_US: Remove All Images + zh_Hans: 删除所有图片 + pt_BR: Remover todas as imagens + human_description: + en_US: Removes all images from the response. + zh_Hans: 从响应中删除所有图片。 + pt_BR: Remove todas as imagens da resposta. + llm_description: Remove all images + form: form - name: image_caption type: boolean required: false @@ -136,6 +163,34 @@ parameters: pt_BR: Ignorar o cache llm_description: bypass the cache form: form + - name: with_iframe + type: boolean + required: false + default: false + label: + en_US: Enable iframe extraction + zh_Hans: 启用 iframe 提取 + pt_BR: Habilitar extração de iframe + human_description: + en_US: Extract and process content of all embedded iframes in the DOM tree. + zh_Hans: 提取并处理 DOM 树中所有嵌入 iframe 的内容。 + pt_BR: Extrair e processar o conteúdo de todos os iframes incorporados na árvore DOM. + llm_description: Extract content from embedded iframes + form: form + - name: with_shadow_dom + type: boolean + required: false + default: false + label: + en_US: Enable Shadow DOM extraction + zh_Hans: 启用 Shadow DOM 提取 + pt_BR: Habilitar extração de Shadow DOM + human_description: + en_US: Traverse all Shadow DOM roots in the document and extract content. + zh_Hans: 遍历文档中所有 Shadow DOM 根并提取内容。 + pt_BR: Percorra todas as raízes do Shadow DOM no documento e extraia o conteúdo. + llm_description: Extract content from Shadow DOM roots + form: form - name: summary type: boolean required: false