From 66ec7a0f457a163a0a552d6f1b9447b19af40d93 Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 22 Oct 2024 07:19:19 -0400 Subject: [PATCH 1/2] TIKA-4330 -- Add a MetadataListFilter --- .../java/org/apache/tika/cli/TikaCLI.java | 4 +- .../java/org/apache/tika/gui/TikaGUI.java | 14 +++-- .../org/apache/tika/config/TikaConfig.java | 11 ++++ .../CompositeMetadataListFilter.java | 58 +++++++++++++++++++ .../listfilter/MetadataListFilter.java | 52 +++++++++++++++++ .../metadata/listfilter/NoOpListFilter.java | 28 +++++++++ .../org/apache/tika/pipes/PipesServer.java | 39 ++++++++++--- .../AttachmentCountingListFilter.java | 33 +++++++++++ .../apache/tika/pipes/PipesClientTest.java | 17 ++++++ .../test-documents/mock/embedded.xml | 53 +++++++++++++++++ .../resource/RecursiveMetadataResource.java | 6 +- 11 files changed, 300 insertions(+), 15 deletions(-) create mode 100644 tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java create mode 100644 tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java create mode 100644 tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java create mode 100644 tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java create mode 100644 tika-core/src/test/resources/test-documents/mock/embedded.xml diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 4aa5361bae..aa087910a6 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -499,7 +499,9 @@ private void handleRecursiveJson(URL url, OutputStream output) throws IOExceptio } JsonMetadataList.setPrettyPrinting(prettyPrint); try (Writer writer = getOutputWriter(output, encoding)) { - JsonMetadataList.toJson(handler.getMetadataList(), writer); + List metadataList = handler.getMetadataList(); + metadataList = 
config.getMetadataListFilter().filter(metadataList); + JsonMetadataList.toJson(metadataList, writer); } } diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java index 41ed93232e..d314b472cb 100644 --- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java +++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java @@ -38,6 +38,7 @@ import java.nio.file.Files; import java.util.Arrays; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Set; import javax.swing.Box; @@ -152,9 +153,11 @@ public class TikaGUI extends JFrame implements ActionListener, HyperlinkListener * File chooser. */ private final JFileChooser chooser = new JFileChooser(); + private final TikaConfig tikaConfig; - public TikaGUI(Parser parser) { + public TikaGUI(Parser parser, TikaConfig tikaConfig) { super("Apache Tika"); + this.tikaConfig = tikaConfig; setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); addMenuBar(); @@ -198,8 +201,9 @@ public static void main(String[] args) throws Exception { UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName()); final TikaConfig finalConfig = config; SwingUtilities.invokeLater(() -> new TikaGUI( - new DigestingParser(new AutoDetectParser(finalConfig), new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256), - false)).setVisible(true)); + new DigestingParser(new AutoDetectParser(finalConfig), + new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256), + false), finalConfig).setVisible(true)); } private void addMenuBar() { @@ -374,7 +378,9 @@ private void handleStream(InputStream input, Metadata md) throws Exception { wrapper.parse(input, recursiveParserWrapperHandler, new Metadata(), new ParseContext()); StringWriter jsonBuffer = new StringWriter(); JsonMetadataList.setPrettyPrinting(true); - 
JsonMetadataList.toJson(recursiveParserWrapperHandler.getMetadataList(), jsonBuffer); + List metadataList = recursiveParserWrapperHandler.getMetadataList(); + metadataList = tikaConfig.getMetadataListFilter().filter(metadataList); + JsonMetadataList.toJson(metadataList, jsonBuffer); setText(json, jsonBuffer.toString()); } layout.show(cards, "metadata"); diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java index e68ad10d65..63c72bfef5 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java @@ -62,6 +62,8 @@ import org.apache.tika.language.translate.Translator; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.NoOpFilter; +import org.apache.tika.metadata.listfilter.MetadataListFilter; +import org.apache.tika.metadata.listfilter.NoOpListFilter; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.mime.MimeTypeException; @@ -104,6 +106,7 @@ public class TikaConfig { private final EncodingDetector encodingDetector; private final Renderer renderer; private final MetadataFilter metadataFilter; + private final MetadataListFilter metadataListFilter; private final AutoDetectParserConfig autoDetectParserConfig; private static int MAX_JSON_STRING_FIELD_LENGTH = DEFAULT_MAX_JSON_STRING_FIELD_LENGTH; @@ -177,6 +180,7 @@ private TikaConfig(Element element, ServiceLoader loader) throws TikaException, this.translator = translatorLoader.loadOverall(element, mimeTypes, loader); this.executorService = executorLoader.loadOverall(element, mimeTypes, loader); this.metadataFilter = MetadataFilter.load(element, true); + this.metadataListFilter = MetadataListFilter.load(element, true); this.autoDetectParserConfig = AutoDetectParserConfig.load(element); this.serviceLoader = loader; setMaxJsonStringFieldLength(element); 
@@ -205,6 +209,7 @@ public TikaConfig(ClassLoader loader) throws MimeTypeException, IOException { this.translator = getDefaultTranslator(serviceLoader); this.executorService = getDefaultExecutorService(); this.metadataFilter = new NoOpFilter(); + this.metadataListFilter = new NoOpListFilter(); this.autoDetectParserConfig = AutoDetectParserConfig.DEFAULT; TIMES_INSTANTIATED.incrementAndGet(); } @@ -251,6 +256,7 @@ public TikaConfig() throws TikaException, IOException { this.translator = getDefaultTranslator(serviceLoader); this.executorService = getDefaultExecutorService(); this.metadataFilter = new NoOpFilter(); + this.metadataListFilter = new NoOpListFilter(); this.autoDetectParserConfig = AutoDetectParserConfig.DEFAULT; } else { ServiceLoader tmpServiceLoader = new ServiceLoader(); @@ -278,6 +284,7 @@ public TikaConfig() throws TikaException, IOException { this.executorService = executorLoader.loadOverall(element, mimeTypes, serviceLoader); this.metadataFilter = MetadataFilter.load(element, true); + this.metadataListFilter = MetadataListFilter.load(element, true); this.autoDetectParserConfig = AutoDetectParserConfig.load(element); setMaxJsonStringFieldLength(element); } catch (SAXException e) { @@ -629,6 +636,10 @@ public MetadataFilter getMetadataFilter() { return metadataFilter; } + public MetadataListFilter getMetadataListFilter() { + return metadataListFilter; + } + public AutoDetectParserConfig getAutoDetectParserConfig() { return autoDetectParserConfig; } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java new file mode 100644 index 0000000000..cede25bd52 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.listfilter;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+/** Applies a sequence of {@link MetadataListFilter}s in order, passing each filter the list returned by the previous one. */
+public class CompositeMetadataListFilter extends MetadataListFilter {
+    //no longer final to allow for no arg initialization during serialization
+    private List<MetadataListFilter> filters;
+
+    public CompositeMetadataListFilter() {
+        filters = new ArrayList<>();
+    }
+    public CompositeMetadataListFilter(List<MetadataListFilter> filters) {
+        this.filters = new ArrayList<>(filters); //copy: never alias, or rely on the mutability of, the caller's list (e.g. List.of)
+    }
+
+    public void setFilters(List<MetadataListFilter> filters) {
+        //copy before replacing so that setFilters(getFilters()) -- the very same list -- does not wipe the filters
+        this.filters = new ArrayList<>(filters);
+    }
+
+    public List<MetadataListFilter> getFilters() {
+        return filters;
+    }
+
+    @Override
+    public List<Metadata> filter(List<Metadata> metadataList) throws TikaException {
+        for (MetadataListFilter filter : filters) {
+            metadataList = filter.filter(metadataList);
+        }
+        return metadataList;
+    }
+
+    @Override
+    public String toString() {
+        return "CompositeMetadataListFilter{" + "filters=" + filters + '}';
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java
new file mode 100644
index 0000000000..93021da7c3
--- /dev/null
+++
b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.listfilter;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.List;
+
+import org.w3c.dom.Element;
+
+import org.apache.tika.config.ConfigBase;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+public abstract class MetadataListFilter extends ConfigBase implements Serializable {
+    /**
+     * Loads the metadata list filter from the config file if it exists, otherwise returns a {@link NoOpListFilter}
+     * @param root config element that is searched for a <code>metadataListFilters</code> element
+     * @param allowMissing if true, a missing <code>metadataListFilters</code> element yields a {@link NoOpListFilter}
+     * @return the composite built from the configured filters, or a {@link NoOpListFilter} (see allowMissing)
+     * @throws TikaConfigException if loading fails, or if the element is missing and allowMissing is false
+     * @throws IOException propagated from loading the configuration
+     */
+    public static MetadataListFilter load(Element root, boolean allowMissing) throws TikaConfigException, IOException {
+        try {
+            return buildComposite("metadataListFilters", CompositeMetadataListFilter.class,
+                    "metadataListFilter", MetadataListFilter.class, root);
+        } catch (TikaConfigException e) {
+            if (allowMissing && e.getMessage().contains("could not find metadataListFilters")) {
+                return new NoOpListFilter();
+            }
+            throw e;
+        }
+    }
+    /** Filters the whole metadata list of one parse; callers must use the returned list, not the argument. */
+    public abstract List<Metadata> filter(List<Metadata> metadataList) throws TikaException;
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java
new file mode 100644
index 0000000000..68654e4f2c
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.tika.metadata.listfilter; + +import java.util.List; + +import org.apache.tika.metadata.Metadata; + +public class NoOpListFilter extends MetadataListFilter { + @Override + public List filter(List metadataList) { + return metadataList; + } +} diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java index dffb7c9ce2..e339b619fe 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java @@ -58,6 +58,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.filter.MetadataFilter; +import org.apache.tika.metadata.listfilter.MetadataListFilter; +import org.apache.tika.metadata.listfilter.NoOpListFilter; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.DigestingParser; @@ -400,11 +402,8 @@ private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseD long start = System.currentTimeMillis(); String stack = getContainerStacktrace(t, parseData.getMetadataList()); //we need to apply the metadata filter after we pull out the stacktrace - MetadataFilter filter = t.getParseContext().get(MetadataFilter.class); - if (filter == null) { - filter = tikaConfig.getMetadataFilter(); - } - filterMetadata(filter, parseData.getMetadataList()); + filterMetadata(t, parseData.getMetadataList()); + filterMetadataList(t, parseData); ParseContext parseContext = t.getParseContext(); FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = t.getOnParseException(); EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class); @@ -437,16 +436,35 @@ private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseD } } - private void filterMetadata(MetadataFilter metadataFilter, List 
metadataList) { + private void filterMetadata(FetchEmitTuple t, List metadataList) { + MetadataFilter filter = t.getParseContext().get(MetadataFilter.class); + if (filter == null) { + filter = tikaConfig.getMetadataFilter(); + } for (Metadata m : metadataList) { try { - metadataFilter.filter(m); + filter.filter(m); } catch (TikaException e) { LOG.warn("failed to filter metadata", e); } } } + private void filterMetadataList(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseData) { + MetadataListFilter filter = t.getParseContext().get(MetadataListFilter.class); + if (filter == null) { + filter = tikaConfig.getMetadataListFilter(); + } + if (filter instanceof NoOpListFilter) { + return; + } + try { + parseData.filter(filter); + } catch (TikaException e) { + LOG.warn("failed to filter metadata list", e); + } + } + private Fetcher getFetcher(FetchEmitTuple t) { try { return fetcherManager.getFetcher(t.getFetchKey().getFetcherName()); @@ -830,7 +848,8 @@ private void write(STATUS status) { } static class MetadataListAndEmbeddedBytes { - final List metadataList; + + List metadataList; final Optional embeddedDocumentBytesHandler; public MetadataListAndEmbeddedBytes(List metadataList, @@ -843,6 +862,10 @@ public List getMetadataList() { return metadataList; } + public void filter(MetadataListFilter filter) throws TikaException { + metadataList = filter.filter(metadataList); + } + public EmbeddedDocumentBytesHandler getEmbeddedDocumentBytesHandler() { return embeddedDocumentBytesHandler.get(); } diff --git a/tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java new file mode 100644 index 0000000000..daa68c9280 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata.listfilter; + +import java.util.List; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; + +public class AttachmentCountingListFilter extends MetadataListFilter { + @Override + public List filter(List metadataList) throws TikaException { + if (metadataList == null || metadataList.isEmpty()) { + return metadataList; + } + metadataList.get(0).set("X-TIKA:attachment_count", Integer.toString(metadataList.size() - 1)); + return metadataList; + } +} diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java b/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java index 13b0dc312c..35d52fc4a6 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java @@ -32,6 +32,9 @@ import org.apache.tika.metadata.filter.CompositeMetadataFilter; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.MockUpperCaseFilter; +import org.apache.tika.metadata.listfilter.AttachmentCountingListFilter; +import org.apache.tika.metadata.listfilter.CompositeMetadataListFilter; +import org.apache.tika.metadata.listfilter.MetadataListFilter; import 
org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.emitter.EmitKey; import org.apache.tika.pipes.fetcher.FetchKey; @@ -76,4 +79,18 @@ public void testMetadataFilter() throws IOException, InterruptedException { Metadata metadata = pipesResult.getEmitData().getMetadataList().get(0); Assertions.assertEquals("TESTOVERLAPPINGTEXT.PDF", metadata.get("resourceName")); } + + @Test + public void testMetadataListFilter() throws IOException, InterruptedException { + ParseContext parseContext = new ParseContext(); + MetadataListFilter metadataFilter = new CompositeMetadataListFilter(List.of(new AttachmentCountingListFilter())); + parseContext.set(MetadataListFilter.class, metadataFilter); + PipesResult pipesResult = pipesClient.process( + new FetchEmitTuple("mock/embedded.xml", new FetchKey(fetcherName, "mock/embedded.xml"), + new EmitKey(), new Metadata(), parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP)); + Assertions.assertNotNull(pipesResult.getEmitData().getMetadataList()); + Assertions.assertEquals(5, pipesResult.getEmitData().getMetadataList().size()); + Metadata metadata = pipesResult.getEmitData().getMetadataList().get(0); + Assertions.assertEquals(4, Integer.parseInt(metadata.get("X-TIKA:attachment_count"))); + } } diff --git a/tika-core/src/test/resources/test-documents/mock/embedded.xml b/tika-core/src/test/resources/test-documents/mock/embedded.xml new file mode 100644 index 0000000000..c75c2fce6b --- /dev/null +++ b/tika-core/src/test/resources/test-documents/mock/embedded.xml @@ -0,0 +1,53 @@ + + + + + + + Nikolai Lobachevsky + main_content + + + <mock> + <metadata action="add" name="author">embeddedAuthor</metadata> + <write element="p">some_embedded_content</write> + </mock> + + + <mock> + <metadata action="add" name="author">embeddedAuthor</metadata> + <write element="p">some_embedded_content</write> + </mock> + + + <mock> + <metadata action="add" name="author">embeddedAuthor</metadata> + <write element="p">some_embedded_content</write> + 
</mock> + + + <mock> + <metadata action="add" name="author">embeddedAuthor</metadata> + <write element="p">some_embedded_content</write> + </mock> + + + \ No newline at end of file diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java index a180ddfba1..8b26a672a2 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java @@ -19,6 +19,7 @@ import static org.apache.tika.server.core.resource.TikaResource.fillMetadata; import static org.apache.tika.server.core.resource.TikaResource.fillParseContext; +import static org.apache.tika.server.core.resource.TikaResource.getConfig; import java.io.InputStream; import java.util.List; @@ -40,6 +41,7 @@ import org.slf4j.LoggerFactory; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.listfilter.MetadataListFilter; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; @@ -84,8 +86,8 @@ public static List parseMetadata(InputStream is, Metadata metadata, Mu //we shouldn't get here? 
LOG.error("something went seriously wrong", e); } - - return handler.getMetadataList(); + MetadataListFilter metadataListFilter = context.get(MetadataListFilter.class, getConfig().getMetadataListFilter()); + return metadataListFilter.filter(handler.getMetadataList()); } static HandlerConfig buildHandlerConfig(MultivaluedMap httpHeaders, String handlerTypeName, HandlerConfig.PARSE_MODE parseMode) { From c50d46140ea1194a1b9773ceb1ca2dfdd07858a2 Mon Sep 17 00:00:00 2001 From: Tim Allison Date: Tue, 22 Oct 2024 09:01:16 -0400 Subject: [PATCH 2/2] TIKA-4321 -- rm solr 6 and 7 unit tests and turn logging to error. (#1994) * TIKA-4321 -- rm solr 6 and 7 unit tests and turn logging to error. (cherry picked from commit d9dfe20b806e7e1341fefa6bb84c9361ab37e361) --- .../pipes/solr/tests/TikaPipesSolr6Test.java | 49 ------------------- .../solr/tests/TikaPipesSolr6ZkTest.java | 41 ---------------- .../pipes/solr/tests/TikaPipesSolr7Test.java | 33 ------------- .../solr/tests/TikaPipesSolr7ZkTest.java | 42 ---------------- .../pipes/solr/tests/TikaPipesSolr8Test.java | 2 +- .../pipes/solr/tests/TikaPipesSolr9Test.java | 2 +- .../src/test/resources/log4j2.xml | 4 +- tika-parent/pom.xml | 3 +- .../tika/pipes/emitter/solr/SolrEmitter.java | 19 ++++++- .../pipesiterator/solr/SolrPipesIterator.java | 19 +++++-- 10 files changed, 38 insertions(+), 176 deletions(-) delete mode 100644 tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6Test.java delete mode 100644 tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6ZkTest.java delete mode 100644 tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr7Test.java delete mode 100644 tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr7ZkTest.java diff --git 
a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6Test.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6Test.java deleted file mode 100644 index 3cb6a6d20e..0000000000 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6Test.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.pipes.solr.tests; - -import static org.junit.jupiter.api.Assumptions.assumeTrue; - -import org.junit.jupiter.api.BeforeAll; -import org.testcontainers.junit.jupiter.Testcontainers; - -import org.apache.tika.utils.SystemUtils; - -@Testcontainers(disabledWithoutDocker = true) -public class TikaPipesSolr6Test extends TikaPipesSolrTestBase { - @BeforeAll - public static void setUp() { - assumeTrue(!SystemUtils.IS_OS_MAC_OSX && !SystemUtils.OS_VERSION.equals("12.6.1"), - "This stopped working on macos x ... 
TIKA-3932"); - } - @Override - public boolean useZk() { - return false; - } - - @Override - public String getSolrImageName() { - return "solr:6"; - } - - @Override - public boolean handlesParentChild() { - //Solr 6 didn't automatically set _root_ with the parent-child indexing, - //apparently - return false; - } -} diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6ZkTest.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6ZkTest.java deleted file mode 100644 index 221d1e2777..0000000000 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr6ZkTest.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.pipes.solr.tests; - -import static org.junit.jupiter.api.Assumptions.assumeTrue; - -import org.junit.jupiter.api.BeforeAll; -import org.testcontainers.junit.jupiter.Testcontainers; - -import org.apache.tika.utils.SystemUtils; - -@Testcontainers(disabledWithoutDocker = true) -public class TikaPipesSolr6ZkTest extends TikaPipesSolr6Test { - - @BeforeAll - public static void setUp() { - assumeTrue( - SystemUtils.IS_OS_UNIX && !SystemUtils.IS_OS_MAC_OSX, - "zk test only works on linux (and not mac os x)"); - } - - @Override - public boolean useZk() { - return true; - } - -} diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr7Test.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr7Test.java deleted file mode 100644 index efefedf449..0000000000 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr7Test.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.pipes.solr.tests; - -import org.testcontainers.junit.jupiter.Testcontainers; - -@Testcontainers(disabledWithoutDocker = true) -public class TikaPipesSolr7Test extends TikaPipesSolrTestBase { - - @Override - public boolean useZk() { - return false; - } - - @Override - public String getSolrImageName() { - return "solr:7"; - } -} diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr7ZkTest.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr7ZkTest.java deleted file mode 100644 index 875afe2236..0000000000 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr7ZkTest.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.pipes.solr.tests; - -import static org.junit.jupiter.api.Assumptions.assumeTrue; - -import org.junit.jupiter.api.BeforeAll; -import org.testcontainers.junit.jupiter.Testcontainers; - -import org.apache.tika.utils.SystemUtils; - -@Testcontainers(disabledWithoutDocker = true) -public class TikaPipesSolr7ZkTest extends TikaPipesSolr7Test { - - @BeforeAll - public static void setUp() { - assumeTrue( - SystemUtils.IS_OS_UNIX && !SystemUtils.IS_OS_MAC_OSX, - "zk test only works on linux (and not mac os x)"); - } - - - @Override - public boolean useZk() { - return true; - } - -} diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8Test.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8Test.java index e04dc59fbf..8b87db6a17 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8Test.java +++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr8Test.java @@ -23,7 +23,7 @@ public class TikaPipesSolr8Test extends TikaPipesSolrTestBase { @Override public String getSolrImageName() { - return "solr:8.11.2"; + return "solr:8.11.4"; } @Override diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9Test.java b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9Test.java index d4f7b7382e..f60cdd696d 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9Test.java +++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/java/org/apache/tika/pipes/solr/tests/TikaPipesSolr9Test.java @@ -23,7 +23,7 @@ public class TikaPipesSolr9Test 
extends TikaPipesSolrTestBase { @Override public String getSolrImageName() { - return "solr:9.3"; + return "solr:9.7.0"; } @Override diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/log4j2.xml b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/log4j2.xml index c88e66e99e..4b85ea2fea 100644 --- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/log4j2.xml +++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/log4j2.xml @@ -18,14 +18,14 @@ specific language governing permissions and limitations under the License. --> - + - + diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index d88443c0a7..8838ffeac7 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -428,8 +428,7 @@ 2.0.16 1.4 1.1.10.7 - - 8.11.4 + 9.7.0 6.1.14 3.46.1.3 2.1.0 diff --git a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java index a139b9b7be..d5655ddd90 100644 --- a/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java +++ b/tika-pipes/tika-emitters/tika-emitter-solr/src/main/java/org/apache/tika/pipes/emitter/solr/SolrEmitter.java @@ -25,9 +25,11 @@ import java.util.Map; import java.util.Optional; import java.util.UUID; +import java.util.concurrent.TimeUnit; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.impl.Http2SolrClient; import org.apache.solr.client.solrj.impl.LBHttpSolrClient; import org.apache.solr.client.solrj.request.UpdateRequest; import org.apache.solr.client.solrj.response.UpdateResponse; @@ -46,6 +48,7 @@ import org.apache.tika.pipes.emitter.AbstractEmitter; import org.apache.tika.pipes.emitter.EmitData; import 
org.apache.tika.pipes.emitter.TikaEmitterException; +import org.apache.tika.utils.StringUtils; public class SolrEmitter extends AbstractEmitter implements Initializable { @@ -307,9 +310,21 @@ public void setEmbeddedFileFieldName(String embeddedFileFieldName) { @Override public void initialize(Map params) throws TikaConfigException { if (solrUrls == null || solrUrls.isEmpty()) { + //TODO -- there's more that we need to pass through, including ssl etc. + Http2SolrClient.Builder http2SolrClientBuilder = new Http2SolrClient.Builder(); + if (!StringUtils.isBlank(httpClientFactory.getUserName())) { + http2SolrClientBuilder.withBasicAuthCredentials(httpClientFactory.getUserName(), httpClientFactory.getPassword()); + } + http2SolrClientBuilder + .withRequestTimeout(httpClientFactory.getRequestTimeout(), TimeUnit.MILLISECONDS) + .withConnectionTimeout(connectionTimeout, TimeUnit.MILLISECONDS); + + + Http2SolrClient http2SolrClient = http2SolrClientBuilder.build(); solrClient = new CloudSolrClient.Builder(solrZkHosts, Optional.ofNullable(solrZkChroot)) - .withConnectionTimeout(connectionTimeout).withSocketTimeout(socketTimeout) - .withHttpClient(httpClientFactory.build()).build(); + .withHttpClient(http2SolrClient) + .build(); + } else { solrClient = new LBHttpSolrClient.Builder().withConnectionTimeout(connectionTimeout) .withSocketTimeout(socketTimeout).withHttpClient(httpClientFactory.build()) diff --git a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/src/main/java/org/apache/tika/pipes/pipesiterator/solr/SolrPipesIterator.java b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/src/main/java/org/apache/tika/pipes/pipesiterator/solr/SolrPipesIterator.java index 9ecead289b..3681cdeb9b 100644 --- a/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/src/main/java/org/apache/tika/pipes/pipesiterator/solr/SolrPipesIterator.java +++ 
b/tika-pipes/tika-pipes-iterators/tika-pipes-iterator-solr/src/main/java/org/apache/tika/pipes/pipesiterator/solr/SolrPipesIterator.java @@ -24,12 +24,14 @@ import java.util.List; import java.util.Optional; import java.util.Set; +import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.CloudSolrClient; +import org.apache.solr.client.solrj.impl.Http2SolrClient; import org.apache.solr.client.solrj.impl.LBHttpSolrClient; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocument; @@ -49,6 +51,7 @@ import org.apache.tika.pipes.emitter.EmitKey; import org.apache.tika.pipes.fetcher.FetchKey; import org.apache.tika.pipes.pipesiterator.PipesIterator; +import org.apache.tika.utils.StringUtils; /** * Iterates through results from a Solr query. @@ -232,11 +235,21 @@ protected void enqueue() throws InterruptedException, IOException, TimeoutExcept private SolrClient createSolrClient() throws TikaConfigException { if (solrUrls == null || solrUrls.isEmpty()) { + //TODO -- there's more that we need to pass through, including ssl etc. 
+ Http2SolrClient.Builder http2SolrClientBuilder = new Http2SolrClient.Builder(); + if (!StringUtils.isBlank(httpClientFactory.getUserName())) { + http2SolrClientBuilder.withBasicAuthCredentials(httpClientFactory.getUserName(), httpClientFactory.getPassword()); + } + http2SolrClientBuilder + .withRequestTimeout(httpClientFactory.getRequestTimeout(), TimeUnit.MILLISECONDS) + .withConnectionTimeout(connectionTimeout, TimeUnit.MILLISECONDS); + + + Http2SolrClient http2SolrClient = http2SolrClientBuilder.build(); return new CloudSolrClient.Builder(solrZkHosts, Optional.ofNullable(solrZkChroot)) - .withHttpClient(httpClientFactory.build()) - .withConnectionTimeout(connectionTimeout) - .withSocketTimeout(socketTimeout) + .withHttpClient(http2SolrClient) .build(); + } return new LBHttpSolrClient.Builder() .withConnectionTimeout(connectionTimeout)