Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TIKA-4330 -- Add a MetadataListFilter #2009

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,9 @@ private void handleRecursiveJson(URL url, OutputStream output) throws IOExceptio
}
JsonMetadataList.setPrettyPrinting(prettyPrint);
try (Writer writer = getOutputWriter(output, encoding)) {
JsonMetadataList.toJson(handler.getMetadataList(), writer);
List<Metadata> metadataList = handler.getMetadataList();
metadataList = config.getMetadataListFilter().filter(metadataList);
JsonMetadataList.toJson(metadataList, writer);
}
}

Expand Down
14 changes: 10 additions & 4 deletions tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import java.nio.file.Files;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.swing.Box;
Expand Down Expand Up @@ -152,9 +153,11 @@ public class TikaGUI extends JFrame implements ActionListener, HyperlinkListener
* File chooser.
*/
private final JFileChooser chooser = new JFileChooser();
private final TikaConfig tikaConfig;

public TikaGUI(Parser parser) {
public TikaGUI(Parser parser, TikaConfig tikaConfig) {
super("Apache Tika");
this.tikaConfig = tikaConfig;
setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);

addMenuBar();
Expand Down Expand Up @@ -198,8 +201,9 @@ public static void main(String[] args) throws Exception {
UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
final TikaConfig finalConfig = config;
SwingUtilities.invokeLater(() -> new TikaGUI(
new DigestingParser(new AutoDetectParser(finalConfig), new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256),
false)).setVisible(true));
new DigestingParser(new AutoDetectParser(finalConfig),
new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256),
false), finalConfig).setVisible(true));
}

private void addMenuBar() {
Expand Down Expand Up @@ -374,7 +378,9 @@ private void handleStream(InputStream input, Metadata md) throws Exception {
wrapper.parse(input, recursiveParserWrapperHandler, new Metadata(), new ParseContext());
StringWriter jsonBuffer = new StringWriter();
JsonMetadataList.setPrettyPrinting(true);
JsonMetadataList.toJson(recursiveParserWrapperHandler.getMetadataList(), jsonBuffer);
List<Metadata> metadataList = recursiveParserWrapperHandler.getMetadataList();
metadataList = tikaConfig.getMetadataListFilter().filter(metadataList);
JsonMetadataList.toJson(metadataList, jsonBuffer);
setText(json, jsonBuffer.toString());
}
layout.show(cards, "metadata");
Expand Down
11 changes: 11 additions & 0 deletions tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@
import org.apache.tika.language.translate.Translator;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.filter.NoOpFilter;
import org.apache.tika.metadata.listfilter.MetadataListFilter;
import org.apache.tika.metadata.listfilter.NoOpListFilter;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypeException;
Expand Down Expand Up @@ -104,6 +106,7 @@ public class TikaConfig {
private final EncodingDetector encodingDetector;
private final Renderer renderer;
private final MetadataFilter metadataFilter;
private final MetadataListFilter metadataListFilter;
private final AutoDetectParserConfig autoDetectParserConfig;

private static int MAX_JSON_STRING_FIELD_LENGTH = DEFAULT_MAX_JSON_STRING_FIELD_LENGTH;
Expand Down Expand Up @@ -177,6 +180,7 @@ private TikaConfig(Element element, ServiceLoader loader) throws TikaException,
this.translator = translatorLoader.loadOverall(element, mimeTypes, loader);
this.executorService = executorLoader.loadOverall(element, mimeTypes, loader);
this.metadataFilter = MetadataFilter.load(element, true);
this.metadataListFilter = MetadataListFilter.load(element, true);
this.autoDetectParserConfig = AutoDetectParserConfig.load(element);
this.serviceLoader = loader;
setMaxJsonStringFieldLength(element);
Expand Down Expand Up @@ -205,6 +209,7 @@ public TikaConfig(ClassLoader loader) throws MimeTypeException, IOException {
this.translator = getDefaultTranslator(serviceLoader);
this.executorService = getDefaultExecutorService();
this.metadataFilter = new NoOpFilter();
this.metadataListFilter = new NoOpListFilter();
this.autoDetectParserConfig = AutoDetectParserConfig.DEFAULT;
TIMES_INSTANTIATED.incrementAndGet();
}
Expand Down Expand Up @@ -251,6 +256,7 @@ public TikaConfig() throws TikaException, IOException {
this.translator = getDefaultTranslator(serviceLoader);
this.executorService = getDefaultExecutorService();
this.metadataFilter = new NoOpFilter();
this.metadataListFilter = new NoOpListFilter();
this.autoDetectParserConfig = AutoDetectParserConfig.DEFAULT;
} else {
ServiceLoader tmpServiceLoader = new ServiceLoader();
Expand Down Expand Up @@ -278,6 +284,7 @@ public TikaConfig() throws TikaException, IOException {
this.executorService =
executorLoader.loadOverall(element, mimeTypes, serviceLoader);
this.metadataFilter = MetadataFilter.load(element, true);
this.metadataListFilter = MetadataListFilter.load(element, true);
this.autoDetectParserConfig = AutoDetectParserConfig.load(element);
setMaxJsonStringFieldLength(element);
} catch (SAXException e) {
Expand Down Expand Up @@ -629,6 +636,10 @@ public MetadataFilter getMetadataFilter() {
return metadataFilter;
}

/**
 * Accessor for the metadata list filter held by this config.
 *
 * @return the current value of the {@code metadataListFilter} field
 */
public MetadataListFilter getMetadataListFilter() {
    return this.metadataListFilter;
}

/**
 * Accessor for the auto-detect parser configuration held by this config.
 *
 * @return the current value of the {@code autoDetectParserConfig} field
 */
public AutoDetectParserConfig getAutoDetectParserConfig() {
    return this.autoDetectParserConfig;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.metadata.listfilter;

import java.util.ArrayList;
import java.util.List;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;

/**
 * A {@link MetadataListFilter} that holds an ordered list of other list filters
 * and applies them one after another.
 */
public class CompositeMetadataListFilter extends MetadataListFilter {

    //no longer final to allow for no arg initialization during serialization
    private List<MetadataListFilter> filters;

    /**
     * Creates a composite with no filters; {@link #filter(List)} then returns
     * its input unchanged.
     */
    public CompositeMetadataListFilter() {
        filters = new ArrayList<>();
    }

    /**
     * Creates a composite from the given filters.
     *
     * @param filters filters to apply, in order; {@code null} is treated as empty
     */
    public CompositeMetadataListFilter(List<MetadataListFilter> filters) {
        // Copy instead of aliasing the caller's list: the internal list must stay
        // mutable for setFilters(), and must not be the same object a caller may
        // later hand back to setFilters().
        this.filters = (filters == null) ? new ArrayList<>() : new ArrayList<>(filters);
    }

    /**
     * Replaces the current filters with the given ones.
     *
     * @param filters the new filters, in the order they should be applied
     */
    public void setFilters(List<MetadataListFilter> filters) {
        // Snapshot the argument before clearing: if the caller passes the list
        // returned by getFilters(), clearing first would discard every filter.
        List<MetadataListFilter> snapshot = new ArrayList<>(filters);
        this.filters.clear();
        this.filters.addAll(snapshot);
    }

    /**
     * @return the internal list of filters (not a copy)
     */
    public List<MetadataListFilter> getFilters() {
        return filters;
    }

    /**
     * Applies each configured filter in order, passing the result of one filter
     * as the input of the next.
     *
     * @param metadataList the list to filter
     * @return the list returned by the last filter, or {@code metadataList}
     *         itself if there are no filters
     * @throws TikaException if any of the filters throws it
     */
    @Override
    public List<Metadata> filter(List<Metadata> metadataList) throws TikaException {
        for (MetadataListFilter filter : filters) {
            metadataList = filter.filter(metadataList);
        }
        return metadataList;
    }

    @Override
    public String toString() {
        return "CompositeMetadataListFilter{" + "filters=" + filters + '}';
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.metadata.listfilter;

import java.io.IOException;
import java.io.Serializable;
import java.util.List;

import org.w3c.dom.Element;

import org.apache.tika.config.ConfigBase;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.filter.MetadataFilter;

/**
 * Base class for filters that receive a whole list of {@link Metadata} objects
 * and return a (possibly different) list.
 */
public abstract class MetadataListFilter extends ConfigBase implements Serializable {
    /**
     * Loads the metadata list filter from the config element. Delegates to
     * {@code buildComposite} using the {@code metadataListFilters} /
     * {@code metadataListFilter} element names.
     *
     * @param root the config element to load from
     * @param allowMissing if {@code true}, a {@link TikaConfigException} whose message
     *                     contains "could not find metadataListFilters" is answered with
     *                     a {@link NoOpListFilter} instead of being rethrown
     * @return the loaded composite filter, or a {@link NoOpListFilter} as described above
     * @throws TikaConfigException on any other configuration problem, or on a missing
     *                             element when {@code allowMissing} is {@code false}
     * @throws IOException declared by the underlying loader
     */
    public static MetadataListFilter load(Element root, boolean allowMissing) throws TikaConfigException,
            IOException {
        try {
            // The item class must be the list-filter base type, not MetadataFilter
            // from the sibling package; the items configured here are list filters.
            return buildComposite("metadataListFilters", CompositeMetadataListFilter.class,
                    "metadataListFilter", MetadataListFilter.class, root);
        } catch (TikaConfigException e) {
            // getMessage() may be null; only a recognised "missing" message is tolerated.
            String message = e.getMessage();
            if (allowMissing && message != null &&
                    message.contains("could not find metadataListFilters")) {
                return new NoOpListFilter();
            }
            throw e;
        }
    }

    /**
     * Filters the given metadata list.
     *
     * @param metadataList the list to filter
     * @return the filtered list; implementations may return the same instance
     * @throws TikaException if filtering fails
     */
    public abstract List<Metadata> filter(List<Metadata> metadataList) throws TikaException;
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,15 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.pipes.solr.tests;
package org.apache.tika.metadata.listfilter;

import org.testcontainers.junit.jupiter.Testcontainers;
import java.util.List;

@Testcontainers(disabledWithoutDocker = true)
public class TikaPipesSolr7Test extends TikaPipesSolrTestBase {
import org.apache.tika.metadata.Metadata;

public class NoOpListFilter extends MetadataListFilter {
@Override
public boolean useZk() {
return false;
}

@Override
public String getSolrImageName() {
return "solr:7";
public List<Metadata> filter(List<Metadata> metadataList) {
return metadataList;
}
}
39 changes: 31 additions & 8 deletions tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.listfilter.MetadataListFilter;
import org.apache.tika.metadata.listfilter.NoOpListFilter;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DigestingParser;
Expand Down Expand Up @@ -400,11 +402,8 @@ private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseD
long start = System.currentTimeMillis();
String stack = getContainerStacktrace(t, parseData.getMetadataList());
//we need to apply the metadata filter after we pull out the stacktrace
MetadataFilter filter = t.getParseContext().get(MetadataFilter.class);
if (filter == null) {
filter = tikaConfig.getMetadataFilter();
}
filterMetadata(filter, parseData.getMetadataList());
filterMetadata(t, parseData.getMetadataList());
filterMetadataList(t, parseData);
ParseContext parseContext = t.getParseContext();
FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = t.getOnParseException();
EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class);
Expand Down Expand Up @@ -437,16 +436,35 @@ private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseD
}
}

private void filterMetadata(MetadataFilter metadataFilter, List<Metadata> metadataList) {
private void filterMetadata(FetchEmitTuple t, List<Metadata> metadataList) {
MetadataFilter filter = t.getParseContext().get(MetadataFilter.class);
if (filter == null) {
filter = tikaConfig.getMetadataFilter();
}
for (Metadata m : metadataList) {
try {
metadataFilter.filter(m);
filter.filter(m);
} catch (TikaException e) {
LOG.warn("failed to filter metadata", e);
}
}
}

/**
 * Applies a {@link MetadataListFilter} to the metadata list held by {@code parseData}.
 * A filter found in the tuple's parse context takes precedence; otherwise the one
 * from {@code tikaConfig} is used. Nothing is done for a {@link NoOpListFilter}.
 * A {@link TikaException} from the filter is logged as a warning and not rethrown.
 *
 * @param t the tuple whose parse context may carry a list filter
 * @param parseData holder whose metadata list is replaced by the filter's result
 */
private void filterMetadataList(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseData) {
    MetadataListFilter fromContext = t.getParseContext().get(MetadataListFilter.class);
    MetadataListFilter listFilter =
            (fromContext != null) ? fromContext : tikaConfig.getMetadataListFilter();
    if (!(listFilter instanceof NoOpListFilter)) {
        try {
            parseData.filter(listFilter);
        } catch (TikaException e) {
            LOG.warn("failed to filter metadata list", e);
        }
    }
}

private Fetcher getFetcher(FetchEmitTuple t) {
try {
return fetcherManager.getFetcher(t.getFetchKey().getFetcherName());
Expand Down Expand Up @@ -830,7 +848,8 @@ private void write(STATUS status) {
}

static class MetadataListAndEmbeddedBytes {
final List<Metadata> metadataList;

List<Metadata> metadataList;
final Optional<EmbeddedDocumentBytesHandler> embeddedDocumentBytesHandler;

public MetadataListAndEmbeddedBytes(List<Metadata> metadataList,
Expand All @@ -843,6 +862,10 @@ public List<Metadata> getMetadataList() {
return metadataList;
}

/**
 * Runs the given filter over the held metadata list and stores the list it returns.
 *
 * @param filter the list filter to apply
 * @throws TikaException if the filter throws it
 */
public void filter(MetadataListFilter filter) throws TikaException {
    List<Metadata> filtered = filter.filter(this.metadataList);
    this.metadataList = filtered;
}

/**
 * Unwraps and returns the embedded document bytes handler.
 * NOTE(review): this calls {@code get()} on the optional field without a presence
 * check, so callers presumably must know a handler is present; confirm.
 *
 * @return the handler contained in the {@code embeddedDocumentBytesHandler} field
 */
public EmbeddedDocumentBytesHandler getEmbeddedDocumentBytesHandler() {
    return this.embeddedDocumentBytesHandler.get();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,28 +14,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.pipes.solr.tests;
package org.apache.tika.metadata.listfilter;

import static org.junit.jupiter.api.Assumptions.assumeTrue;
import java.util.List;

import org.junit.jupiter.api.BeforeAll;
import org.testcontainers.junit.jupiter.Testcontainers;

import org.apache.tika.utils.SystemUtils;

@Testcontainers(disabledWithoutDocker = true)
public class TikaPipesSolr6ZkTest extends TikaPipesSolr6Test {

@BeforeAll
public static void setUp() {
assumeTrue(
SystemUtils.IS_OS_UNIX && !SystemUtils.IS_OS_MAC_OSX,
"zk test only works on linux (and not mac os x)");
}
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;

public class AttachmentCountingListFilter extends MetadataListFilter {
@Override
public boolean useZk() {
return true;
public List<Metadata> filter(List<Metadata> metadataList) throws TikaException {
if (metadataList == null || metadataList.isEmpty()) {
return metadataList;
}
metadataList.get(0).set("X-TIKA:attachment_count", Integer.toString(metadataList.size() - 1));
return metadataList;
}

}
Loading