Skip to content

Commit

Permalink
feat(JS-2477): recognize x-rechnung PDF
Browse files Browse the repository at this point in the history
  • Loading branch information
welschsn committed Aug 19, 2024
1 parent e7d28d8 commit b23f587
Show file tree
Hide file tree
Showing 7 changed files with 659 additions and 59 deletions.
30 changes: 29 additions & 1 deletion src/main/java/org/jadice/filetype/matchers/PDFMatcher.java
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
package org.jadice.filetype.matchers;

import static org.jadice.filetype.matchers.XMLMatcher.X_RECHNUNG_KEY;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.Map.Entry;

Expand Down Expand Up @@ -32,6 +35,7 @@
import org.apache.pdfbox.text.PDFTextStripper;
import org.jadice.filetype.Context;
import org.jadice.filetype.database.MimeTypeAction;
import org.jadice.filetype.io.MemoryInputStream;
import org.jadice.filetype.io.SeekableInputStream;
import org.jadice.filetype.pdfutil.PDFBoxSignatureUtil;
import org.slf4j.Logger;
Expand Down Expand Up @@ -85,7 +89,7 @@ public boolean matches(final Context context) {
try (PDDocument document = PDDocument.load(sis)) {
context.setProperty(MimeTypeAction.KEY, PDF_MIME_TYPE);

Map<String, Object> pdfDetails = new HashMap<String, Object>();
Map<String, Object> pdfDetails = new HashMap<>();
context.setProperty(DETAILS_KEY, pdfDetails);

pdfDetails.put(NUMBER_OF_PAGES_KEY, Integer.valueOf(document.getNumberOfPages()));
Expand All @@ -101,6 +105,7 @@ public boolean matches(final Context context) {
PDMetadata meta = catalog.getMetadata();
if (null != meta) {
provideXMPMetadata(pdfDetails, meta);
checkIfXRechnung(pdfDetails);
}

PDEncryption encryption = document.getEncryption();
Expand Down Expand Up @@ -273,6 +278,29 @@ private static void addTextInfo(final Map<String, Object> pdfDetails, final PDDo
}
}

/**
 * Runs the metadata XML stored in the PDF details through an {@link XMLMatcher} and, when the
 * matcher accepts it, records the X-Rechnung flag in the same details map.
 *
 * <p>Nothing happens when no metadata is stored under {@code METADATA_KEY} or when the stored
 * value is not a {@link String}. An {@link IOException} raised while matching is logged and
 * otherwise ignored, so the details map is left without the flag in that case.
 *
 * <p>NOTE(review): the flag is set whenever {@code XMLMatcher.matches} returns {@code true};
 * presumably that is only the case for electronic-invoice metadata - confirm against XMLMatcher.
 *
 * @param pdfDetails the map of PDF details with the metadata XML; receives the
 *          {@code X_RECHNUNG_KEY} entry on a successful match
 */
private static void checkIfXRechnung(final Map<String, Object> pdfDetails) {
  final Object xmlCandidate = pdfDetails.get(METADATA_KEY);
  if (!(xmlCandidate instanceof String)) {
    // Only textual metadata can be handed to the XML matcher.
    return;
  }

  try {
    final XMLMatcher matcher = new XMLMatcher();
    // The metadata is matched in isolation: own in-memory stream, own (empty) result map.
    final byte[] xmlBytes = ((String) xmlCandidate).getBytes(StandardCharsets.UTF_8);
    final Context metadataContext = new Context(new MemoryInputStream(xmlBytes), new HashMap<>(), null,
        Locale.ENGLISH, "");
    if (matcher.matches(metadataContext)) {
      pdfDetails.put(X_RECHNUNG_KEY, true);
    }
  } catch (IOException e) {
    LOGGER.error("Failed to parse metadata XML", e);
  }
}

/**
* Reads the whole stream to determine the length of it.
*
Expand Down
176 changes: 118 additions & 58 deletions src/test/java/TestVariousTypes.java
Original file line number Diff line number Diff line change
@@ -1,58 +1,118 @@
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;

import java.io.File;
import java.io.IOException;
import java.util.Map;

import org.jadice.filetype.Analyzer;
import org.jadice.filetype.AnalyzerException;
import org.jadice.filetype.database.DescriptionAction;
import org.jadice.filetype.database.ExtensionAction;
import org.jadice.filetype.database.MimeTypeAction;
import org.jadice.filetype.io.MemoryInputStream;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Smoke tests that run the {@link Analyzer} over a directory of sample files and over an empty
 * stream, checking that the basic result entries are present.
 */
class TestVariousTypes {

  private static final Logger LOGGER = LoggerFactory.getLogger(TestVariousTypes.class);

  /** Directory whose readable regular files are fed to the analyzer. */
  private static final String SAMPLE_DIRECTORY = "src/test/resources/various_types";

  /** Analyzer shared by all tests; created once in {@link #createAnalyzer()}. */
  private static Analyzer analyzer;

  /**
   * Creates the shared analyzer from the {@code /magic.xml} definition.
   *
   * @throws AnalyzerException if the analyzer cannot be created
   */
  @BeforeAll
  public static void createAnalyzer() throws AnalyzerException {
    analyzer = Analyzer.getInstance("/magic.xml");
  }

  /**
   * Analyzes every readable regular file in the sample directory and requires a MIME type and a
   * description for each of them. The extension is deliberately not asserted: it can be null.
   *
   * @throws IOException if a sample file cannot be read
   */
  @Test
  void testVariousTypes() throws IOException {
    final File[] files = new File(SAMPLE_DIRECTORY).listFiles(
        pathname -> pathname.isFile() && pathname.canRead());
    // A plain Java 'assert' is skipped unless the JVM runs with -ea; use a real assertion so a
    // missing directory is reported as a test failure instead of an NPE in the loop below.
    assertNotNull(files, SAMPLE_DIRECTORY + " is not a readable directory");

    for (final File file : files) {
      LOGGER.info("File: {}", file);
      final Map<String, Object> results = analyzer.analyze(file);
      assertNotNull(results, file + " could not be analyzed");
      assertNotNull(results.get(MimeTypeAction.KEY), "mimeType missing for " + file);
      assertNotNull(results.get(DescriptionAction.KEY), "description missing for " + file);
      for (final Map.Entry<String, Object> e : results.entrySet()) {
        LOGGER.info(" {}={}", e.getKey(), e.getValue());
      }
      LOGGER.info("\n-------------------");
    }
  }

  /**
   * Analyzes an empty in-memory stream and pins the MIME type, extension and description that
   * the analyzer reports for it.
   *
   * @throws Exception if the analysis fails unexpectedly
   */
  @Test
  void testEmptyStream() throws Exception {
    Map<String, Object> results = analyzer.analyze(new MemoryInputStream(new byte[0]));
    assertNotNull(results, "empty stream could not be analyzed");
    assertEquals("text/plain", results.get(MimeTypeAction.KEY));
    assertEquals("txt", results.get(ExtensionAction.KEY));
    assertEquals("Binary data, ASCII Text Document", results.get(DescriptionAction.KEY));
  }
}
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import static org.junit.jupiter.params.provider.Arguments.arguments;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Map;
import java.util.stream.Stream;

import org.jadice.filetype.Analyzer;
import org.jadice.filetype.AnalyzerException;
import org.jadice.filetype.database.DescriptionAction;
import org.jadice.filetype.database.ExtensionAction;
import org.jadice.filetype.database.MimeTypeAction;
import org.jadice.filetype.io.MemoryInputStream;
import org.jadice.filetype.matchers.PDFMatcher;
import org.jadice.filetype.matchers.XMLMatcher;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tests that run the {@link Analyzer} over sample files, an empty stream, and a set of
 * X-Rechnung samples (PDF and XML) for which the X-Rechnung detail flag must be set.
 */
class TestVariousTypes {

  private static final Logger LOGGER = LoggerFactory.getLogger(TestVariousTypes.class);

  /** Directory whose readable regular files are fed to the analyzer. */
  private static final String SAMPLE_DIRECTORY = "src/test/resources/various_types";

  /** MIME type expected for the PDF invoice samples. */
  private static final String PDF_MIME_TYPE = "application/pdf";

  /** MIME type expected for the XML invoice samples. */
  private static final String X_RECHNUNG_XML_MIME_TYPE = "application/xml;charset=UTF-8;x-rechnung=true";

  /** Analyzer shared by all tests; created once in {@link #createAnalyzer()}. */
  private static Analyzer analyzer;

  /**
   * Creates the shared analyzer from the {@code /magic.xml} definition.
   *
   * @throws AnalyzerException if the analyzer cannot be created
   */
  @BeforeAll
  public static void createAnalyzer() throws AnalyzerException {
    analyzer = Analyzer.getInstance("/magic.xml");
  }

  /**
   * Analyzes every readable regular file in the sample directory and requires a MIME type and a
   * description for each of them. The extension is deliberately not asserted: it can be null.
   *
   * @throws IOException if a sample file cannot be read
   */
  @Test
  void testVariousTypes() throws IOException {
    final File[] files = new File(SAMPLE_DIRECTORY).listFiles(
        pathname -> pathname.isFile() && pathname.canRead());
    // A plain Java 'assert' is skipped unless the JVM runs with -ea; use a real assertion so a
    // missing directory is reported as a test failure instead of an NPE in the loop below.
    assertNotNull(files, SAMPLE_DIRECTORY + " is not a readable directory");

    for (final File file : files) {
      LOGGER.info("File: {}", file);
      final Map<String, Object> results = analyzer.analyze(file);
      assertNotNull(results, file + " could not be analyzed");
      assertNotNull(results.get(MimeTypeAction.KEY), "mimeType missing for " + file);
      assertNotNull(results.get(DescriptionAction.KEY), "description missing for " + file);
      printResult(results);
    }
  }

  /**
   * Analyzes an empty in-memory stream and pins the MIME type, extension and description that
   * the analyzer reports for it.
   *
   * @throws Exception if the analysis fails unexpectedly
   */
  @Test
  void testEmptyStream() throws Exception {
    Map<String, Object> results = analyzer.analyze(new MemoryInputStream(new byte[0]));
    assertNotNull(results, "empty stream could not be analyzed");
    assertEquals("text/plain", results.get(MimeTypeAction.KEY));
    assertEquals("txt", results.get(ExtensionAction.KEY));
    assertEquals("Binary data, ASCII Text Document", results.get(DescriptionAction.KEY));
  }

  /**
   * Supplies the X-Rechnung samples as pairs of classpath resource and expected MIME type.
   *
   * @return the arguments for {@link #testXRechnung(String, String)}
   */
  public static Stream<Arguments> dataProvider() {
    return Stream.of(
        arguments("/various_types/BASIC_Einfach.pdf", PDF_MIME_TYPE),
        arguments("/various_types/EN16931_Einfach.pdf", PDF_MIME_TYPE),
        arguments("/various_types/EN16931_Einfach.cii.xml", X_RECHNUNG_XML_MIME_TYPE),
        arguments("/various_types/EN16931_Einfach.ubl.xml", X_RECHNUNG_XML_MIME_TYPE),
        arguments("/various_types/ZUGFeRD-invoice_rabatte_3_abschlag_duepayableamount.xml",
            X_RECHNUNG_XML_MIME_TYPE));
  }

  /**
   * Analyzes one invoice sample and checks its MIME type, that description and extension are
   * present, and that the matcher details carry the X-Rechnung flag.
   *
   * @param resource classpath location of the sample file
   * @param expectedMimeType MIME type the analyzer must report for the sample
   * @throws Exception if the resource cannot be resolved or analyzed
   */
  @ParameterizedTest
  @MethodSource("dataProvider")
  void testXRechnung(String resource, String expectedMimeType) throws Exception {
    final URL url = getClass().getResource(resource);
    assertNotNull(url, "test resource not found: " + resource);
    final File file = new File(url.toURI());
    final Map<String, Object> results = analyzer.analyze(file);
    assertNotNull(results, file + " could not be analyzed");
    assertNotNull(results.get(MimeTypeAction.KEY), "mimeType missing");
    assertEquals(expectedMimeType, results.get(MimeTypeAction.KEY), "wrong mimeType");
    assertNotNull(results.get(DescriptionAction.KEY), "description missing");
    assertNotNull(results.get(ExtensionAction.KEY), "extension missing");
    checkForDetails(results);
    printResult(results);
  }

  /**
   * Picks the details entry that belongs to the reported MIME type and verifies the X-Rechnung
   * flag inside it; any other MIME type fails the test.
   *
   * @param results the analyzer result map
   */
  private void checkForDetails(final Map<String, Object> results) {
    final String mimeType = (String) results.get(MimeTypeAction.KEY);
    switch (mimeType) {
      case PDF_MIME_TYPE:
        ensureXRechnungIsTrue(results, PDFMatcher.DETAILS_KEY);
        break;
      case X_RECHNUNG_XML_MIME_TYPE:
        ensureXRechnungIsTrue(results, XMLMatcher.DETAILS_KEY);
        break;
      default:
        fail("unexpected mime type: " + mimeType);
    }
  }

  /**
   * Asserts that the details map stored under {@code detailsKey} exists and holds
   * {@code Boolean.TRUE} for the X-Rechnung key.
   *
   * @param results the analyzer result map
   * @param detailsKey key under which the matcher stored its details map
   */
  @SuppressWarnings("unchecked")
  private void ensureXRechnungIsTrue(final Map<String, Object> results, final String detailsKey) {
    final Object details = results.get(detailsKey);
    assertNotNull(details, "details are missing");
    assertTrue(details instanceof Map, "details should be a map but were " + details.getClass());
    // Safe after the instanceof check above; only the generic parameters are unchecked.
    final Map<String, Object> detailsMap = (Map<String, Object>) details;
    // Compare against Boolean.TRUE instead of unboxing: a missing or non-Boolean flag must
    // show up as an assertion failure, not as a NullPointerException/ClassCastException.
    assertEquals(Boolean.TRUE, detailsMap.get(XMLMatcher.X_RECHNUNG_KEY), "x_rechnung should be true");
  }

  /**
   * Logs every entry of the result map followed by a separator line.
   *
   * @param results the analyzer result map
   */
  private static void printResult(final Map<String, Object> results) {
    for (final Map.Entry<String, Object> e : results.entrySet()) {
      LOGGER.info(" {}={}", e.getKey(), e.getValue());
    }
    LOGGER.info("\n-------------------");
  }
}
Binary file not shown.
Loading

0 comments on commit b23f587

Please sign in to comment.