feat(JS-2477): recognize x-rechnung PDF

levigo · Aug 19, 2024 · b23f587 · b23f587
1 parent e7d28d8
commit b23f587
Show file tree

Hide file tree

Showing 7 changed files with 659 additions and 59 deletions.
diff --git a/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java b/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java
@@ -1,8 +1,11 @@
 package org.jadice.filetype.matchers;
 
+import static org.jadice.filetype.matchers.XMLMatcher.X_RECHNUNG_KEY;
+
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringWriter;
+import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.Map.Entry;
 
@@ -32,6 +35,7 @@
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.jadice.filetype.Context;
 import org.jadice.filetype.database.MimeTypeAction;
+import org.jadice.filetype.io.MemoryInputStream;
 import org.jadice.filetype.io.SeekableInputStream;
 import org.jadice.filetype.pdfutil.PDFBoxSignatureUtil;
 import org.slf4j.Logger;
@@ -85,7 +89,7 @@ public boolean matches(final Context context) {
       try (PDDocument document = PDDocument.load(sis)) {
         context.setProperty(MimeTypeAction.KEY, PDF_MIME_TYPE);
 
-        Map<String, Object> pdfDetails = new HashMap<String, Object>();
+        Map<String, Object> pdfDetails = new HashMap<>();
         context.setProperty(DETAILS_KEY, pdfDetails);
 
         pdfDetails.put(NUMBER_OF_PAGES_KEY, Integer.valueOf(document.getNumberOfPages()));
@@ -101,6 +105,7 @@ public boolean matches(final Context context) {
         PDMetadata meta = catalog.getMetadata();
         if (null != meta) {
           provideXMPMetadata(pdfDetails, meta);
+          checkIfXRechnung(pdfDetails);
         }
 
         PDEncryption encryption = document.getEncryption();
@@ -273,6 +278,29 @@ private static void addTextInfo(final Map<String, Object> pdfDetails, final PDDo
     }
   }
 
+  /**
+   * Marks the PDF as an electronic invoice (X-Rechnung) based on its metadata XML.
+   * <p>
+   * The entry stored under {@code METADATA_KEY} is read from {@code pdfDetails}. If it is a {@link String}, its
+   * UTF-8 bytes are handed to a fresh {@link XMLMatcher} through a throw-away {@link Context}. When the matcher
+   * reports a match, {@code X_RECHNUNG_KEY} is set to {@code true}; otherwise the map is left untouched. An
+   * {@link IOException} is logged and not propagated.
+   *
+   * @param pdfDetails the map of PDF details with the metadata XML
+   */
+  private static void checkIfXRechnung(final Map<String, Object> pdfDetails) {
+    final Object xmp = pdfDetails.get(METADATA_KEY);
+    if (!(xmp instanceof String)) {
+      // no textual metadata available, nothing to inspect
+      return;
+    }
+
+    final byte[] xmpBytes = ((String) xmp).getBytes(StandardCharsets.UTF_8);
+    try {
+      final XMLMatcher matcher = new XMLMatcher();
+      final Context metadataContext = new Context(
+          new MemoryInputStream(xmpBytes), new HashMap<>(), null, Locale.ENGLISH, "");
+      // NOTE(review): the flag is set whenever matches() returns true - confirm that XMLMatcher.matches() only
+      // succeeds for X-Rechnung content and not for arbitrary well-formed XML.
+      if (matcher.matches(metadataContext)) {
+        pdfDetails.put(X_RECHNUNG_KEY, true);
+      }
+    } catch (IOException e) {
+      LOGGER.error("Failed to parse metadata XML", e);
+    }
+  }
+
   /**
    * Reads the whole stream to determine the length of it.
    *

diff --git a/src/test/java/TestVariousTypes.java b/src/test/java/TestVariousTypes.java
@@ -1,58 +1,118 @@
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.Map;
-
-import org.jadice.filetype.Analyzer;
-import org.jadice.filetype.AnalyzerException;
-import org.jadice.filetype.database.DescriptionAction;
-import org.jadice.filetype.database.ExtensionAction;
-import org.jadice.filetype.database.MimeTypeAction;
-import org.jadice.filetype.io.MemoryInputStream;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-class TestVariousTypes {
-
-  private static final Logger LOGGER = LoggerFactory.getLogger(TestVariousTypes.class);
-
-	private static Analyzer analyzer;
-
-	@BeforeAll
-	public static void createAnalyzer() throws AnalyzerException {
-		analyzer = Analyzer.getInstance("/magic.xml");
-	}
-
-  @Test
-  void testVariousTypes() throws IOException {
-    final File[] files = new File("src/test/resources/various_types").listFiles(
-        pathname -> pathname.isFile() && pathname.canRead());
-    assert files != null;
-
-    for (final File file : files) {
-      LOGGER.info("File: " + file);
-      final Map<String, Object> results = analyzer.analyze(file);
-      assertNotNull(results, file + " could not be analyzed");
-      assertNotNull(results.get(MimeTypeAction.KEY), "mimeType missing for " + file);
-      assertNotNull(results.get(DescriptionAction.KEY), "description missing for" + file);
-      // extension can be null
-      // assertNotNull(results.get(ExtensionAction.KEY), file + " could not be analyzed");
-      for (final Map.Entry<String, Object> e : results.entrySet())
-        LOGGER.info("   " + e.getKey() + "=" + e.getValue());
-      LOGGER.info("\n-------------------");
-    }
-  }
-
-  @Test
-  void testEmptyStream() throws Exception {
-	  Map<String, Object> results = analyzer.analyze(new MemoryInputStream(new byte[0]));
-	  assertNotNull(results, "empty stream could not be analyzed");
-	  assertEquals("text/plain", results.get(MimeTypeAction.KEY));
-	  assertEquals("txt", results.get(ExtensionAction.KEY));
-	  assertEquals("Binary data, ASCII Text Document", results.get(DescriptionAction.KEY));
-  }
-}
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
+import static org.junit.jupiter.params.provider.Arguments.arguments;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.util.Map;
+import java.util.stream.Stream;
+
+import org.jadice.filetype.Analyzer;
+import org.jadice.filetype.AnalyzerException;
+import org.jadice.filetype.database.DescriptionAction;
+import org.jadice.filetype.database.ExtensionAction;
+import org.jadice.filetype.database.MimeTypeAction;
+import org.jadice.filetype.io.MemoryInputStream;
+import org.jadice.filetype.matchers.PDFMatcher;
+import org.jadice.filetype.matchers.XMLMatcher;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Runs the {@link Analyzer} configured from {@code /magic.xml} against sample files and checks the reported
+ * MIME type, description, extension and X-Rechnung details.
+ */
+class TestVariousTypes {
+
+  private static final Logger LOGGER = LoggerFactory.getLogger(TestVariousTypes.class);
+
+  /** Analyzer shared by all tests; created once in {@link #createAnalyzer()}. */
+  private static Analyzer analyzer;
+
+  /**
+   * Creates the shared analyzer from the {@code /magic.xml} database.
+   *
+   * @throws AnalyzerException if the analyzer cannot be created
+   */
+  @BeforeAll
+  public static void createAnalyzer() throws AnalyzerException {
+    analyzer = Analyzer.getInstance("/magic.xml");
+  }
+
+  /**
+   * Analyzes every readable file in {@code src/test/resources/various_types} and requires a MIME type and a
+   * description for each of them.
+   *
+   * @throws IOException if a file cannot be analyzed
+   */
+  @Test
+  void testVariousTypes() throws IOException {
+    final File[] files = new File("src/test/resources/various_types").listFiles(
+        pathname -> pathname.isFile() && pathname.canRead());
+    // listFiles yields null when the directory cannot be listed; a JUnit assertion also fires without -ea
+    assertNotNull(files, "test resource directory could not be listed");
+
+    for (final File file : files) {
+      LOGGER.info("File: {}", file);
+      final Map<String, Object> results = analyzer.analyze(file);
+      assertNotNull(results, file + " could not be analyzed");
+      assertNotNull(results.get(MimeTypeAction.KEY), "mimeType missing for " + file);
+      assertNotNull(results.get(DescriptionAction.KEY), "description missing for " + file);
+      // the extension can be null, so it is deliberately not asserted here
+      printResult(results);
+    }
+  }
+
+  /**
+   * An empty stream must be reported as a plain text document.
+   *
+   * @throws Exception if the analysis fails
+   */
+  @Test
+  void testEmptyStream() throws Exception {
+    final Map<String, Object> results = analyzer.analyze(new MemoryInputStream(new byte[0]));
+    assertNotNull(results, "empty stream could not be analyzed");
+    assertEquals("text/plain", results.get(MimeTypeAction.KEY));
+    assertEquals("txt", results.get(ExtensionAction.KEY));
+    assertEquals("Binary data, ASCII Text Document", results.get(DescriptionAction.KEY));
+  }
+
+  /**
+   * Supplies the classpath resources used by {@link #testXRechnung(String, String)} together with the MIME type
+   * expected for each of them.
+   *
+   * @return pairs of resource path and expected MIME type
+   */
+  public static Stream<Arguments> dataProvider() {
+    return Stream.of(
+        arguments("/various_types/BASIC_Einfach.pdf", "application/pdf"),
+        arguments("/various_types/EN16931_Einfach.pdf", "application/pdf"),
+        arguments("/various_types/EN16931_Einfach.cii.xml", "application/xml;charset=UTF-8;x-rechnung=true"),
+        arguments("/various_types/EN16931_Einfach.ubl.xml", "application/xml;charset=UTF-8;x-rechnung=true"),
+        arguments("/various_types/ZUGFeRD-invoice_rabatte_3_abschlag_duepayableamount.xml", "application/xml;charset=UTF-8;x-rechnung=true")
+    );
+  }
+
+  /**
+   * Analyzes one invoice sample and checks MIME type, description, extension and the X-Rechnung flag.
+   *
+   * @param resource classpath location of the sample file
+   * @param expectedMimeType MIME type the analyzer is expected to report
+   * @throws Exception if the resource cannot be resolved or analyzed
+   */
+  @ParameterizedTest
+  @MethodSource("dataProvider")
+  void testXRechnung(String resource, String expectedMimeType) throws Exception {
+    final URL url = getClass().getResource(resource);
+    assertNotNull(url, "test resource not found: " + resource);
+    final File file = new File(url.toURI());
+    final Map<String, Object> results = analyzer.analyze(file);
+    assertNotNull(results, file + " could not be analyzed");
+    assertNotNull(results.get(MimeTypeAction.KEY), "mimeType missing");
+    assertEquals(expectedMimeType, results.get(MimeTypeAction.KEY), "wrong mimeType");
+    assertNotNull(results.get(DescriptionAction.KEY), "description missing");
+    assertNotNull(results.get(ExtensionAction.KEY), "extension missing");
+    checkForDetails(results);
+    printResult(results);
+  }
+
+  /**
+   * Picks the details map that belongs to the reported MIME type and verifies its X-Rechnung flag. Any MIME type
+   * other than the two handled below fails the test.
+   *
+   * @param results the analysis results; the MIME type entry must already be present
+   */
+  private void checkForDetails(final Map<String, Object> results) {
+    final String mimeType = (String) results.get(MimeTypeAction.KEY);
+    switch (mimeType) {
+      case "application/pdf":
+        ensureXRechnungIsTrue(results, PDFMatcher.DETAILS_KEY);
+        break;
+      case "application/xml;charset=UTF-8;x-rechnung=true":
+        ensureXRechnungIsTrue(results, XMLMatcher.DETAILS_KEY);
+        break;
+      default:
+        fail("unexpected mime type: " + mimeType);
+    }
+  }
+
+  /**
+   * Asserts that the details map stored under {@code detailsKey} carries a {@code true} X-Rechnung flag.
+   *
+   * @param results the analysis results
+   * @param detailsKey key under which the matcher-specific details map is stored
+   */
+  @SuppressWarnings("unchecked")
+  private void ensureXRechnungIsTrue(final Map<String, Object> results, final String detailsKey) {
+    final Object details = results.get(detailsKey);
+    assertNotNull(details, "details are missing");
+    final Map<String, Object> detailsMap = (Map<String, Object>) details;
+    // compare as object: unboxing a missing flag would raise a NullPointerException instead of a clear failure
+    final Object flag = detailsMap.get(XMLMatcher.X_RECHNUNG_KEY);
+    assertNotNull(flag, "x_rechnung flag is missing");
+    assertTrue(Boolean.TRUE.equals(flag), "x_rechnung should be true");
+  }
+
+  /**
+   * Logs every entry of the result map followed by a separator line.
+   *
+   * @param results the analysis results to log
+   */
+  private static void printResult(final Map<String, Object> results) {
+    for (final Map.Entry<String, Object> e : results.entrySet()) {
+      LOGGER.info("   {}={}", e.getKey(), e.getValue());
+    }
+    LOGGER.info("\n-------------------");
+  }
+}
diff --git a/src/test/resources/various_types/BASIC_Einfach.pdf b/src/test/resources/various_types/BASIC_Einfach.pdf