From 6c1f37974d06b2a4727b8a82b1293fb2bf75517f Mon Sep 17 00:00:00 2001 From: Richard Lehane Date: Sat, 27 May 2023 12:00:45 +0200 Subject: [PATCH] implements globs for container matching see https://github.com/digital-preservation/pronom/issues/10 --- internal/containermatcher/container.go | 82 +++++++++++++++++++++----- internal/containermatcher/identify.go | 13 ++++ pkg/config/siegfried.go | 2 +- 3 files changed, 82 insertions(+), 15 deletions(-) diff --git a/internal/containermatcher/container.go b/internal/containermatcher/container.go index 345c258e8..68a8ca900 100644 --- a/internal/containermatcher/container.go +++ b/internal/containermatcher/container.go @@ -18,6 +18,8 @@ import ( "encoding/binary" "errors" "fmt" + "path/filepath" + "strings" "github.com/richardlehane/siegfried/internal/bytematcher" "github.com/richardlehane/siegfried/internal/bytematcher/frames" @@ -144,28 +146,42 @@ type ContainerMatcher struct { ctype startIndexes []int // added to hits - these place all container matches in a single slice conType containerType - nameCTest map[string]*cTest - parts []int // corresponds with each signature: represents the number of CTests for each sig + nameCTest map[string]*cTest // map of literal paths to ctests + globs []string // corresponds with globCtests + globCtests []*cTest // + parts []int // corresponds with each signature: represents the number of CTests for each sig priorities *priority.Set extension string entryBufs *siegreader.Buffers } func loadCM(ls *persist.LoadSaver) *ContainerMatcher { - return &ContainerMatcher{ + ct := &ContainerMatcher{ startIndexes: ls.LoadInts(), conType: containerType(ls.LoadTinyUInt()), nameCTest: loadCTests(ls), - parts: ls.LoadInts(), - priorities: priority.Load(ls), - extension: ls.LoadString(), + globs: ls.LoadStrings(), + } + gcts := make([]*cTest, ls.LoadSmallInt()) + for i := range gcts { + gcts[i] = loadCTest(ls) } + ct.globCtests = gcts + ct.parts = ls.LoadInts() + ct.priorities = priority.Load(ls) + 
ct.extension = ls.LoadString() + return ct } func (c *ContainerMatcher) save(ls *persist.LoadSaver) { ls.SaveInts(c.startIndexes) ls.SaveTinyUInt(int(c.conType)) saveCTests(ls, c.nameCTest) + ls.SaveStrings(c.globs) + ls.SaveSmallInt(len(c.globCtests)) + for _, v := range c.globCtests { + saveCTest(ls, v) + } ls.SaveInts(c.parts) c.priorities.Save(ls) ls.SaveString(c.extension) @@ -176,6 +192,7 @@ func (c *ContainerMatcher) String() string { str += fmt.Sprintf("Type: %d\n", c.conType) str += fmt.Sprintf("Priorities: %v\n", c.priorities) str += fmt.Sprintf("Parts: %v\n", c.parts) + str += fmt.Sprintf("%d literal tests, %d glob tests\n", len(c.nameCTest), len(c.globCtests)) for k, v := range c.nameCTest { str += "-----------\n" str += fmt.Sprintf("Name: %v\n", k) @@ -187,6 +204,17 @@ func (c *ContainerMatcher) String() string { str += "Bytematcher:\n" + v.bm.String() } } + for i, v := range c.globs { + str += "-----------\n" + str += fmt.Sprintf("Glob: %v\n", v) + str += fmt.Sprintf("Satisfied: %v\n", c.globCtests[i].satisfied) + str += fmt.Sprintf("Unsatisfied: %v\n", c.globCtests[i].unsatisfied) + if c.globCtests[i].bm == nil { + str += "Bytematcher: None\n" + } else { + str += "Bytematcher:\n" + c.globCtests[i].bm.String() + } + } return str } @@ -240,7 +268,25 @@ func (c *ContainerMatcher) addSignature(nameParts []string, sigParts []frames.Si return errors.New("container matcher: nameParts and sigParts must be equal") } c.parts = append(c.parts, len(nameParts)) +outer: for i, nm := range nameParts { + if nm != "[Content_Types].xml" && strings.ContainsAny(nm, "*?[]") { + // check that the glob pattern is valid + if _, err := filepath.Match(nm, ""); err == nil { + // do we already have this glob? 
+ for j, v := range c.globs { + if nm == v { + c.globCtests[j].add(sigParts[i], len(c.parts)-1) + continue outer + } + } + c.globs = append(c.globs, nm) + ct := &cTest{} + ct.add(sigParts[i], len(c.parts)-1) + c.globCtests = append(c.globCtests, ct) + continue + } + } ct, ok := c.nameCTest[nm] if !ok { ct = &cTest{} @@ -263,11 +309,7 @@ func loadCTests(ls *persist.LoadSaver) map[string]*cTest { ret := make(map[string]*cTest) l := ls.LoadSmallInt() for i := 0; i < l; i++ { - ret[ls.LoadString()] = &cTest{ - satisfied: ls.LoadInts(), - unsatisfied: ls.LoadInts(), - bm: bytematcher.Load(ls), - } + ret[ls.LoadString()] = loadCTest(ls) } return ret } @@ -276,12 +318,24 @@ func saveCTests(ls *persist.LoadSaver, ct map[string]*cTest) { ls.SaveSmallInt(len(ct)) for k, v := range ct { ls.SaveString(k) - ls.SaveInts(v.satisfied) - ls.SaveInts(v.unsatisfied) - bytematcher.Save(v.bm, ls) + saveCTest(ls, v) + } +} + +func loadCTest(ls *persist.LoadSaver) *cTest { + return &cTest{ + satisfied: ls.LoadInts(), + unsatisfied: ls.LoadInts(), + bm: bytematcher.Load(ls), } } +func saveCTest(ls *persist.LoadSaver, ct *cTest) { + ls.SaveInts(ct.satisfied) + ls.SaveInts(ct.unsatisfied) + bytematcher.Save(ct.bm, ls) +} + func (ct *cTest) add(s frames.Signature, t int) { if s == nil { ct.satisfied = append(ct.satisfied, t) diff --git a/internal/containermatcher/identify.go b/internal/containermatcher/identify.go index 42cb3335d..b4f13d474 100644 --- a/internal/containermatcher/identify.go +++ b/internal/containermatcher/identify.go @@ -131,9 +131,22 @@ func (c *ContainerMatcher) identify(n string, rdr Reader, res chan core.Result, } id := c.newIdentifier(len(c.parts), hints...) 
var err error +outer: for err = rdr.Next(); err == nil; err = rdr.Next() { ct, ok := c.nameCTest[rdr.Name()] if !ok { + for i, glob := range c.globs { + if m, _ := filepath.Match(glob, rdr.Name()); m { + if config.Debug() { + fmt.Fprintf(config.Out(), "{Glob match (%s) - %s (container %d))}\n", glob, rdr.Name(), c.conType) + } + // process hits returns true if we can stop, otherwise possible other globs may match + // so we keep trying remaining globs + if c.processHits(c.globCtests[i].identify(c, id, rdr, rdr.Name()), id, c.globCtests[i], rdr.Name(), res) { + break outer + } + } + } continue } if config.Debug() { diff --git a/pkg/config/siegfried.go b/pkg/config/siegfried.go index d1f7a7db8..47e233d83 100644 --- a/pkg/config/siegfried.go +++ b/pkg/config/siegfried.go @@ -47,7 +47,7 @@ var siegfried = struct { checkpoint int64 userAgent string }{ - version: [3]int{1, 10, 1}, + version: [3]int{1, 11, 0}, signature: "default.sig", conf: "sf.conf", magic: []byte{'s', 'f', 0x00, 0xFF},