From 3495472ca9f3b54867e90eeb1bdfa64be36f731e Mon Sep 17 00:00:00 2001
From: cube
Date: Tue, 15 Oct 2024 08:04:34 +0200
Subject: [PATCH] Unlock database when Injector finishes - regardless of result

---
 .gitignore                                    |  1 +
 src/java/org/apache/nutch/crawl/Injector.java | 70 ++++++++++---------
 2 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8c521aa68..9cac3379c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ lib/spotbugs-*
 ivy/dependency-check-ant/*
 .gradle*
 ivy/apache-rat-*
+.vscode
diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java
index 0d3740eb4..314cf448d 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -358,6 +358,39 @@ public Injector(Configuration conf) {
     setConf(conf);
   }
 
+  private Job prepareJob(Configuration conf, Path urlDir, Path current, Path tempCrawlDb) throws IOException {
+    Job job = Job.getInstance(conf, "Nutch Injector: " + urlDir);
+    job.setJarByClass(Injector.class);
+    job.setMapperClass(InjectMapper.class);
+    job.setReducerClass(InjectReducer.class);
+    job.setOutputFormatClass(MapFileOutputFormat.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(CrawlDatum.class);
+    job.setSpeculativeExecution(false);
+
+    // set input and output paths of the job
+    MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);
+    FileStatus[] seedFiles = urlDir.getFileSystem(conf).listStatus(urlDir);
+    int numSeedFiles = 0;
+    for (FileStatus seedFile : seedFiles) {
+      if (seedFile.isFile()) {
+        MultipleInputs.addInputPath(job, seedFile.getPath(),
+            KeyValueTextInputFormat.class);
+        numSeedFiles++;
+        LOG.info("Injecting seed URL file {}", seedFile.getPath());
+      } else {
+        LOG.warn("Skipped non-file input in {}: {}", urlDir,
+            seedFile.getPath());
+      }
+    }
+    if (numSeedFiles == 0) {
+      LOG.error("No seed files to inject found in {}", urlDir);
+      throw new IllegalStateException("No seed files found");
+    }
+    FileOutputFormat.setOutputPath(job, tempCrawlDb);
+    return job;
+  }
+
   public void inject(Path crawlDb, Path urlDir)
       throws IOException, ClassNotFoundException, InterruptedException {
     inject(crawlDb, urlDir, false, false);
@@ -400,40 +433,11 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite,
     Path tempCrawlDb = new Path(crawlDb,
         "crawldb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
 
-    // lock an existing crawldb to prevent multiple simultaneous updates
-    Path lock = CrawlDb.lock(conf, crawlDb, false);
-
     // configure job
-    Job job = Job.getInstance(conf, "Nutch Injector: " + urlDir);
-    job.setJarByClass(Injector.class);
-    job.setMapperClass(InjectMapper.class);
-    job.setReducerClass(InjectReducer.class);
-    job.setOutputFormatClass(MapFileOutputFormat.class);
-    job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(CrawlDatum.class);
-    job.setSpeculativeExecution(false);
+    Job job = prepareJob(conf, urlDir, current, tempCrawlDb);
 
-    // set input and output paths of the job
-    MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);
-    FileStatus[] seedFiles = urlDir.getFileSystem(getConf()).listStatus(urlDir);
-    int numSeedFiles = 0;
-    for (FileStatus seedFile : seedFiles) {
-      if (seedFile.isFile()) {
-        MultipleInputs.addInputPath(job, seedFile.getPath(),
-            KeyValueTextInputFormat.class);
-        numSeedFiles++;
-        LOG.info("Injecting seed URL file {}", seedFile.getPath());
-      } else {
-        LOG.warn("Skipped non-file input in {}: {}", urlDir,
-            seedFile.getPath());
-      }
-    }
-    if (numSeedFiles == 0) {
-      LOG.error("No seed files to inject found in {}", urlDir);
-      LockUtil.removeLockFile(fs, lock);
-      return;
-    }
-    FileOutputFormat.setOutputPath(job, tempCrawlDb);
+    // lock an existing crawldb to prevent multiple simultaneous updates
+    Path lock = CrawlDb.lock(conf, crawlDb, false);
 
     try {
       // run the job
@@ -487,6 +491,8 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite,
       LOG.error("Injector job failed: {}", e.getMessage());
       NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
       throw e;
+    } finally {
+      LockUtil.removeLockFile(fs, lock);
     }
   }
 