Skip to content

Commit

Permalink
Merge branch 'NUTCH-3017', closes #793
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastian-nagel committed Nov 8, 2023
2 parents 9084912 + ac383fc commit adadc43
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 5 deletions.
10 changes: 8 additions & 2 deletions conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1872,8 +1872,14 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
<property>
<name>urlfilter.fast.file</name>
<value>fast-urlfilter.txt</value>
<description>Name of file on CLASSPATH containing regular expressions
used by urlfilter-fast (FastURLFilter) plugin.</description>
<description>Name of file containing rules and regular expressions
used by urlfilter-fast (FastURLFilter) plugin. If the filename
includes a scheme (for example, hdfs://) it is loaded using the
Hadoop FileSystem implementation supporting that scheme. If the
filename does not contain a scheme, the file is loaded from
CLASSPATH. If indicated by file extension (.gz, .bz2, .zst),
the file is decompressed while reading using Hadoop-provided
compression codecs.</description>
</property>

<property>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,19 @@
import com.google.common.collect.Multimap;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.nutch.net.URLFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -115,7 +121,7 @@ public void setConf(Configuration conf) {
try {
reloadRules();
} catch (Exception e) {
LOG.error(e.getMessage());
LOG.error("Failed to load rules: {}", e.getMessage() );
throw new RuntimeException(e.getMessage(), e);
}
}
Expand Down Expand Up @@ -181,9 +187,24 @@ public String filter(String url) {

public void reloadRules() throws IOException {
String fileRules = conf.get(URLFILTER_FAST_FILE);
try (Reader reader = conf.getConfResourceAsReader(fileRules)) {
reloadRules(reader);

InputStream is;

Path fileRulesPath = new Path(fileRules);
if (fileRulesPath.toUri().getScheme() != null) {
FileSystem fs = fileRulesPath.getFileSystem(conf);
is = fs.open(fileRulesPath);
} else {
is = conf.getConfResourceAsInputStream(fileRules);
}

CompressionCodec codec = new CompressionCodecFactory(conf)
.getCodec(fileRulesPath);
if (codec != null) {
is = codec.createInputStream(is);
}

reloadRules(new InputStreamReader(is));
}

private void reloadRules(Reader rules) throws IOException {
Expand Down

0 comments on commit adadc43

Please sign in to comment.