Skip to content

Commit

Permalink
Merge pull request #796 from DigitalPebble/NUTCH-3025
Browse files Browse the repository at this point in the history
[NUTCH-3025] urlfilter-fast to filter based on the length of the URL
  • Loading branch information
sebastian-nagel authored Nov 8, 2023
2 parents adadc43 + 49d85ea commit 7ad382d
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 4 deletions.
24 changes: 24 additions & 0 deletions conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1882,6 +1882,30 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
compression codecs.</description>
</property>

<property>
<name>urlfilter.fast.url.max.length</name>
<value>-1</value>
<description>Filters URLs based on their overall length.
The default value of -1 means that it is deactivated.
</description>
</property>

<property>
<name>urlfilter.fast.url.path.max.length</name>
<value>-1</value>
<description>Filters URLs based on the length of their path element.
The default value of -1 means that it is deactivated.
</description>
</property>

<property>
<name>urlfilter.fast.url.query.max.length</name>
<value>-1</value>
<description>Filters URLs based on the length of their query element.
The default value of -1 means that it is deactivated.
</description>
</property>

<property>
<name>urlfilter.order</name>
<value></value>
Expand Down
6 changes: 6 additions & 0 deletions src/plugin/urlfilter-fast/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,9 @@ the end of the line.

The rules file is defined via the property `urlfilter.fast.file`,
the default name is `fast-urlfilter.txt`.

In addition to this, the filter checks that the lengths of the URL's path and query elements
do not exceed the values set in the properties `urlfilter.fast.url.path.max.length` and
`urlfilter.fast.url.query.max.length`, if set. The overall length of the URL can also be used for
filtering through the property `urlfilter.fast.url.max.length`.

Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@
*
* The rules file is defined via the property <code>urlfilter.fast.file</code>,
* the default name is <code>fast-urlfilter.txt</code>.
*
* In addition, it can filter based on the length of the whole URL, its path element or
* its query element. See <code>urlfilter.fast.url.*</code> configurations.
*/
public class FastURLFilter implements URLFilter {

Expand All @@ -103,21 +106,45 @@ public class FastURLFilter implements URLFilter {

private Configuration conf;
public static final String URLFILTER_FAST_FILE = "urlfilter.fast.file";
public static final String URLFILTER_FAST_MAX_LENGTH = "urlfilter.fast.url.max.length";
public static final String URLFILTER_FAST_PATH_MAX_LENGTH = "urlfilter.fast.url.path.max.length";
public static final String URLFILTER_FAST_QUERY_MAX_LENGTH = "urlfilter.fast.url.query.max.length";

private Multimap<String, Rule> hostRules = LinkedHashMultimap.create();
private Multimap<String, Rule> domainRules = LinkedHashMultimap.create();

/** Max allowed size of the path of a URL **/
private int maxLengthPath = -1;
/** Max allowed size of the query of a URL **/
private int maxLengthQuery = -1;
/** Max allowed size for the whole URL **/
private int maxLength = -1;

private static final Pattern CATCH_ALL_RULE = Pattern
.compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$");

/**
 * Creates a filter without rules or length limits; both are loaded
 * from the configuration when {@link #setConf(Configuration)} is called.
 */
public FastURLFilter() {}

/**
 * Used by the tests so that the rules file doesn't have to be in the jar.
 * The length limits keep their default of -1, so none of the length-based
 * checks are applied by a filter created this way.
 *
 * @param rules reader providing the filter rules
 * @throws IOException propagated from the rule loading
 * @throws PatternSyntaxException propagated from the rule loading
 */
FastURLFilter(Reader rules) throws IOException, PatternSyntaxException {
reloadRules(rules);
}

/**
 * Used by the tests so that the rules file doesn't have to be in the jar AND
 * we can set the conf for the length-based filtering.
 *
 * @param rules reader providing the filter rules
 * @param conf configuration from which the three
 *          <code>urlfilter.fast.url.*</code> length limits are read (each
 *          defaults to -1, i.e. deactivated)
 * @throws IOException propagated from the rule loading
 * @throws PatternSyntaxException propagated from the rule loading
 */
FastURLFilter(Reader rules, Configuration conf) throws IOException, PatternSyntaxException {
  // Keep the configuration, as setConf() does, so that the instance is not
  // left with a null conf (reloadRules() dereferences it).
  this.conf = conf;
  maxLengthPath = conf.getInt(URLFILTER_FAST_PATH_MAX_LENGTH, -1);
  maxLengthQuery = conf.getInt(URLFILTER_FAST_QUERY_MAX_LENGTH, -1);
  maxLength = conf.getInt(URLFILTER_FAST_MAX_LENGTH, -1);
  reloadRules(rules);
}

@Override
public void setConf(Configuration conf) {
this.conf = conf;
maxLengthPath = conf.getInt(URLFILTER_FAST_PATH_MAX_LENGTH, -1);
maxLengthQuery = conf.getInt(URLFILTER_FAST_QUERY_MAX_LENGTH, -1);
maxLength = conf.getInt(URLFILTER_FAST_MAX_LENGTH, -1);
try {
reloadRules();
} catch (Exception e) {
Expand All @@ -134,6 +161,12 @@ public Configuration getConf() {
@Override
public String filter(String url) {

if (maxLength != -1 && url.length() > maxLength) {
LOG.debug("Rejected {} because URL length ({}) greater than limit {}", url,
url.length(), maxLength);
return null;
}

URL u;

try {
Expand All @@ -143,6 +176,22 @@ public String filter(String url) {
e.getMessage());
return null;
}

final String path = u.getPath();
if (maxLengthPath != -1 && path.length() > maxLengthPath)
{
LOG.debug("Rejected {} as path length {} is greater than {}", url,
path.length(), maxLengthPath);
return null;
}

final String query = u.getQuery();
if (maxLengthQuery != -1 && query != null && query.length() > maxLengthQuery)
{
LOG.debug("Rejected {} as query length {} is greater than {}", url,
query.length(), maxLengthQuery);
return null;
}

String hostname = u.getHost();

Expand Down Expand Up @@ -187,7 +236,6 @@ public String filter(String url) {

public void reloadRules() throws IOException {
String fileRules = conf.get(URLFILTER_FAST_FILE);

InputStream is;

Path fileRulesPath = new Path(fileRules);
Expand All @@ -200,11 +248,22 @@ public void reloadRules() throws IOException {

CompressionCodec codec = new CompressionCodecFactory(conf)
.getCodec(fileRulesPath);
if (codec != null) {
if (codec != null && is != null) {
is = codec.createInputStream(is);
}

reloadRules(new InputStreamReader(is));
try {
reloadRules(new InputStreamReader(is));
} catch (Exception e) {
String message = "Couldn't load the rules from " + fileRules;
LOG.error(message);
throw new IOException(message);
}
finally {
if (is != null) {
is.close();
}
}
}

private void reloadRules(Reader rules) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilter;
import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
import org.junit.Assert;
import org.junit.Test;


public class TestFastURLFilter extends RegexURLFilterBaseTest {

@Override
Expand Down Expand Up @@ -53,4 +54,39 @@ public void benchmark() {
bench(800, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
}

/**
 * With both the path and the query limited to 50 characters, a URL whose
 * path is too long and a URL whose query is too long must both be rejected.
 */
@Test
public void lengthQueryAndPath() throws Exception {
  Configuration lengthConf = new Configuration();
  lengthConf.setInt(FastURLFilter.URLFILTER_FAST_QUERY_MAX_LENGTH, 50);
  lengthConf.setInt(FastURLFilter.URLFILTER_FAST_PATH_MAX_LENGTH, 50);
  // not interested in testing rules
  URLFilter lengthFilter = new FastURLFilter(new StringReader(""), lengthConf);

  // path made of the numbers 0..49 concatenated
  StringBuilder longPath = new StringBuilder("http://nutch.apache.org/");
  int n = 0;
  while (n < 50) {
    longPath.append(n);
    n++;
  }
  Assert.assertNull(lengthFilter.filter(longPath.toString()));

  // short path, query made of the numbers 0..49 concatenated
  StringBuilder longQuery = new StringBuilder("http://nutch.apache.org/path?");
  n = 0;
  while (n < 50) {
    longQuery.append(n);
    n++;
  }
  Assert.assertNull(lengthFilter.filter(longQuery.toString()));
}

/**
 * With the overall URL length limited to 100 characters, a URL far longer
 * than that must be rejected.
 */
@Test
public void overalLengthTest() throws Exception {
  Configuration lengthConf = new Configuration();
  lengthConf.setInt(FastURLFilter.URLFILTER_FAST_MAX_LENGTH, 100);
  // not interested in testing rules
  URLFilter lengthFilter = new FastURLFilter(new StringReader(""), lengthConf);

  // base URL followed by the numbers 0..499 concatenated
  StringBuilder longUrl = new StringBuilder("http://nutch.apache.org/");
  int n = 0;
  while (n < 500) {
    longUrl.append(n);
    n++;
  }
  Assert.assertNull(lengthFilter.filter(longUrl.toString()));
}
}

0 comments on commit 7ad382d

Please sign in to comment.