diff --git a/conf/domain-suffixes.xml.template b/conf/domain-suffixes.xml.template
deleted file mode 100644
index 096309b90a..0000000000
--- a/conf/domain-suffixes.xml.template
+++ /dev/null
@@ -1,4428 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
- INFRASTRUCTURE
-
- (from http://en.wikipedia.org/wiki/.root)
- vrsn-end-of-zone-marker-dummy-record.root is a domain name
- listed in the DNS root zone as a diagnostic marker, whose
- presence demonstrates the root zone was not truncated upon
- loading by a root nameserver. It could be argued it represents
- a top-level domain of .root, although technically no such
- delegation exists.
-
-
-
-
- INFRASTRUCTURE
-
- (from http://en.wikipedia.org/wiki/.arpa) .arpa is an Internet
- top-level domain (TLD) used exclusively for
- Internet-infrastructure purposes. It does not function as a
- normal TLD where websites are registered, but rather as a
- meta-TLD used to look up addresses, and for other purposes.
-
-
-
-
-
-
-
- SPONSORED
- for the air transport industry
-
-
-
- UNSPONSORED
- for business use
-
-
-
- SPONSORED
- for Catalan language/culture
-
-
-
- UNSPONSORED
-
- for commercial organizations, but unrestricted
-
-
-
-
- SPONSORED
- for cooperatives
-
-
-
- UNSPONSORED
- 1.0
-
- for post-secondary educational establishments
-
-
-
-
- UNSPONSORED
-
- for governments and their agencies in the United States
-
-
-
-
- UNSPONSORED
-
- for informational sites, but unrestricted
-
-
-
-
- UNSPONSORED
-
- for international organizations established by treaty
-
-
-
-
- SPONSORED
- for employment-related sites
-
-
-
- UNSPONSORED
- for the US military
-
-
-
- SPONSORED
- for sites catering to mobile devices
-
-
-
- SPONSORED
- for museums
-
-
-
- UNSPONSORED
- for families and individuals
-
-
-
- UNSPONSORED
-
- originally for network infrastructures, now unrestricted
-
-
-
-
- UNSPONSORED
-
- originally for organizations not clearly falling within the
- other gTLDs, now unrestricted
-
-
-
-
- SPONSORED
- for certain professions
-
-
-
- SPONSORED
-
- for travel agents, airlines, hoteliers, tourism bureaus, etc.
-
-
-
-
-
-
- STARTUP
- for the Asian community
-
-
-
- PROPOSED
- for postal services
-
-
-
- STARTUP
-
- for services involving connections between the telephone
- network and the Internet
-
-
-
-
- PROPOSED
- for geographically related sites
-
-
-
- PROPOSED
- for Galicia, a country within Spain
-
-
-
- PROPOSED
- for Wales, a country within the UK
-
-
-
- PROPOSED
- for Scotland, a country within the UK
-
-
-
- PROPOSED
- for websites designed for children
-
-
-
- PROPOSED
- for websites designed for children
-
-
-
- PROPOSED
- http://en.wikipedia.org/wiki/.mail
-
-
-
- PROPOSED
- For Web sites of all sorts
-
-
-
- PROPOSED
- For Adult entertainment sites
-
-
-
-
- DELETED
-
- for NATO sites and operations. Replaced by .int
-
-
-
-
-
- PSEUDO_DOMAIN
-
- identifying a hostname not connected directly to the Internet,
- but a bitnet network
-
-
-
-
- PSEUDO_DOMAIN
-
- identifying a hostname not connected directly to the Internet,
- but a csnet network
-
-
-
-
- PSEUDO_DOMAIN
-
- identifying a hostname not connected directly to the Internet,
- but a bitnet network
-
-
-
-
- PSEUDO_DOMAIN
-
- .local is a pseudo top-level domain used by Apple, Inc.'s
- Bonjour protocol.
-
-
-
-
- PSEUDO_DOMAIN
- alias of .local
-
-
-
- PSEUDO_DOMAIN
-
- designates an anonymous or pseudonymous address reachable via
- the Tor network.
-
-
-
-
-
-
- Ascension Island
-
-
-
- Andorra
-
-
-
- United Arab Emirates
-
-
-
- Afghanistan
-
-
-
- Antigua and Barbuda
-
-
-
- Anguilla
-
-
-
- Albania
-
-
-
- Armenia
-
-
-
- Netherlands Antilles
-
-
-
- Angola
-
-
-
- Antarctica
-
-
-
- Argentina
-
-
-
- American Samoa
-
-
-
- Austria
-
-
-
- Australia
-
-
-
- Aruba
-
-
-
- Aland Islands
-
-
-
- Azerbaijan
-
-
-
- Bosnia and Herzegovina
-
-
-
- Barbados
-
-
-
- Bangladesh
-
-
-
- Belgium
-
-
-
- Burkina Faso
-
-
-
- Bulgaria
-
-
-
- Bahrain
-
-
-
- Burundi
-
-
-
- Benin
-
-
-
- Bermuda
-
-
-
- Brunei
-
-
-
- Bolivia
-
-
-
- Brazil
-
-
-
- Bahamas
-
-
-
- Bhutan
-
-
-
- Burma
- NOT_IN_USE
-
- not in use since re-naming of country to Myanmar, see .mm
-
-
-
-
- Bouvet Island
- NOT_IN_USE
- not in use; no registrations
-
-
-
- Botswana
-
-
-
- Belarus
-
-
-
- Belize
-
-
-
- Canada
-
-
-
- Cocos Keeling Islands
-
-
-
- Democratic Republic of the Congo
- formerly .zr - Zaire
-
-
-
- Central African Republic
-
-
-
- Republic of the Congo
-
-
-
- Switzerland
-
-
-
- Côte d'Ivoire
- Ivory Coast
-
-
-
- Cook Islands
-
-
-
- Chile
-
-
-
- Cameroon
-
-
-
- People s Republic of China
-
-
-
- Colombia
-
-
-
- Costa Rica
-
-
-
- Serbia and Montenegro
- DELETED
-
- formerly .yu - Yugoslavia; description: on June 3, 2006,
- Montenegro declared independence, thus dissolving the state
- union) (.cs code not assigned; no DNS) (.cs code previously
- used for Czechoslovakia
-
-
-
-
- Cuba
-
-
-
- Cape Verde
-
-
-
- Christmas Island
-
-
-
- Cyprus
-
-
-
- Czech Republic
-
-
-
- German Democratic Republic(East Germany)
- DELETED
- deleted in 1990
-
-
-
- Germany
-
-
-
- Djibouti
-
-
-
- Denmark
-
-
-
- Dominica
-
-
-
- Dominican Republic
-
-
-
- Algeria
-
-
-
- Ecuador
-
-
-
- Estonia
-
-
-
- Egypt
-
-
-
- Western Sahara
- NOT_IN_USE
- not assigned; no DNS
-
-
-
- Eritrea
-
-
-
- Spain
-
-
-
- Ethiopia
-
-
-
- European Union
-
- code "exceptionally reserved" by ISO 3166-1
-
-
-
-
- Finland
-
-
-
- Fiji
-
-
-
- Falkland Islands
-
-
-
- Federated States of Micronesia
-
-
-
- Faroe Islands
-
-
-
- France
-
-
-
- Gabon
-
-
-
- United Kingdom
-
- Reserved domain by IANA; deprecated – see .uk
-
-
-
-
- Grenada
-
-
-
- Georgia
-
-
-
- French Guiana
-
-
-
- Guernsey
-
-
-
- Ghana
-
-
-
- Gibraltar
-
-
-
- Greenland
-
-
-
- Gambia
-
-
-
- Guinea
-
-
-
- Guadeloupe
-
-
-
- Equatorial Guinea
-
-
-
- Greece
-
-
-
- South Georgia and the South Sandwich Islands
-
-
-
- Guatemala
-
-
-
- Guam
-
-
-
- Guinea Bissau
-
-
-
- Guyana
-
-
-
- Hong Kong
-
-
-
- Heard Island and McDonald Islands
-
-
-
- Honduras
-
-
-
- Croatia
-
-
-
- Haiti
-
-
-
- Hungary
-
-
-
- Indonesia
-
-
-
- Ireland
-
-
-
- Israel
-
-
-
- Isle of Man
-
-
-
- India
-
-
-
- British Indian Ocean Territory
-
-
-
- Iraq
-
-
-
- Iran
-
-
-
- Iceland
-
-
-
- Italy
-
-
-
- Jersey
-
-
-
- Jamaica
-
-
-
- Jordan
-
-
-
- Japan
-
-
-
- Kenya
-
-
-
- Kyrgyzstan
-
-
-
- Cambodia
-
-
-
- Kiribati
-
-
-
- Comoros
-
-
-
- Saint Kitts and Nevis
-
-
-
- North Korea
- NOT_IN_USE
-
- not assigned; no DNS
-
-
-
- South Korea
-
-
-
- Kuwait
-
-
-
- Cayman Islands
-
-
-
- Kazakhstan
-
-
-
- Laos
-
-
-
- Lebanon
-
-
-
- Saint Lucia
-
-
-
- Liechtenstein
-
-
-
- Sri Lanka
-
-
-
- Liberia
-
-
-
- Lesotho
-
-
-
- Lithuania
-
-
-
- Luxembourg
-
-
-
- Latvia
-
-
-
- Libya
-
-
-
- Morocco
-
-
-
- Monaco
-
-
-
- Moldova
-
-
-
- Montenegro
-
-
-
- Madagascar
-
-
-
- Marshall Islands
-
-
-
- Republic of Macedonia
-
-
-
- Mali
-
-
-
- Myanmar
- formerly .bu - Burma
-
-
-
- Mongolia
-
-
-
- Macau
-
-
-
- Northern Mariana Islands
-
-
-
- Martinique
-
-
-
- Mauritania
-
-
-
- Montserrat
-
-
-
- Malta
-
-
-
- Mauritius
-
-
-
- Maldives
-
-
-
- Malawi
-
-
-
- Mexico
-
-
-
- Malaysia
-
-
-
- Mozambique
-
-
-
- Namibia
-
-
-
- New Caledonia
-
-
-
- Niger
-
-
-
- Norfolk Island
-
-
-
- Nigeria
-
-
-
- Nicaragua
-
-
-
- Netherlands
-
-
-
- Norway
-
-
-
- Nepal
-
-
-
- Nauru
-
-
-
- Niue
-
-
-
- New Zealand
-
-
-
- Oman
-
-
-
- Panama
-
-
-
- Peru
-
-
-
- French Polynesia
-
-
-
- Papua New Guinea
-
-
-
- Philippines
-
-
-
- Pakistan
-
-
-
- Poland
-
-
-
- Saint Pierre and Miquelon
-
-
-
- Pitcairn Islands
-
-
-
- Puerto Rico
-
-
-
- Palestinian territories
-
-
-
- Portugal
-
-
-
- Palau
-
-
-
- Paraguay
-
-
-
- Qatar
-
-
-
- Réunion
-
-
-
- Romania
-
-
-
- Serbia
-
-
-
- Russia
-
-
-
- Rwanda
-
-
-
- Saudi Arabia
-
-
-
- Solomon Islands
-
-
-
- Seychelles
-
-
-
- Sudan
-
-
-
- Sweden
-
-
-
- Singapore
-
-
-
- Saint Helena
-
-
-
- Slovenia
-
-
-
- Svalbard and Jan Mayen Islands
- NOT_IN_USE
- not in use; no registrations
-
-
-
- Slovakia
-
-
-
- Sierra Leone
-
-
-
- San Marino
-
-
-
- Senegal
-
-
-
- Somalia
-
-
-
- Suriname
-
-
-
- São Tomé and Príncipe
-
-
-
- Soviet Union
- DELETED
-
- deprecated; being phased out; code "transitionally reserved"
- by ISO 3166-1
-
-
-
-
- El Salvador
-
-
-
- Syria
-
-
-
- Swaziland
-
-
-
- Turks and Caicos Islands
-
-
-
- Chad
-
-
-
- French Southern Territories
-
-
-
- Togo
-
-
-
- Thailand
-
-
-
- Tajikistan
-
-
-
- Tokelau
-
-
-
- East Timor
- formerly .tp
-
-
-
- Turkmenistan
-
-
-
- Tunisia
-
-
-
- Tonga
-
-
-
- East Timor
- DELETED
-
- deprecated - use .tl; code "transitionally reserved" by ISO
- 3166-1
-
-
-
-
- Turkey
-
-
-
- Trinidad and Tobago
-
-
-
- Tuvalu
-
-
-
- Republic of China
- Taiwan
-
-
-
- Tanzania
-
-
-
- Ukraine
-
-
-
- Uganda
-
-
-
- United Kingdom
-
- code "exceptionally reserved" by ISO 3166-1 (see also .gb)
-
-
-
-
- United States Minor Outlying Islands
- DELETED
- see http://en.wikipedia.org/wiki/.um
-
-
-
- United States
-
-
-
- Uruguay
-
-
-
- Uzbekistan
-
-
-
- Vatican City
-
-
-
- Saint Vincent and the Grenadines
-
-
-
- Venezuela
-
-
-
- British Virgin Islands
-
-
-
- United States Virgin Islands
-
-
-
- Vietnam
-
-
-
- Vanuatu
-
-
-
- Wallis and Futuna
-
-
-
- Samoa
- formerly Western Samoa
-
-
-
- Yemen
-
-
-
- Mayotte
-
-
-
- Yugoslavia
-
- subsequently renamed Serbia and Montenegro (code officially
- replaced by .cs (see above) but still used; code
- "transitionally reserved" by ISO 3166-1)
-
-
-
-
- South Africa
-
-
-
- Zambia
-
-
-
- Zaire
- DELETED
- replaced by .cd
-
-
-
- Zimbabwe
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- DELETED
-
-
- DELETED
-
-
- DELETED
-
-
- DELETED
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/conf/domain-suffixes.xsd b/conf/domain-suffixes.xsd
deleted file mode 100644
index 67c9bd0e7e..0000000000
--- a/conf/domain-suffixes.xsd
+++ /dev/null
@@ -1,130 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/default.properties b/default.properties
index 47041f465c..f8047baa67 100644
--- a/default.properties
+++ b/default.properties
@@ -130,7 +130,6 @@ plugins.scoring=\
org.apache.nutch.scoring.opic*:\
org.apache.nutch.scoring.orphan*:\
org.apache.nutch.scoring.similarity*:\
- org.apache.nutch.scoring.tld*:\
org.apache.nutch.scoring.urlmeta*\
org.apache.nutch.scoring.metadata*
diff --git a/src/bin/nutch b/src/bin/nutch
index b3e0a256bf..0b55388c68 100755
--- a/src/bin/nutch
+++ b/src/bin/nutch
@@ -269,7 +269,7 @@ elif [ "$COMMAND" = "filterchecker" ] ; then
elif [ "$COMMAND" = "normalizerchecker" ] ; then
CLASS=org.apache.nutch.net.URLNormalizerChecker
elif [ "$COMMAND" = "domainstats" ] ; then
- CLASS=org.apache.nutch.util.domain.DomainStatistics
+ CLASS=org.apache.nutch.util.DomainStatistics
elif [ "$COMMAND" = "protocolstats" ] ; then
CLASS=org.apache.nutch.util.ProtocolStatusStatistics
elif [ "$COMMAND" = "crawlcomplete" ] ; then
diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/DomainStatistics.java
similarity index 97%
rename from src/java/org/apache/nutch/util/domain/DomainStatistics.java
rename to src/java/org/apache/nutch/util/DomainStatistics.java
index 1843c424d1..0a74f02310 100644
--- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
+++ b/src/java/org/apache/nutch/util/DomainStatistics.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.nutch.util.domain;
+package org.apache.nutch.util;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
@@ -38,9 +38,6 @@
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -192,7 +189,7 @@ public void map(Text urlText, CrawlDatum datum, Context context)
out = URLUtil.getDomainName(url);
break;
case MODE_SUFFIX:
- out = URLUtil.getDomainSuffix(url).getDomain();
+ out = URLUtil.getDomainSuffix(url);
break;
case MODE_TLD:
out = URLUtil.getTopLevelDomainName(url);
diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java
index 9ec0d35a8b..0cfce1c650 100644
--- a/src/java/org/apache/nutch/util/URLUtil.java
+++ b/src/java/org/apache/nutch/util/URLUtil.java
@@ -23,8 +23,7 @@
import java.util.Locale;
import java.util.regex.Pattern;
-import org.apache.nutch.util.domain.DomainSuffix;
-import org.apache.nutch.util.domain.DomainSuffixes;
+import crawlercommons.domains.EffectiveTldFinder;
/** Utility class for URL analysis */
public class URLUtil {
@@ -85,72 +84,92 @@ static URL fixPureQueryTargets(URL base, String target)
.compile("(\\d{1,3}\\.){3}(\\d{1,3})");
/**
- * Get the domain name of the url. The domain name of a url is the
- * substring of the url's hostname, w/o subdomain names. As an example
+ * Get the domain name of the URL. The domain name of a URL is the substring
+ * of the URL's hostname, w/o subdomain names. As an example
*
- * getDomainName(new URL(http://lucene.apache.org/))
+ * getDomainName(new URL("https://lucene.apache.org/"))
*
* will return
* apache.org
- * @param url A input {@link URL} to extract the domain from
+ *
+ * Special cases:
+ *
+ * - if the hostname does not end in a valid domain suffix, the entire
+ * hostname is returned.
+ * - for URLs without a hostname, an empty string is returned.
+ *
+ *
+ * Valid domain suffixes are taken from the
+ * https://publicsuffix.org/list/public_suffix_list.dat and are compared
+ * using
+ * crawler-commons' EffectiveTldFinder. Only ICANN domain suffixes are
+ * used. Because EffectiveTldFinder loads the public suffix list as file
+ * "effective_tld_names.dat" from the Java classpath, it's possible to use
+ * a specific version of the public suffix list (e.g., the most recent one) by
+ * placing the public suffix list with the name "effective_tld_names.dat" in
+ * Nutch's <code>conf/</code> folder.
+ *
+ * See {@link EffectiveTldFinder#getAssignedDomain(String, boolean, boolean)}
+ *
+ * @param url
+ * input {@link URL} to extract the domain from
* @return the domain name string
- * */
+ */
public static String getDomainName(URL url) {
- DomainSuffixes tlds = DomainSuffixes.getInstance();
String host = url.getHost();
- // it seems that java returns hostnames ending with .
- if (host.endsWith("."))
+
+ // strip trailing dot in host names
+ if (host.length() > 0 && host.charAt(host.length() - 1) == '.') {
host = host.substring(0, host.length() - 1);
- if (IP_PATTERN.matcher(host).matches())
- return host;
-
- int index = 0;
- String candidate = host;
- for (; index >= 0;) {
- index = candidate.indexOf('.');
- String subCandidate = candidate.substring(index + 1);
- if (tlds.isDomainSuffix(subCandidate)) {
- return candidate;
- }
- candidate = subCandidate;
}
- return candidate;
+ return EffectiveTldFinder.getAssignedDomain(host, false, true);
}
/**
- * Returns the domain name of the url. The domain name of a url is the
- * substring of the url's hostname, w/o subdomain names. As an example
+ * Returns the domain name of the URL. The domain name of a URL is the
+ * substring of the URL's hostname, w/o subdomain names. As an example
*
- * getDomainName(conf, new http://lucene.apache.org/)
+ * getDomainName("https://lucene.apache.org/")
*
* will return
- * apache.org
- * @param url A input url string to extract the domain from
+ * apache.org
+ *
+ * See {@link #getDomainName(URL)} for more information.
+ *
+ * @param url
+ * input URL string to extract the domain from
* @return the domain name
- * @throws MalformedURLException if the input url is malformed
+ * @throws MalformedURLException
+ * if the input URL is malformed
*/
public static String getDomainName(String url) throws MalformedURLException {
return getDomainName(new URL(url));
}
/**
- * Returns the top level domain name of the url. The top level domain name of
- * a url is the substring of the url's hostname, w/o subdomain names. As an
+ * Returns the top-level domain name of the URL. The top-level domain name is
+ * the last dot-separated label of the URL's public domain suffix. As an
* example
*
- * getTopLevelDomainName(conf, new http://lucene.apache.org/)
+ * getTopLevelDomainName(new URL("https://www.example.co.uk/"))
*
* will return
- * org
+ * uk
*
- * @param url A input {@link URL} to extract the top
- * level domain name from
- * @return the top level domain name
- * @throws MalformedURLException if the input url is malformed
+ * In case of internationalized top-level domains, the ASCII representation is
+ * returned.
+ *
+ * @param url
+ * input {@link URL} to extract the top-level domain name from
+ * @return the top-level domain name or null if there is none
*/
- public static String getTopLevelDomainName(URL url)
- throws MalformedURLException {
- String suffix = getDomainSuffix(url).toString();
+ public static String getTopLevelDomainName(URL url) {
+ String suffix = getDomainSuffix(url);
+ if (suffix == null) {
+ return null;
+ }
int idx = suffix.lastIndexOf(".");
if (idx != -1) {
return suffix.substring(idx + 1);
@@ -160,19 +179,23 @@ public static String getTopLevelDomainName(URL url)
}
/**
- * Returns the top level domain name of the url. The top level domain name of
- * a url is the substring of the url's hostname, w/o subdomain names. As an
+ * Returns the top-level domain name of the URL. The top-level domain name is
+ * the last dot-separated label of the URL's public domain suffix. As an
* example
*
- * getTopLevelDomainName(conf, new http://lucene.apache.org/)
+ * getTopLevelDomainName("https://www.example.co.uk/")
*
* will return
- * org
+ * uk
*
- * @param url A input url string to extract the top
- * level domain name from
- * @return the top level domain name
- * @throws MalformedURLException if the input url is malformed
+ * In case of internationalized top-level domains, the ASCII representation is
+ * returned.
+ *
+ * @param url
+ * input URL string to extract the top-level domain name from
+ * @return the top-level domain name or null if there is none
+ * @throws MalformedURLException
+ * if the input URL is malformed
*/
public static String getTopLevelDomainName(String url)
throws MalformedURLException {
@@ -180,12 +203,16 @@ public static String getTopLevelDomainName(String url)
}
/**
- * Returns whether the given urls have the same domain name. As an example,
- * isSameDomain(new URL("http://lucene.apache.org")
- * , new URL("http://people.apache.org/"))
- *
will return true.
- * @param url1 first {@link URL} to compare domain name
- * @param url2 second {@link URL} to compare domain name
+ * Returns whether the given URLs have the same domain name. As an example,
+ *
+ * isSameDomain(new URL("http://lucene.apache.org")
+ * , new URL("http://people.apache.org/"))
+ *
will return true.
+ *
+ * @param url1
+ * first {@link URL} to compare domain name
+ * @param url2
+ * second {@link URL} to compare domain name
*
* @return true if the domain names are equal
*/
@@ -194,14 +221,19 @@ public static boolean isSameDomainName(URL url1, URL url2) {
}
/**
- * Returns whether the given urls have the same domain name. As an example,
- * isSameDomain("http://lucene.apache.org"
- * ,"http://people.apache.org/")
- *
will return true.
- * @param url1 first url string to compare domain name
- * @param url2 second url string to compare domain name
+ * Returns whether the given URLs have the same domain name. As an example,
+ *
+ * isSameDomain("http://lucene.apache.org"
+ * ,"http://people.apache.org/")
+ *
will return true.
+ *
+ * @param url1
+ * first URL string to compare domain name
+ * @param url2
+ * second URL string to compare domain name
* @return true if the domain names are equal
- * @throws MalformedURLException if either of the input urls are malformed
+ * @throws MalformedURLException
+ * if any of the input URLs are malformed
*/
public static boolean isSameDomainName(String url1, String url2)
throws MalformedURLException {
@@ -209,39 +241,48 @@ public static boolean isSameDomainName(String url1, String url2)
}
/**
- * Returns the {@link DomainSuffix} corresponding to the last public part of
- * the hostname
- * @param url a {@link URL} to extract the domain suffix from
- * @return a {@link org.apache.nutch.util.domain.DomainSuffix}
+ * Returns the public suffix corresponding to the last public part of the
+ * hostname.
+ *
+ * In case of internationalized domain suffixes, the ASCII representation is
+ * returned. For the URL <code>https://www.taiuru.māori.nz/</code> the suffix
+ * <code>xn--mori-qsa.nz</code> is returned.
+ *
+ * @param url
+ * a {@link URL} to extract the domain suffix from
+ * @return the domain suffix or null if there is none
*/
- public static DomainSuffix getDomainSuffix(URL url) {
- DomainSuffixes tlds = DomainSuffixes.getInstance();
+ public static String getDomainSuffix(URL url) {
String host = url.getHost();
- if (IP_PATTERN.matcher(host).matches())
- return null;
- int index = 0;
- String candidate = host;
- for (; index >= 0;) {
- index = candidate.indexOf('.');
- String subCandidate = candidate.substring(index + 1);
- DomainSuffix d = tlds.get(subCandidate);
- if (d != null) {
- return d;
- }
- candidate = subCandidate;
+ // strip trailing dot in host names
+ if (host.length() > 0 && host.charAt(host.length() - 1) == '.') {
+ host = host.substring(0, host.length() - 1);
+ }
+
+ EffectiveTldFinder.EffectiveTLD suffix = EffectiveTldFinder.getEffectiveTLD(host, true);
+ if (suffix != null) {
+ return suffix.getDomain();
}
+
return null;
}
/**
- * Returns the {@link DomainSuffix} corresponding to the last public part of
- * the hostname
- * @param url a {@link URL} to extract the domain suffix from
- * @return a {@link org.apache.nutch.util.domain.DomainSuffix}
- * @throws MalformedURLException if the input url string is malformed
+ * Returns the domain suffix corresponding to the last public part of the
+ * hostname.
+ *
+ * In case of internationalized domain suffixes, the ASCII representation is
+ * returned. For the URL <code>https://www.taiuru.māori.nz/</code> the suffix
+ * <code>xn--mori-qsa.nz</code> is returned.
+ *
+ * @param url
+ * a {@link URL} to extract the domain suffix from
+ * @return the domain suffix or null if there is none
+ * @throws MalformedURLException
+ * if the input URL string is malformed
*/
- public static DomainSuffix getDomainSuffix(String url)
+ public static String getDomainSuffix(String url)
throws MalformedURLException {
return getDomainSuffix(new URL(url));
}
@@ -422,8 +463,7 @@ public static String chooseRepr(String src, String dst, boolean temp) {
}
/**
- * Returns the lowercased hostname for the URL or null if the URL is not well-formed
- * formed.
+ * Returns the lowercased hostname for the URL or null if the URL is not well-formed.
*
* @param url
* The URL to check.
diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffix.java b/src/java/org/apache/nutch/util/domain/DomainSuffix.java
deleted file mode 100644
index 05162aaf7a..0000000000
--- a/src/java/org/apache/nutch/util/domain/DomainSuffix.java
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.util.domain;
-
-/**
- * This class represents the last part of the host name, which is operated by
- * authoritives, not individuals. This information is needed to find the domain
- * name of a host. The domain name of a host is defined to be the last part
- * before the domain suffix, w/o subdomain names. As an example the domain name
- * of
- * http://lucene.apache.org/
- *
- * is apache.org
- * This class holds three fields, domain field represents the
- * suffix (such as "co.uk") boost is a float for boosting score
- * of url's with this suffix status field represents domain's
- * status
- *
- * @author Enis Soztutar <enis.soz.nutch@gmail.com>
- * @see TopLevelDomain for info please see conf/domain-suffixes.xml
- */
-public class DomainSuffix {
-
- /**
- * Enumeration of the status of the tld. Please see domain-suffixes.xml.
- */
- public enum Status {
- INFRASTRUCTURE, SPONSORED, UNSPONSORED, STARTUP, PROPOSED, DELETED, PSEUDO_DOMAIN, DEPRECATED, IN_USE, NOT_IN_USE, REJECTED
- };
-
- private String domain;
- private Status status;
- private float boost;
-
- public static final float DEFAULT_BOOST = 1.0f;
- public static final Status DEFAULT_STATUS = Status.IN_USE;
-
- public DomainSuffix(String domain, Status status, float boost) {
- this.domain = domain;
- this.status = status;
- this.boost = boost;
- }
-
- public DomainSuffix(String domain) {
- this(domain, DEFAULT_STATUS, DEFAULT_BOOST);
- }
-
- public String getDomain() {
- return domain;
- }
-
- public Status getStatus() {
- return status;
- }
-
- public float getBoost() {
- return boost;
- }
-
- @Override
- public String toString() {
- return domain;
- }
-}
diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffixes.java b/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
deleted file mode 100644
index 455f367126..0000000000
--- a/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.util.domain;
-
-import java.io.InputStream;
-import java.lang.invoke.MethodHandles;
-import java.util.HashMap;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.util.StringUtils;
-
-/**
- * Storage class for <code>DomainSuffix</code> objects Note: this class is
- * singleton
- *
- * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
- */
-public class DomainSuffixes {
- private static final Logger LOG = LoggerFactory
- .getLogger(MethodHandles.lookup().lookupClass());
-
- private HashMap<String, DomainSuffix> domains = new HashMap<>();
-
- private static DomainSuffixes instance;
-
- /** private ctor */
- private DomainSuffixes() {
- String file = "domain-suffixes.xml";
-
- try (InputStream input = this.getClass().getClassLoader()
- .getResourceAsStream(file)) {
- new DomainSuffixesReader().read(this, input);
- } catch (Exception ex) {
- LOG.warn(StringUtils.stringifyException(ex));
- }
- }
-
- /**
- * Singleton instance, lazy instantination
- *
- * @return returns the domain suffix instance
- */
- public static DomainSuffixes getInstance() {
- if (instance == null) {
- instance = new DomainSuffixes();
- }
- return instance;
- }
-
- void addDomainSuffix(DomainSuffix tld) {
- domains.put(tld.getDomain(), tld);
- }
-
- /**
- * Return whether the extension is a registered domain entry
- * @param extension a String extension
- * @return true if input is a registered domain entry, false otherwise
- */
- public boolean isDomainSuffix(String extension) {
- return domains.containsKey(extension);
- }
-
- /**
- * Return the {@link DomainSuffix} object for the extension, if extension is a
- * top level domain returned object will be an instance of
- * {@link TopLevelDomain}
- *
- * @param extension
- * of the domain
- * @return {@link DomainSuffix}
- */
- public DomainSuffix get(String extension) {
- return domains.get(extension);
- }
-
-}
diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java b/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
deleted file mode 100644
index 69e212dccf..0000000000
--- a/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.util.domain;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.lang.invoke.MethodHandles;
-
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.nutch.util.domain.DomainSuffix.Status;
-import org.apache.nutch.util.domain.TopLevelDomain.Type;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.NodeList;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-
-/**
- * For parsing xml files containing domain suffix definitions. Parsed xml files
- * should validate against domain-suffixes.xsd
- *
- * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
- */
-class DomainSuffixesReader {
-
- private static final Logger LOG = LoggerFactory
- .getLogger(MethodHandles.lookup().lookupClass());
-
- void read(DomainSuffixes tldEntries, InputStream input) throws IOException {
- try {
-
- DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
- factory.setIgnoringComments(true);
- DocumentBuilder builder = factory.newDocumentBuilder();
- Document document = builder.parse(new InputSource(input));
-
- Element root = document.getDocumentElement();
-
- if (root != null && root.getTagName().equals("domains")) {
-
- Element tlds = (Element) root.getElementsByTagName("tlds").item(0);
- Element suffixes = (Element) root.getElementsByTagName("suffixes")
- .item(0);
-
- // read tlds
- readITLDs(tldEntries, (Element) tlds.getElementsByTagName("itlds")
- .item(0));
- readGTLDs(tldEntries, (Element) tlds.getElementsByTagName("gtlds")
- .item(0));
- readCCTLDs(tldEntries, (Element) tlds.getElementsByTagName("cctlds")
- .item(0));
-
- readSuffixes(tldEntries, suffixes);
- } else {
- throw new IOException("xml file is not valid");
- }
- } catch (ParserConfigurationException ex) {
- LOG.warn(StringUtils.stringifyException(ex));
- throw new IOException(ex.getMessage());
- } catch (SAXException ex) {
- LOG.warn(StringUtils.stringifyException(ex));
- throw new IOException(ex.getMessage());
- }
- }
-
- void readITLDs(DomainSuffixes tldEntries, Element el) {
- NodeList children = el.getElementsByTagName("tld");
- for (int i = 0; i < children.getLength(); i++) {
- tldEntries.addDomainSuffix(readGTLD((Element) children.item(i),
- Type.INFRASTRUCTURE));
- }
- }
-
- void readGTLDs(DomainSuffixes tldEntries, Element el) {
- NodeList children = el.getElementsByTagName("tld");
- for (int i = 0; i < children.getLength(); i++) {
- tldEntries.addDomainSuffix(readGTLD((Element) children.item(i),
- Type.GENERIC));
- }
- }
-
- void readCCTLDs(DomainSuffixes tldEntries, Element el) throws IOException {
- NodeList children = el.getElementsByTagName("tld");
- for (int i = 0; i < children.getLength(); i++) {
- tldEntries.addDomainSuffix(readCCTLD((Element) children.item(i)));
- }
- }
-
- TopLevelDomain readGTLD(Element el, Type type) {
- String domain = el.getAttribute("domain");
- Status status = readStatus(el);
- float boost = readBoost(el);
- return new TopLevelDomain(domain, type, status, boost);
- }
-
- TopLevelDomain readCCTLD(Element el) throws IOException {
- String domain = el.getAttribute("domain");
- Status status = readStatus(el);
- float boost = readBoost(el);
- String countryName = readCountryName(el);
- return new TopLevelDomain(domain, status, boost, countryName);
- }
-
- /** read optional field status */
- Status readStatus(Element el) {
- NodeList list = el.getElementsByTagName("status");
- if (list == null || list.getLength() == 0)
- return DomainSuffix.DEFAULT_STATUS;
- return Status.valueOf(list.item(0).getFirstChild().getNodeValue());
- }
-
- /** read optional field boost */
- float readBoost(Element el) {
- NodeList list = el.getElementsByTagName("boost");
- if (list == null || list.getLength() == 0)
- return DomainSuffix.DEFAULT_BOOST;
- return Float.parseFloat(list.item(0).getFirstChild().getNodeValue());
- }
-
- /**
- * read field countryname
- */
- String readCountryName(Element el) throws IOException {
- NodeList list = el.getElementsByTagName("country");
- if (list == null || list.getLength() == 0)
- throw new IOException("Country name should be given");
- return list.item(0).getNodeValue();
- }
-
- void readSuffixes(DomainSuffixes tldEntries, Element el) {
- NodeList children = el.getElementsByTagName("suffix");
- for (int i = 0; i < children.getLength(); i++) {
- tldEntries.addDomainSuffix(readSuffix((Element) children.item(i)));
- }
- }
-
- DomainSuffix readSuffix(Element el) {
- String domain = el.getAttribute("domain");
- Status status = readStatus(el);
- float boost = readBoost(el);
- return new DomainSuffix(domain, status, boost);
- }
-
-}
diff --git a/src/java/org/apache/nutch/util/domain/TopLevelDomain.java b/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
deleted file mode 100644
index 2e9cddb5b3..0000000000
--- a/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.util.domain;
-
-/**
- * (From wikipedia) A top-level domain (TLD) is the last part of an Internet
- * domain name; that is, the letters which follow the final dot of any domain
- * name. For example, in the domain name <code>www.website.com</code>, the
- * top-level domain is <code>com</code>.
- *
- * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
- *
- * @see <a href="http://www.iana.org/"> iana.org</a>
- *
- * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain">
- * Top-level_domain</a>
- */
-public class TopLevelDomain extends DomainSuffix {
-
- public enum Type {
- INFRASTRUCTURE, GENERIC, COUNTRY
- };
-
- private Type type;
- private String countryName = null;
-
- public TopLevelDomain(String domain, Type type, Status status, float boost) {
- super(domain, status, boost);
- this.type = type;
- }
-
- public TopLevelDomain(String domain, Status status, float boost,
- String countryName) {
- super(domain, status, boost);
- this.type = Type.COUNTRY;
- this.countryName = countryName;
- }
-
- public Type getType() {
- return type;
- }
-
- /**
- * Returns the country name if TLD is Country Code TLD
- *
- * @return country name or null
- */
- public String getCountryName() {
- return countryName;
- }
-
-}
diff --git a/src/java/org/apache/nutch/util/domain/package-info.java b/src/java/org/apache/nutch/util/domain/package-info.java
deleted file mode 100644
index 6a799a9f1d..0000000000
--- a/src/java/org/apache/nutch/util/domain/package-info.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Classes for domain name analysis. for information refer to
- * following urls :
- *
- */
-package org.apache.nutch.util.domain;
diff --git a/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java b/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
index 296124d56f..0637e9136f 100644
--- a/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
+++ b/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
@@ -30,10 +30,15 @@
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.util.URLUtil;
-import org.apache.nutch.util.domain.DomainSuffix;
/**
- * Adds the top-level domain extensions to the index
+ * Adds the public suffix (aka. effective top-level domain) to the index using
+ * the field name "tld".
+ *
+ * <p>
+ * For the URL <code>https://www.example.co.uk/</code> the public suffix is
+ * <code>co.uk</code>. See also {@link URLUtil#getDomainSuffix(URL)}.
+ * </p>
*/
public class TLDIndexingFilter implements IndexingFilter {
private static final Logger LOG = LoggerFactory
@@ -47,9 +52,9 @@ public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText,
try {
URL url = new URL(urlText.toString());
- DomainSuffix d = URLUtil.getDomainSuffix(url);
+ String domain = URLUtil.getDomainSuffix(url);
- doc.add("tld", d.getDomain());
+ doc.add("tld", domain);
} catch (Exception ex) {
LOG.warn(ex.toString());
diff --git a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java b/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
deleted file mode 100644
index 5f3080912c..0000000000
--- a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.scoring.tld;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.NutchField;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.scoring.AbstractScoringFilter;
-import org.apache.nutch.scoring.ScoringFilterException;
-import org.apache.nutch.util.domain.DomainSuffix;
-import org.apache.nutch.util.domain.DomainSuffixes;
-
-/**
- * Scoring filter to boost top-level domains (TLDs).
- */
-public class TLDScoringFilter extends AbstractScoringFilter {
-
- private DomainSuffixes tldEntries;
-
- public TLDScoringFilter() {
- tldEntries = DomainSuffixes.getInstance();
- }
-
- @Override
- public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
- CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
- throws ScoringFilterException {
-
- NutchField tlds = doc.getField("tld");
- float boost = 1.0f;
-
- if (tlds != null) {
- for (Object tld : tlds.getValues()) {
- DomainSuffix entry = tldEntries.get(tld.toString());
- if (entry != null)
- boost *= entry.getBoost();
- }
- }
- return initScore * boost;
- }
-
-}
diff --git a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package-info.java b/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package-info.java
deleted file mode 100644
index 6ab837301c..0000000000
--- a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package-info.java
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/** Top Level Domain Scoring plugin. */
-package org.apache.nutch.scoring.tld;
diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
index c68750c0c4..9b0e9776de 100644
--- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
+++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
@@ -33,7 +33,6 @@
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.util.URLUtil;
-import org.apache.nutch.util.domain.DomainSuffix;
/**
*
@@ -163,13 +162,9 @@ public String filter(String url) {
try {
// match for suffix, domain, and host in that order. more general will
// override more specific
- String domain = URLUtil.getDomainName(url).toLowerCase().trim();
+ String domain = URLUtil.getDomainName(url);
String host = URLUtil.getHost(url);
- String suffix = null;
- DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
- if (domainSuffix != null) {
- suffix = domainSuffix.getDomain();
- }
+ String suffix = URLUtil.getDomainSuffix(url);
if (domainSet.contains(suffix) || domainSet.contains(domain)
|| domainSet.contains(host)) {
diff --git a/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java b/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java
index 7b38bfca00..1e86426c76 100644
--- a/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java
+++ b/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java
@@ -33,7 +33,6 @@
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.util.URLUtil;
-import org.apache.nutch.util.domain.DomainSuffix;
/**
*
@@ -161,13 +160,9 @@ public String filter(String url) {
try {
// match for suffix, domain, and host in that order. more general will
// override more specific
- String domain = URLUtil.getDomainName(url).toLowerCase().trim();
+ String domain = URLUtil.getDomainName(url);
String host = URLUtil.getHost(url);
- String suffix = null;
- DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
- if (domainSuffix != null) {
- suffix = domainSuffix.getDomain();
- }
+ String suffix = URLUtil.getDomainSuffix(url);
if (domainSet.contains(suffix) || domainSet.contains(domain)
|| domainSet.contains(host)) {
diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java
index eaaf7d0c77..f8a0a88766 100644
--- a/src/test/org/apache/nutch/util/TestURLUtil.java
+++ b/src/test/org/apache/nutch/util/TestURLUtil.java
@@ -32,6 +32,10 @@ public void testGetDomainName() throws Exception {
url = new URL("http://lucene.apache.org/nutch");
Assert.assertEquals("apache.org", URLUtil.getDomainName(url));
+ // hostname with trailing dot
+ url = new URL("https://lucene.apache.org./nutch");
+ Assert.assertEquals("apache.org", URLUtil.getDomainName(url));
+
url = new URL("http://en.wikipedia.org/wiki/Java_coffee");
Assert.assertEquals("wikipedia.org", URLUtil.getDomainName(url));
@@ -47,15 +51,19 @@ public void testGetDomainName() throws Exception {
url = new URL("http://www.example.co.uk.com");
Assert.assertEquals("uk.com", URLUtil.getDomainName(url));
- // "nn" is not a tld
+ // "nn" is not a public suffix
url = new URL("http://example.com.nn");
- Assert.assertEquals("nn", URLUtil.getDomainName(url));
+ Assert.assertEquals("example.com.nn", URLUtil.getDomainName(url));
url = new URL("http://");
Assert.assertEquals("", URLUtil.getDomainName(url));
+ /*
+ * "xyz" is an ICANN suffix since 2014, see
+ * https://www.iana.org/domains/root/db/xyz.html
+ */
url = new URL("http://www.edu.tr.xyz");
- Assert.assertEquals("xyz", URLUtil.getDomainName(url));
+ Assert.assertEquals("tr.xyz", URLUtil.getDomainName(url));
url = new URL("http://www.example.c.se");
Assert.assertEquals("example.c.se", URLUtil.getDomainName(url));
@@ -71,6 +79,11 @@ public void testGetDomainName() throws Exception {
// test non-ascii
url = new URL("http://www.example.商業.tw");
Assert.assertEquals("example.商業.tw", URLUtil.getDomainName(url));
+
+ // test URL without host/authority
+ url = new URL("file:/path/index.html");
+ Assert.assertNotNull(URLUtil.getDomainName(url));
+ Assert.assertEquals("", URLUtil.getDomainName(url));
}
@Test
@@ -78,50 +91,86 @@ public void testGetDomainSuffix() throws Exception {
URL url = null;
url = new URL("http://lucene.apache.org/nutch");
- Assert.assertEquals("org", URLUtil.getDomainSuffix(url).getDomain());
+ Assert.assertEquals("org", URLUtil.getDomainSuffix(url));
+
+ // hostname with trailing dot
+ url = new URL("https://lucene.apache.org./nutch");
+ Assert.assertEquals("org", URLUtil.getDomainSuffix(url));
url = new URL("http://140.211.11.130/foundation/contributing.html");
Assert.assertNull(URLUtil.getDomainSuffix(url));
url = new URL("http://www.example.co.uk:8080/index.html");
- Assert.assertEquals("co.uk", URLUtil.getDomainSuffix(url).getDomain());
+ Assert.assertEquals("co.uk", URLUtil.getDomainSuffix(url));
url = new URL("http://com");
- Assert.assertEquals("com", URLUtil.getDomainSuffix(url).getDomain());
+ Assert.assertEquals("com", URLUtil.getDomainSuffix(url));
url = new URL("http://www.example.co.uk.com");
- Assert.assertEquals("com", URLUtil.getDomainSuffix(url).getDomain());
+ Assert.assertEquals("com", URLUtil.getDomainSuffix(url));
- // "nn" is not a tld
+ // "nn" is not a public suffix
url = new URL("http://example.com.nn");
Assert.assertNull(URLUtil.getDomainSuffix(url));
url = new URL("http://");
Assert.assertNull(URLUtil.getDomainSuffix(url));
+ /*
+ * "xyz" is an ICANN suffix since 2014, see
+ * https://www.iana.org/domains/root/db/xyz.html
+ */
url = new URL("http://www.edu.tr.xyz");
- Assert.assertNull(URLUtil.getDomainSuffix(url));
+ Assert.assertEquals("xyz", URLUtil.getDomainSuffix(url));
url = new URL("http://subdomain.example.edu.tr");
- Assert.assertEquals("edu.tr", URLUtil.getDomainSuffix(url).getDomain());
+ Assert.assertEquals("edu.tr", URLUtil.getDomainSuffix(url));
url = new URL("http://subdomain.example.presse.fr");
- Assert.assertEquals("presse.fr", URLUtil.getDomainSuffix(url).getDomain());
+ Assert.assertEquals("fr", URLUtil.getDomainSuffix(url));
url = new URL("http://subdomain.example.presse.tr");
- Assert.assertEquals("tr", URLUtil.getDomainSuffix(url).getDomain());
+ Assert.assertEquals("tr", URLUtil.getDomainSuffix(url));
// plc.co.im is listed as a domain suffix
url = new URL("http://www.example.plc.co.im");
- Assert.assertEquals("plc.co.im", URLUtil.getDomainSuffix(url).getDomain());
+ Assert.assertEquals("plc.co.im", URLUtil.getDomainSuffix(url));
// 2000.hu is listed as a domain suffix
url = new URL("http://www.example.2000.hu");
- Assert.assertEquals("2000.hu", URLUtil.getDomainSuffix(url).getDomain());
+ Assert.assertEquals("2000.hu", URLUtil.getDomainSuffix(url));
// test non-ascii
url = new URL("http://www.example.商業.tw");
- Assert.assertEquals("商業.tw", URLUtil.getDomainSuffix(url).getDomain());
+ Assert.assertEquals("xn--czrw28b.tw", URLUtil.getDomainSuffix(url));
+ }
+
+ @Test
+ public void testGetTopLevelDomain() throws Exception {
+ URL url = null;
+
+ url = new URL("http://lucene.apache.org/nutch");
+ Assert.assertEquals("org", URLUtil.getTopLevelDomainName(url));
+
+ // hostname with trailing dot
+ url = new URL("https://lucene.apache.org./nutch");
+ Assert.assertEquals("org", URLUtil.getTopLevelDomainName(url));
+
+ url = new URL("http://140.211.11.130/foundation/contributing.html");
+ Assert.assertNull(URLUtil.getTopLevelDomainName(url));
+
+ url = new URL("http://www.example.co.uk:8080/index.html");
+ Assert.assertEquals("uk", URLUtil.getTopLevelDomainName(url));
+
+ // "nn" is not a public suffix
+ url = new URL("http://example.com.nn");
+ Assert.assertNull(URLUtil.getTopLevelDomainName(url));
+
+ url = new URL("http://");
+ Assert.assertNull(URLUtil.getTopLevelDomainName(url));
+
+ url = new URL("http://nic.삼성/");
+ Assert.assertEquals("xn--cg4bki", URLUtil.getTopLevelDomainName(url));
}
@Test
@@ -270,7 +319,7 @@ public void testToASCII() throws Exception {
@Test
public void testFileProtocol() throws Exception {
- // keep one single slash NUTCH-XXX
+ // keep one single slash NUTCH-1483
Assert.assertEquals("file:/path/file.html",
URLUtil.toASCII("file:/path/file.html"));
Assert.assertEquals("file:/path/file.html",